package edu.nrao.sss.html;

import java.util.Enumeration;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

/**
 * A handler of table-related HTML tags that may be called by subclasses of
 * {@link javax.swing.text.html.HTMLEditorKit.Parser}.  Example of use:
 * <pre>
 *   ParserDelegator parser = new ParserDelegator();
 *
 *   parser.parse(new InputStreamReader(htmlPage.openStream()),
 *                new HtmlTableTagHandler(myTableList), true);
 * </pre>
 * <p>
 * A handler works in one of two modes, chosen by the constructor (or
 * {@code reset} overload) used: it either fills a single, caller-supplied
 * {@link HtmlTable} with the n<sup>th</sup> top-level table it sees, or it
 * appends a new {@link HtmlTable} to a caller-supplied list for every
 * top-level table it sees.  Tables nested inside a cell are not parsed into
 * objects; their markup is appended to the enclosing cell's unparsed
 * contents.</p>
 * <p>
 * <b>Version Info:</b>
 * <table style="margin-left:2em">
 *   <tr><td>$Revision: 545 $</td></tr>
 *   <tr><td>$Date: 2007-04-19 10:38:45 -0600 (Thu, 19 Apr 2007) $</td></tr>
 *   <tr><td>$Author: dharland $</td></tr>
 * </table></p>
 *
 * @author David M. Harland
 * @since 2007-03-16
 */
public class HtmlTableTagHandler
  extends HTMLEditorKit.ParserCallback
{
  //Prefixes of the parser error messages that get special treatment
  private static final String INVALID_CLASS_ATTRIBUTE_KEY = "invalid.tagattclass";
  private static final String IGNORED_TAG_KEY             = "tag.ignore";

  //In single-table mode tableList is null and table is the client's table.
  //In list mode tableList is non-null and table is the table most recently
  //added to that list (or null before the first table is seen).
  private HtmlTable       table;
  private List<HtmlTable> tableList;

  //Used when client wants the nth table
  private int desiredIndex;
  private int currentIndex;

  //Lets all methods know that we're parsing a desired table
  private boolean readThisTable;

  //Keeps track of tables within tables within...
  private int tableDepth;

  //Our current location in a table
  private HtmlTableRow  row;
  private HtmlTableCell cell;

  //Indicators for the unsupported table section tags
  private boolean thead;
  private boolean tbody;
  private boolean tfoot;

  //For enabling reuse of this object
  private boolean parsingIsDone;

  /**
   * Creates a handler that will place the n<sup>th</sup> occurrence of an
   * HTML table presented to it into {@code destination}, where n is
   * {@code tableOccurrenceToRead}.  Indexing begins with zero.
   * <p>
   * <b>Note:</b> a table inside of another table will <i>not</i> be counted
   * as a new table for purposes of counting occurrences.</p>
   *
   * @param tableOccurrenceToRead the occurrence of an HTML table to be saved.
   *                              Indexing begins at zero.
   *
   * @param destination the table to populate.  This value must not be
   *                    <i>null</i>.  If it is, the parsing process will throw
   *                    a {@link NullPointerException}.
   */
  public HtmlTableTagHandler(int tableOccurrenceToRead, HtmlTable destination)
  {
    init();
    useSingleTable(tableOccurrenceToRead, destination);
  }

  /**
   * Creates a handler that will append all tables parsed to the given list.
   *
   * @param destination the list to which parsed tables will be appended.
   *                    This list may or may not be empty, but must not be
   *                    <i>null</i>.  If it is, the parsing process will throw
   *                    a {@link NullPointerException}.
   */
  public HtmlTableTagHandler(List<HtmlTable> destination)
  {
    init();
    useTableList(destination);
  }

  /** Puts all parsing state back to its "nothing seen yet" condition. */
  private void init()
  {
    readThisTable = false;
    currentIndex  = -1;
    tableDepth    = 0;
    row           = null;
    cell          = null;
    thead         = false;
    tbody         = false;
    tfoot         = false;
    parsingIsDone = false;
  }

  /** Configures this handler to populate a single, client-supplied table. */
  private void useSingleTable(int tableOccurrenceToRead, HtmlTable destination)
  {
    desiredIndex = tableOccurrenceToRead;
    table        = destination;
    tableList    = null;
  }

  /** Configures this handler to append every top-level table to a list. */
  private void useTableList(List<HtmlTable> destination)
  {
    desiredIndex = -1;
    table        = null;
    tableList    = destination;
  }

  /**
   * Clears the parsing state of this handler so that it may be used for
   * another parse.  The destination (single table or list) is unchanged.
   */
  public void reset()
  {
    init();
  }

  /**
   * Clears the parsing state of this handler and configures it to place the
   * n<sup>th</sup> top-level table into {@code destination}.
   *
   * @param tableOccurrenceToRead the zero-based occurrence of the table to
   *                              be saved.
   * @param destination the table to populate; must not be <i>null</i>.
   */
  public void reset(int tableOccurrenceToRead, HtmlTable destination)
  {
    reset();
    useSingleTable(tableOccurrenceToRead, destination);
  }

  /**
   * Clears the parsing state of this handler and configures it to append
   * all top-level tables to {@code destination}.
   *
   * @param destination the list to which parsed tables will be appended;
   *                    must not be <i>null</i>.
   */
  public void reset(List<HtmlTable> destination)
  {
    reset();
    useTableList(destination);
  }

  /**
   * Resets this handler if {@link #flush()} has been called since the last
   * reset.  The parser does not announce the start of a parse, so every
   * callback method calls this first.
   */
  private void resetIfPreviousParseFinished()
  {
    if (parsingIsDone)
      reset();
  }

  //============================================================================
  // START TAGS
  //============================================================================

  /**
   * Handles start tags for table, table row, table data, and table header.
   * Any other start tag is ignored, unless it is inside a {@link HtmlTableCell
   * table cell}, in which case it is added to the cell's unparsed content
   * without being interpreted as an HTML element.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int pos)
  {
    resetIfPreviousParseFinished();

    if (tag.equals(HTML.Tag.TABLE))
      handleTableStartTag(tag, attrs, pos);

    else if (tag.equals(HTML.Tag.TR))
      handleRowStartTag(tag, attrs, pos);

    else if (tag.equals(HTML.Tag.TD) || tag.equals(HTML.Tag.TH))
      handleCellStartTag(tag, attrs, pos);

    else //some other tag
      handleOtherStartTag(tag, attrs, pos);
  }

  /** Handles start tag for table. */
  private void handleTableStartTag(HTML.Tag tag,
                                   MutableAttributeSet attrs, int pos)
  {
    tableDepth++;

    //Count all top level tables so that we know when we're at the right one
    if (tableDepth == 1)
    {
      currentIndex++;
      handleNewTable(attrs);
    }
    else //nested table: keep its markup as text of the enclosing cell
    {
      addStartTagToCell(tag, attrs, pos, false);
    }
  }

  /** Handles start tag for table row. */
  private void handleRowStartTag(HTML.Tag tag,
                                 MutableAttributeSet attrs, int pos)
  {
    if (tableDepth < 1)
    {
      throw new IllegalStateException(
        "PROGRAMMER ERROR: table should be open if we're handling a row.");
    }

    if (tableDepth > 1)
    {
      addStartTagToCell(tag, attrs, pos, false);
    }
    else if (readThisTable)
    {
      ensureRowIsOpen();
      setAttributes(row, attrs);
    }
  }

  /**
   * Creates a new row and adds it to the current table, unless a row is
   * already open.  Must be called only while reading a table.
   */
  private void ensureRowIsOpen()
  {
    if (row == null)
    {
      row = new HtmlTableRow();
      row.setType(getRowType());
      table.addRow(row);
    }
  }

  /** Responds to a request to create a new {@link HtmlTable}. */
  private void handleNewTable(MutableAttributeSet attrs)
  {
    if (tableList != null) //Client wants to read all tables
    {
      table = new HtmlTable();
      tableList.add(table);
    }
    else if (currentIndex != desiredIndex) //Not the single table client wants
    {
      return;
    }

    //The flag must be raised BEFORE the attributes are set; setAttributes
    //records malformed attributes only while a table is being read, and
    //the <table> element's own attributes should not be an exception.
    readThisTable = true;
    setAttributes(table, attrs);
  }

  /** Handles start tag for table cell. */
  private void handleCellStartTag(HTML.Tag tag,
                                  MutableAttributeSet attrs, int pos)
  {
    if (tableDepth < 1)
      throw new IllegalStateException(
        "PROGRAMMER ERROR: table & row should be open if we're handling a cell.");

    if (tableDepth > 1)
    {
      addStartTagToCell(tag, attrs, pos, false);
    }
    else if (readThisTable)
    {
      if (cell == null)
      {
        //A cell that arrives with no open row gets an implied row, rather
        //than triggering a NullPointerException on the missing row.
        ensureRowIsOpen();

        HtmlTableCell.Type cellType = tag.equals(HTML.Tag.TH)
                                    ? HtmlTableCell.Type.HEADER
                                    : HtmlTableCell.Type.DATA;
        cell = new HtmlTableCell(cellType);
        row.addCell(cell);
      }
      setAttributes(cell, attrs);
    }
  }

  /** Handles non-table start tags. */
  private void handleOtherStartTag(HTML.Tag tag,
                                   MutableAttributeSet attrs, int pos)
  {
    addStartTagToCell(tag, attrs, pos, false);
  }

  /**
   * Appends the textual form of a start (or simple) tag, including its
   * attributes, to the unparsed contents of the current cell.  Does nothing
   * if no cell is open.
   *
   * @param isSimpleTag if <i>true</i> the tag is closed with {@code />},
   *                    otherwise with {@code >}.
   */
  private void addStartTagToCell(HTML.Tag tag,
                                 MutableAttributeSet attrs, int pos,
                                 boolean isSimpleTag)
  {
    if (cell == null)
      return;

    StringBuilder buff = cell.getUnparsedContents();

    //Tag name
    buff.append('<').append(tag.toString());

    //Attributes
    Enumeration<?> names = attrs.getAttributeNames();

    while (names.hasMoreElements())
    {
      Object attrName = names.nextElement();
      buff.append(' ').append(attrName.toString()).append("=\"")
          .append(attrs.getAttribute(attrName).toString()).append("\"");
    }

    buff.append(isSimpleTag ? "/>" : ">");
  }

  //============================================================================
  // END TAGS
  //============================================================================

  /**
   * Handles end tags for table, table row, table data, and table header.
   * Any other end tag is ignored, unless it is inside a {@link HtmlTableCell
   * table cell}, in which case it is added to the cell's unparsed content
   * without being interpreted as an HTML element.
   */
  @Override
  public void handleEndTag(HTML.Tag tag, int pos)
  {
    resetIfPreviousParseFinished();

    if (tag.equals(HTML.Tag.TABLE))
      handleTableEndTag(tag, pos);

    else if (tag.equals(HTML.Tag.TR))
      handleRowEndTag(tag, pos);

    else if (tag.equals(HTML.Tag.TD) || tag.equals(HTML.Tag.TH))
      handleCellEndTag(tag, pos);

    else //some other tag
      handleOtherEndTag(tag, pos);
  }

  /** Handles end tag for table. */
  private void handleTableEndTag(HTML.Tag tag, int pos)
  {
    if (tableDepth <= 0)
    {
      throw new IllegalStateException(
        "PROGRAMMER ERROR: table depth will be < 0.");
    }

    if (tableDepth > 1)
    {
      handleOtherEndTag(tag, pos);
    }
    else //tableDepth == 1
    {
      readThisTable = false;

      //Nothing belonging to this table may survive it.  Without this, a row,
      //cell, or section that was never explicitly closed would swallow text
      //that follows the table and corrupt the next top-level table.
      row   = null;
      cell  = null;
      thead = false;
      tbody = false;
      tfoot = false;
    }

    tableDepth--;
  }

  /** Handles end tag for table row. */
  private void handleRowEndTag(HTML.Tag tag, int pos)
  {
    //We're now assuming proper closing tags for all non-simple tags,
    //so we do not force the cell closed.
    if (tableDepth > 1)
      handleOtherEndTag(tag, pos);
    else
      row = null;
  }

  /** Handles end tag for table cell. */
  private void handleCellEndTag(HTML.Tag tag, int pos)
  {
    if (tableDepth > 1)
      handleOtherEndTag(tag, pos);
    else
      cell = null;
  }

  /** Handles non-table end tags. */
  private void handleOtherEndTag(HTML.Tag tag, int pos)
  {
    if (cell != null)
      cell.getUnparsedContents().append("</").append(tag.toString())
                                .append(">");
  }

  //============================================================================
  // SIMPLE TAGS
  //============================================================================

  /**
   * If the simple tag is inside a table cell, it is added to that cell's
   * unparsed contents, otherwise it is ignored.
   */
  @Override
  public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs, int pos)
  {
    resetIfPreviousParseFinished();

    addStartTagToCell(tag, attrs, pos, true);
  }

  //============================================================================
  // TEXT & END OF PARSING
  //============================================================================

  /**
   * If the text is inside a table cell, it is added to that cell's
   * unparsed contents, otherwise it is ignored.
   */
  @Override
  public void handleText(char[] data, int pos)
  {
    resetIfPreviousParseFinished();

    if (cell != null)
      cell.getUnparsedContents().append(data);
  }

  /**
   * Notes that the current parse has finished, so that the next callback
   * received resets this handler before doing anything else.
   */
  @Override
  public void flush()
  {
    //The parser does not tell us when parsing begins.  We wish it did, so that
    //we could reset the state of this handler just-in-time.  Instead, we note
    //here when parsing finished and then look at this flag in all the handler
    //methods to determine when a reset is necessary.
    parsingIsDone = true;
  }

  //============================================================================
  // ERRORS
  //============================================================================

  /**
   * Records parser errors against the table currently being read, except for
   * two kinds of message that are expected and handled here instead:
   * complaints about a {@code class} attribute on table, tr, th, or td
   * elements (dropped), and "ignored tag" reports for thead, tbody, and tfoot
   * (used to track which table section the parser is in).
   * Errors reported while no table is being read are dropped.
   *
   * @param errMsg the message produced by the parser.
   * @param pos    the position reported by the parser; it is appended to the
   *               recorded message.
   */
  @Override
  public void handleError(String errMsg, int pos)
  {
    resetIfPreviousParseFinished();

    //Java's HTML parser is stuck on HTML 3.2.  It does have a mechanism
    //for dealing with unknown tags.  Unfortunately for us, though, they
    //have code in javax.swing.text.html.parser.Parser.ignoreElement that
    //short-circuits this mechanism for tags inside the <table> element.
    //This means the new <thead>, <tfoot>, and <tbody> tags are not treated
    //like HTML.UnknownTag, but are treated instead as parsing errors.
    //That's why we have the code below.

    //The matching below needs the message key immediately followed by the
    //element name.  '?' characters are removed, as they always have been
    //here; whitespace is removed as well so that a message whose key and
    //arguments are separated by blanks is recognized too.
    //NOTE(review): the blank-separated layout is believed to be what
    //current JDKs produce -- confirm against the JDK in use.
    String condensed = errMsg.replaceAll("[?\\s]", "");

    boolean putError;

    if (condensed.startsWith(INVALID_CLASS_ATTRIBUTE_KEY))
    {
      String element =
        condensed.substring(INVALID_CLASS_ATTRIBUTE_KEY.length());

      boolean isTableElement = element.equalsIgnoreCase("table") ||
                               element.equalsIgnoreCase("tr")    ||
                               element.equalsIgnoreCase("th")    ||
                               element.equalsIgnoreCase("td");

      putError = !isTableElement && readThisTable;
    }
    else if (condensed.startsWith(IGNORED_TAG_KEY))
    {
      String element = condensed.substring(IGNORED_TAG_KEY.length());

      putError = false;

      //This code assumes odd occurrences are open tags, even are close tags
      if      (element.equalsIgnoreCase("thead")) thead = !thead;
      else if (element.equalsIgnoreCase("tbody")) tbody = !tbody;
      else if (element.equalsIgnoreCase("tfoot")) tfoot = !tfoot;
      else
        putError = readThisTable;
    }
    else
    {
      putError = readThisTable;
    }

    if (putError)
      table.parsingErrors.add(errMsg + " at character " + pos +
                              " [source=java's DocumentParser].");
  }

  //============================================================================
  // HELPERS
  //============================================================================

  /**
   * Converts {@code attributes} into name/value pairs and adds them to
   * {@code element}.  An attribute rejected with an
   * {@link IllegalArgumentException} is skipped; if a table is being read,
   * the exception's message is added to that table's parsing errors.
   */
  private void setAttributes(HtmlElement element, MutableAttributeSet attributes)
  {
    Enumeration<?> names = attributes.getAttributeNames();

    while (names.hasMoreElements())
    {
      Object attrName = names.nextElement();

      //Ignore any poorly formed attributes
      try {
        element.addAttribute(
          new HtmlAttribute(attrName.toString(),
                            attributes.getAttribute(attrName).toString()));
      }
      catch (IllegalArgumentException ex) {
        if (readThisTable)
          table.parsingErrors.add(ex.getMessage());
      }
    }
  }

  /**
   * Determines the type of a new row from the table-section flags.
   * No flag, or only tbody, means DATA; only thead means HEADER; only tfoot
   * means FOOTER.  If more than one flag is set, an error is recorded on the
   * current table and DATA is returned.
   */
  private HtmlTableRow.Type getRowType()
  {
    //Each flag contributes a distinct bit, so the sum identifies exactly
    //which combination of flags is set.
    int type = 0;

    if (thead) type += 1;
    if (tbody) type += 2;
    if (tfoot) type += 4;

    switch (type)
    {
      case 0:  return HtmlTableRow.Type.DATA;
      case 1:  return HtmlTableRow.Type.HEADER;
      case 2:  return HtmlTableRow.Type.DATA;
      case 4:  return HtmlTableRow.Type.FOOTER;

      //This means multiple flags are true.  Note error, but call the row DATA.
      default:
        table.parsingErrors.add(
          "More than one of thead, tbody, and tfoot are true; type value = " +
          type);

        return HtmlTableRow.Type.DATA;
    }
  }
}