001 package edu.nrao.sss.html; 002 003 import java.io.BufferedReader; 004 import java.io.IOException; 005 import java.io.Reader; 006 import java.io.Writer; 007 import java.net.URL; 008 import java.util.ArrayList; 009 import java.util.List; 010 011 import javax.swing.text.html.HTML; 012 import javax.swing.text.html.parser.ParserDelegator; 013 014 import edu.nrao.sss.util.StringUtil; 015 016 /** 017 * An HTML table. 018 * <p> 019 * <b>Version Info:</b> 020 * <table style="margin-left:2em"> 021 * <tr><td>$Revision: 502 $</td></tr> 022 * <tr><td>$Date: 2007-04-04 11:39:33 -0600 (Wed, 04 Apr 2007) $</td></tr> 023 * <tr><td>$Author: dharland $</td></tr> 024 * </table></p> 025 * 026 * @author David M. Harland 027 * @since 2007-03-15 028 */ 029 public class HtmlTable 030 extends HtmlElement 031 { 032 URL parentPage; 033 List<HtmlTableRow> rows; //Publicly mutable, not publicly settable 034 List<String> parsingErrors; 035 036 /** Creates a new empty table. */ 037 public HtmlTable() 038 { 039 super(HTML.Tag.TABLE); 040 041 rows = new ArrayList<HtmlTableRow>(); 042 parsingErrors = new ArrayList<String>(); 043 } 044 045 /** Returns <i>false</i>. */ 046 @Override 047 public boolean isSimple() { return false; } 048 049 /** 050 * Returns the page to which this table belongs, if any. 051 * 052 * @return the page to which this table belongs. If this table belongs to 053 * no page, the value returned is <i>null</i>. 054 */ 055 public URL getParentPage() 056 { 057 return parentPage; 058 } 059 060 //============================================================================ 061 // ROWS 062 //============================================================================ 063 064 //TODO addRow(index, newRow) and other "listy" methods? 065 066 /** 067 * Adds {@code newRow} to this table. 068 * <p> 069 * If the new row is <i>null</i> or already in this table, 070 * this method does nothing. Otherwise the new row is 071 * removed from its previous table, if any, and added to this 072 * one. The new cell's parent table is set to this one.</p> 073 * 074 * @param newRow a new row for this table. 075 */ 076 public void addRow(HtmlTableRow newRow) 077 { 078 //Nothing to do if new row is null or already in this table 079 if ((newRow != null) && (newRow.parentTable != this)) 080 { 081 if (newRow.parentTable != null) 082 newRow.parentTable.removeRow(newRow); 083 084 rows.add(newRow); 085 086 newRow.parentTable = this; 087 } 088 } 089 090 /** 091 * Removes {@code unwantedRow} from this table. 092 * <p> 093 * If the unwanted row is <i>null</i> or not part of this 094 * table, this method does nothing. Otherwise the unwanted row 095 * is removed from this table, and its parent table is set to <i>null</i>.</p> 096 * 097 * @param unwantedRow the row to be removed from this table. 098 * 099 * @return <i>true</i> if {@code unwantedRow} was removed from this table. 100 */ 101 public boolean removeRow(HtmlTableRow unwantedRow) 102 { 103 boolean removed; 104 105 if ((unwantedRow != null) && rows.contains(unwantedRow)) 106 { 107 removed = rows.remove(unwantedRow); 108 109 unwantedRow.parentTable = null; 110 } 111 else //row was not in this table 112 { 113 removed = false; 114 } 115 116 return removed; 117 } 118 119 /** 120 * Removes all empty rows (those with no cells) from this table. 121 * @return the number of rows removed. 122 */ 123 public int removeEmptyRows() 124 { 125 int oldRowCount = rows.size(); 126 127 for (int r=oldRowCount-1; r >= 0; r--) 128 if (rows.get(r).cells.size() == 0) 129 rows.remove(r); 130 131 return oldRowCount - rows.size(); 132 } 133 134 /** 135 * Returns a copy of this table's list of rows. 136 * <p> 137 * While the list is a copy, the rows in the returned list are 138 * the actual rows held in this table.</p> 139 * 140 * @return a copy of this tables's list of rows. 141 */ 142 public List<HtmlTableRow> getRows() 143 { 144 return new ArrayList<HtmlTableRow>(rows); 145 } 146 147 /** 148 * Returns a new list that contains the rows of this table whose type is 149 * equal to {@code desiredType}. 150 * 151 * @param desiredType the type of row desired. 152 * 153 * @return the rows from this table that are of the desired type. 154 */ 155 public List<HtmlTableRow> getRows(HtmlTableRow.Type desiredType) 156 { 157 ArrayList<HtmlTableRow> selection = new ArrayList<HtmlTableRow>(); 158 159 for (HtmlTableRow row : rows) 160 if (row.getType().equals(desiredType)) 161 selection.add(row); 162 163 return selection; 164 } 165 166 /** 167 * Returns the rows of this table segregated into header rows, 168 * data rows, and footer rows. 169 * <p> 170 * The returned list will always have exactly three elements. 171 * The first list holds header rows, the second holds data rows, 172 * and the last holds footer rows. Each element will always 173 * hold a list -- it will never hold <i>null</i>. 174 * If a table has no rows of a given type, that list will be empty.</p> 175 * 176 * @return the rows of this table segregated into header rows, 177 * data rows, and footer rows. 178 */ 179 public List<List<HtmlTableRow>> getRowsInSections() 180 { 181 ArrayList<HtmlTableRow> headers = new ArrayList<HtmlTableRow>(); 182 ArrayList<HtmlTableRow> data = new ArrayList<HtmlTableRow>(); 183 ArrayList<HtmlTableRow> footers = new ArrayList<HtmlTableRow>(); 184 185 for (HtmlTableRow row : rows) 186 { 187 switch (row.type) 188 { 189 case HEADER: headers.add(row); break; 190 case DATA: data.add(row); break; 191 case FOOTER: footers.add(row); break; 192 193 default: 194 throw new RuntimeException( 195 "PROGRAMMER ERROR: Unknown HtmlTableRow.Type found [" + 196 row.type + "]."); 197 } 198 } 199 200 List<List<HtmlTableRow>> masterList = new ArrayList<List<HtmlTableRow>>(); 201 202 masterList.add(headers); 203 masterList.add(data); 204 masterList.add(footers); 205 206 return masterList; 207 } 208 209 /** 210 * Returns the number of rows in this table. 211 * @return the number of rows in this table. 212 */ 213 public int size() 214 { 215 return rows.size(); 216 } 217 218 //============================================================================ 219 // WRITING 220 //============================================================================ 221 222 @Override 223 void writeContentsAsHtml(Writer device, int padding, int depth) throws IOException 224 { 225 //Segregate the rows into headers, data, and footers 226 List<List<HtmlTableRow>> sections = getRowsInSections(); 227 228 boolean haveHeaders = (sections.get(0).size() > 0); 229 boolean haveData = (sections.get(1).size() > 0); 230 boolean haveFooters = (sections.get(2).size() > 0); 231 boolean writeSectionTags = (haveHeaders || haveFooters); 232 233 int sectionDepth = depth + 1; 234 int dataDepth = writeSectionTags ? depth + 2 : depth + 1; 235 236 //New line after <table name=...> tag 237 device.write(StringUtil.EOL); 238 239 if (haveHeaders) 240 { 241 startSection("thead", device, padding, sectionDepth); 242 243 for (HtmlTableRow row : sections.get(0)) 244 row.writeHtmlTo(device, padding, dataDepth, true); 245 246 endSection("thead", device, padding, sectionDepth); 247 } 248 249 if (haveFooters) 250 { 251 startSection("tfoot", device, padding, sectionDepth); 252 253 for (HtmlTableRow row : sections.get(2)) 254 row.writeHtmlTo(device, padding, dataDepth, true); 255 256 endSection("tfoot", device, padding, sectionDepth); 257 } 258 259 if (haveData) 260 { 261 if (writeSectionTags) 262 startSection("tbody", device, padding, sectionDepth); 263 264 for (HtmlTableRow row : sections.get(1)) 265 row.writeHtmlTo(device, padding, dataDepth, true); 266 267 if (writeSectionTags) 268 endSection("tbody", device, padding, sectionDepth); 269 } 270 271 //Padding before </table> 272 device.write(getPadding(padding, depth)); 273 } 274 275 void startSection(String sectionName, Writer device, int padding, int depth) 276 throws IOException 277 { 278 device.write(getPadding(padding, depth)); 279 device.write('<'); 280 device.write(sectionName); 281 device.write('>'); 282 device.write(StringUtil.EOL); 283 } 284 285 void endSection(String sectionName, Writer device, int padding, int depth) 286 throws IOException 287 { 288 device.write(getPadding(padding, depth)); 289 device.write("</"); 290 device.write(sectionName); 291 device.write('>'); 292 device.write(StringUtil.EOL); 293 } 294 295 /** 296 * Writes this table as text. Each row is separated by 297 * {@code rowDelimiter}. Within each row, each column is 298 * separated by {@code columnDelimiter}. 299 * <p> 300 * <b><u>Handling COLSPAN and ROWSPAN</u></b><br/> 301 * In order to make a rectangular grid of rows and columns, this method 302 * processes the <tt>colspan</tt> and <tt>rowspan</tt> attributes of 303 * each {@link HtmlTableCell}. It does this by repeating a table cell's 304 * information in the appropriate number of rows and columns of the grid. 305 * A hole anywhere in the grid is represented by an empty string, in which 306 * case you will see consecutive {@code columnDelimiter}s. If a table has 307 * been specified erroneously such that two cells overlap in the grid, this 308 * method favors the table cells of the row with the higher index, overwriting 309 * those from previous rows.</p> 310 * <u>Example:</u><br/> 311 * <p> 312 * Given this 4x4 table: <blockquote> 313 * <table border="1" style="border:solid; border-collapse:collapse;"> 314 * <tr> 315 * <td rowspan="2" width="25%">--A--</td> 316 * <td colspan="2" width="50%" align="center">--B--</td> 317 * </tr> 318 * <tr> 319 * <td rowspan="2" colspan="2" align="center">--C--</td> 320 * <td width="25%">--D--</td> 321 * </tr> 322 * <tr> 323 * <td>--E--</td> 324 * <td rowspan="2">--F--</td> 325 * </tr> 326 * <tr> 327 * <td colspan="2" align="center">--G--</td> 328 * </tr> 329 * </table></blockquote> 330 * </p><p> 331 * The result of calling this method with ";" for the column delimiter and 332 * new-line for the row delimiter is:<b><pre> 333 * --A--;--B--;--B--;; 334 * --A--;--C--;--C--;--D--; 335 * --E--;--C--;--C--;--F--; 336 * --G--;--G--;;--F--;</pre></b></p> 337 * 338 * @param device where the text is written. 339 * @param rowDelimiter used to separate the rows. 340 * @param columnDelimiter used to separate columns within a row. 341 * @throws IOException if anything goes wrong while writing to the device. 342 */ 343 public void writeTextTo(Writer device, 344 String rowDelimiter, String columnDelimiter) 345 throws IOException 346 { 347 HtmlCellGrid grid = HtmlCellGrid.buildFrom(this); 348 349 StringBuilder buff = new StringBuilder(); 350 351 int rows = grid.getRowCount(); 352 int cols = grid.getColumnCount(); 353 354 for (int r=0; r < rows; r++) 355 { 356 for (int c=0; c < cols; c++) 357 { 358 //java's HTML parser converts to character 0xA0; reverse this 359 buff.delete(0, buff.length()); 360 buff.append(grid.getCell(r, c).getUnparsedContents()); 361 replaceUnicodeNbspWith(buff, NBSP_TEXT); 362 363 device.write(buff.toString()); 364 device.write(columnDelimiter); 365 } 366 device.write(rowDelimiter); 367 } 368 } 369 370 //============================================================================ 371 // READING 372 //============================================================================ 373 374 /** 375 * Returns a list of new HTML tables created by parsing the given source. 376 * <p> 377 * <b>Caveat:</b> This method translates only "outer" tables -- those 378 * not contained in the cells of other tables. Tables inside table cells 379 * are captured as the unparsed content of 380 * {@link HtmlTableCell HtmlTableCells}. One way to overcome this is 381 * to subject the unparsed content of the returned tables' cells to this 382 * same method.</p> 383 * 384 * @param source a source of HTML to be parsed for tables. 385 * 386 * @return a list of HTML tables created by parsing the given source. 387 * 388 * @throws IOException if anything goes wrong while reading the source. 389 */ 390 public static List<HtmlTable> createFromHtml(Reader source) 391 throws IOException 392 { 393 ArrayList<HtmlTable> tables = new ArrayList<HtmlTable>(); 394 395 if (source != null) 396 { 397 ParserDelegator parser = new ParserDelegator(); 398 parser.parse(source, new HtmlTableTagHandler(tables), true); 399 } 400 401 return tables; 402 } 403 404 /** 405 * Adds rows to this table by parsing source. 406 * Only data from the table at position {@code tableIndex}, starting from the 407 * current cursor position in source, is used. Indexing starts at zero and 408 * starts with the first table open tag found. If no such tag is found, 409 * this table is unaltered by this method. 410 * <p> 411 * <b>Caveat:</b> This method indexes and translates only "outer" tables -- 412 * those not contained in the cells of other tables. Tables inside table 413 * cells of the parsed table are captured as the unparsed content of 414 * {@link HtmlTableCell HtmlTableCells}. One way to overcome this is 415 * to subject the unparsed content of the returned tables' cells to 416 * {@link #createFromHtml(Reader)}.</p> 417 * 418 * @param source a source of HTML to be parsed for tables. 419 * 420 * @throws IOException if anything goes wrong while reading the source. 421 */ 422 public void readHtmlFrom(Reader source, int tableIndex) 423 throws IOException 424 { 425 parsingErrors.clear(); 426 427 if (source == null) 428 { 429 parsingErrors.add("Tried to read from NULL source."); 430 } 431 else 432 { 433 ParserDelegator parser = new ParserDelegator(); 434 parser.parse(source, new HtmlTableTagHandler(tableIndex, this), true); 435 } 436 } 437 438 /** 439 * Returns a list of the parsing errors reported during the most recent 440 * call to {@link #readHtmlFrom(Reader, int)}. The returned list is 441 * <i>not</i> held by this table, so manipulating it will have no 442 * effect on this object. 443 * 444 * @return a list of the parsing errors reported during the most recent 445 * call to {@link #readHtmlFrom(Reader, int)}. 446 */ 447 public List<String> getParsingErrors() 448 { 449 return new ArrayList<String>(parsingErrors); 450 } 451 452 /** 453 * Appends new rows to this table by parsing the delimited text in 454 * {@code reader}. Each line presented by {@code reader} is assumed 455 * to be a new row. 456 * 457 * @param reader a text source of table rows. 458 * 459 * @param columnSeparator the delimiter separating columns in a row. 460 */ 461 public void readTextFrom(Reader reader, String columnSeparator) 462 throws IOException 463 { 464 BufferedReader source = new BufferedReader(reader); 465 String line; 466 467 while ((line = source.readLine()) != null) 468 { 469 HtmlTableRow newRow = new HtmlTableRow(); 470 471 String[] columnText = line.split(columnSeparator); 472 473 for (int c=0; c < columnText.length; c++) 474 newRow.addCell(new HtmlTableCell(columnText[c])); 475 476 addRow(newRow); 477 } 478 } 479 480 //============================================================================ 481 // 482 //============================================================================ 483 484 /** 485 * Returns a new table that is an expanded form of this table. 486 * <p> 487 * By "expanded" we mean that any cells that had <tt>colspan</tt> or 488 * <tt>rowspan</tt> values greater than one have been split into multiple 489 * cells with column and row spans of one. 490 * For example, a cell in this table with a column span of three and a 491 * row span of two is present not once, but six times, in the 492 * returned table. Furthermore, the six occurrences, while 493 * equal to each other, are six distinct instances.</p> 494 * <p> 495 * No cells are shared by this table and the returned table.</p> 496 * <b><u>Example</u></b><br/> 497 * Let the following be the HTML for this table: 498 * <table border="1" style="border:solid; border-collapse:collapse;"> 499 * <tr style="background-color:gray"> 500 * <td rowspan="2" style="color:yellow" width="25%">--A--</td> 501 * <td colspan="2" style="color:white; text-align:center" width="50%">--B--</td> 502 * </tr> 503 * <tr style="background-color:#AACCFF"> 504 * <td rowspan="2" colspan="2" style="font-weight:bold; color:red; text-align:center">--C--</td> 505 * <td>--D--</td> 506 * </tr> 507 * <tr style="background-color:#FFCCAA"> 508 * <td>--E--</td> 509 * <td rowspan="2">--F--</td> 510 * </tr> 511 * <tr style="background-color:#CCFFAA"> 512 * <td colspan="2"style="text-align:center">--G--</td> 513 * <td width="25%"></td> 514 * </tr> 515 * </table> 516 * <br/> 517 * The table created by this method looks like this: 518 * <table style="border:solid; border-collapse:collapse;" border="1"> 519 * <tr style="background-color:gray"> 520 * <td style="color:yellow" width="25%">--A--</td> 521 * <td style="color:white; text-align:center" width="25%">--B--</td> 522 * <td style="color:white; text-align:center" width="25%">--B--</td> 523 * <td></td> 524 * </tr> 525 * <tr style="background-color:#AACCFF"> 526 * <td style="color:yellow" width="25%">--A--</td> 527 * <td style="font-weight:bold; color:red; text-align:center">--C--</td> 528 * <td style="font-weight:bold; color:red; text-align:center">--C--</td> 529 * <td>--D--</td> 530 * </tr> 531 * <tr style="background-color:#FFCCAA"> 532 * <td>--E--</td> 533 * <td style="font-weight:bold; color:red; text-align:center">--C--</td> 534 * <td style="font-weight:bold; color:red; text-align:center">--C--</td> 535 * <td>--F--</td> 536 * </tr> 537 * <tr style="background-color:#CCFFAA"> 538 * <td style="text-align:center">--G--</td> 539 * <td style="text-align:center">--G--</td> 540 * <td width="25%"></td> 541 * <td>--F--</td> 542 * </tr> 543 * </table> 544 * Notice one side effect: the upper right corner of the original table 545 * has a missing cell; in the new table there is, instead, an empty cell. 546 * <br/><br/> 547 * If the second table is then recompressed via {@link #toCompressedTable()}, 548 * it looks like this: 549 * <table style="border:solid; border-collapse:collapse;" border="1"> 550 * <tr style="background-color:gray"> 551 * <td style="color:yellow" width="25%" rowspan="2">--A--</td> 552 * <td style="color:white; text-align:center" width="50%" colspan="2">--B--</td> 553 * <td></td> 554 * </tr> 555 * <tr style="background-color:#AACCFF"> 556 * <td style="font-weight:bold; color:red; text-align:center" colspan="2" rowspan="2">--C--</td> 557 * <td>--D--</td> 558 * </tr> 559 * <tr style="background-color:#FFCCAA"> 560 * <td>--E--</td> 561 * <td rowspan="2">--F--</td> 562 * </tr> 563 * <tr style="background-color:#CCFFAA"> 564 * <td style="text-align:center" colspan="2">--G--</td> 565 * <td width="25%"></td> 566 * </tr> 567 * </table> 568 * The original table is nearly reproduced. The only difference is the empty 569 * cell in place of the missing cell in the upper right corner. 570 * <p> 571 * <b>Caveat:</b> the algorithm used for column widths is only approximate. 572 * It is possible to achieve odd-looking widths for some tables.</p> 573 * 574 * @return an expanded version of this table. 575 */ 576 public HtmlTable toExpandedTable() 577 { 578 return HtmlCellGrid.buildFrom(this).makeExpandedTable(); 579 } 580 581 /** 582 * Returns a new table that is an expanded form of this table. 583 * <p> 584 * By "compressed" we mean that adjacent cells in this table that have 585 * equal contents attributes have been merged, to the extent possible, 586 * into one cell with a column span and/or a row span greater than one.</p> 587 * <p> 588 * No cells are shared by this table and the returned table.</p> 589 * <p> 590 * See the example in {@link #toExpandedTable()}; this method first calls 591 * that method, so any problems mentioned in that method's documentation 592 * may have an effect on the results of this method.</p> 593 * 594 * @return a compressed version of this table. 595 */ 596 public HtmlTable toCompressedTable() 597 { 598 return HtmlCellGrid.buildFrom(this).makeCompressedTable(); 599 } 600 601 //============================================================================ 602 // 603 //============================================================================ 604 /* 605 public static void main(String[] args) throws Exception 606 { 607 //URL htmlPage = new URL("file:///export/home/calmer/dharland/JUNK/tableTest01.html"); 608 URL htmlPage = new URL("file:///export/home/calmer/dharland/jdk1.5.0_07/docs/api/overview-summary.html"); 609 610 HtmlTable table = new HtmlTable(); 611 table.readHtmlFrom(new InputStreamReader(htmlPage.openStream()), 1); //0); 612 StringWriter writer = new StringWriter(); 613 table.writeHtmlTo(writer, 2, true); 614 System.out.println(writer.getBuffer()); 615 //List<HtmlTable> tables = HtmlTable.readTables(htmlPage); 616 System.out.println(); 617 System.out.println("REPORTED ERRORS = " + table.parsingErrors.size()); 618 for (String err : table.parsingErrors) 619 System.out.println(" "+err); 620 System.out.println(); 621 622 writer = new StringWriter(); 623 table.writeTextTo(writer, StringUtil.EOL, ";"); 624 System.out.println(writer.getBuffer()); 625 626 table = new HtmlTable(); 627 URL textPage = new URL("file:///export/home/calmer/dharland/JUNK/charTest.txt"); 628 table.readTextFrom(new InputStreamReader(textPage.openStream()), ";"); 629 writer = new StringWriter(); 630 table.writeHtmlTo(writer, 2, true); 631 System.out.println(writer.getBuffer()); 632 } 633 */ 634 }