001    package edu.nrao.sss.html;
002    
003    import java.io.BufferedReader;
004    import java.io.IOException;
005    import java.io.Reader;
006    import java.io.Writer;
007    import java.net.URL;
008    import java.util.ArrayList;
009    import java.util.List;
010    
011    import javax.swing.text.html.HTML;
012    import javax.swing.text.html.parser.ParserDelegator;
013    
014    import edu.nrao.sss.util.StringUtil;
015    
016    /**
017     * An HTML table.
018     * <p>
019     * <b>Version Info:</b>
020     * <table style="margin-left:2em">
021     *   <tr><td>$Revision: 502 $</td></tr>
022     *   <tr><td>$Date: 2007-04-04 11:39:33 -0600 (Wed, 04 Apr 2007) $</td></tr>
023     *   <tr><td>$Author: dharland $</td></tr>
024     * </table></p>
025     * 
026     * @author David M. Harland
027     * @since 2007-03-15
028     */
029    public class HtmlTable
030      extends HtmlElement
031    {
032      URL                parentPage;
033      List<HtmlTableRow> rows;          //Publicly mutable, not publicly settable
034      List<String>       parsingErrors;
035      
036      /** Creates a new empty table. */
037      public HtmlTable()
038      {
039        super(HTML.Tag.TABLE);
040        
041        rows          = new ArrayList<HtmlTableRow>();
042        parsingErrors = new ArrayList<String>();
043      }
044      
045      /** Returns <i>false</i>. */
046      @Override
047      public boolean isSimple()  { return false; }
048    
049      /**
050       * Returns the page to which this table belongs, if any.
051       * 
052       * @return the page to which this table belongs.  If this table belongs to
053       *         no page, the value returned is <i>null</i>.
054       */
055      public URL getParentPage()
056      {
057        return parentPage;
058      }
059      
060      //============================================================================
061      // ROWS
062      //============================================================================
063    
064      //TODO addRow(index, newRow) and other "listy" methods?
065      
066      /**
067       * Adds {@code newRow} to this table.
068       * <p>
069       * If the new row is <i>null</i> or already in this table,
070       * this method does nothing.  Otherwise the new row is
071       * removed from its previous table, if any, and added to this
072       * one.  The new cell's parent table is set to this one.</p>
073       * 
074       * @param newRow a new row for this table.
075       */
076      public void addRow(HtmlTableRow newRow)
077      {
078        //Nothing to do if new row is null or already in this table
079        if ((newRow != null) && (newRow.parentTable != this))
080        {
081          if (newRow.parentTable != null)
082            newRow.parentTable.removeRow(newRow);
083          
084          rows.add(newRow);
085          
086          newRow.parentTable = this;
087        }
088      }
089      
090      /**
091       * Removes {@code unwantedRow} from this table.
092       * <p>
093       * If the unwanted row is <i>null</i> or not part of this
094       * table, this method does nothing.  Otherwise the unwanted row
095       * is removed from this table, and its parent table is set to <i>null</i>.</p>
096       * 
097       * @param unwantedRow the row to be removed from this table.
098       * 
099       * @return <i>true</i> if {@code unwantedRow} was removed from this table.
100       */
101      public boolean removeRow(HtmlTableRow unwantedRow)
102      {
103        boolean removed;
104        
105        if ((unwantedRow != null) && rows.contains(unwantedRow))
106        {
107          removed = rows.remove(unwantedRow);
108          
109          unwantedRow.parentTable = null;
110        }
111        else //row was not in this table
112        {
113          removed = false;
114        }
115        
116        return removed;
117      }
118      
119      /**
120       * Removes all empty rows (those with no cells) from this table.
121       * @return the number of rows removed.
122       */
123      public int removeEmptyRows()
124      {
125        int oldRowCount = rows.size();
126        
127        for (int r=oldRowCount-1; r >= 0; r--)
128          if (rows.get(r).cells.size() == 0)
129            rows.remove(r);
130        
131        return oldRowCount - rows.size();
132      }
133      
134      /**
135       * Returns a copy of this table's list of rows.
136       * <p>
137       * While the list is a copy, the rows in the returned list are
138       * the actual rows held in this table.</p>
139       * 
140       * @return a copy of this tables's list of rows.
141       */
142      public List<HtmlTableRow> getRows()
143      {
144        return new ArrayList<HtmlTableRow>(rows);
145      }
146      
147      /**
148       * Returns a new list that contains the rows of this table whose type is
149       * equal to {@code desiredType}.
150       * 
151       * @param desiredType the type of row desired.
152       * 
153       * @return the rows from this table that are of the desired type.
154       */
155      public List<HtmlTableRow> getRows(HtmlTableRow.Type desiredType)
156      {
157        ArrayList<HtmlTableRow> selection = new ArrayList<HtmlTableRow>();
158        
159        for (HtmlTableRow row : rows)
160          if (row.getType().equals(desiredType))
161            selection.add(row);
162        
163        return selection;
164      }
165      
166      /**
167       * Returns the rows of this table segregated into header rows,
168       * data rows, and footer rows.
169       * <p>
170       * The returned list will always have exactly three elements.
171       * The first list holds header rows, the second holds data rows,
172       * and the last holds footer rows.  Each element will always
173       * hold a list -- it will never hold <i>null</i>.
174       * If a table has no rows of a given type, that list will be empty.</p> 
175       * 
176       * @return the rows of this table segregated into header rows,
177       *         data rows, and footer rows.
178       */
179      public List<List<HtmlTableRow>> getRowsInSections()
180      {
181        ArrayList<HtmlTableRow> headers = new ArrayList<HtmlTableRow>();
182        ArrayList<HtmlTableRow> data    = new ArrayList<HtmlTableRow>();
183        ArrayList<HtmlTableRow> footers = new ArrayList<HtmlTableRow>();
184        
185        for (HtmlTableRow row : rows)
186        {
187          switch (row.type)
188          {
189            case HEADER:  headers.add(row);  break;
190            case DATA:    data.add(row);     break;
191            case FOOTER:  footers.add(row);  break;
192              
193            default:
194              throw new RuntimeException(
195                "PROGRAMMER ERROR: Unknown HtmlTableRow.Type found [" +
196                row.type + "].");
197          }
198        }
199        
200        List<List<HtmlTableRow>> masterList = new ArrayList<List<HtmlTableRow>>();
201        
202        masterList.add(headers);
203        masterList.add(data);
204        masterList.add(footers);
205        
206        return masterList;
207      }
208      
209      /**
210       * Returns the number of rows in this table.
211       * @return the number of rows in this table.
212       */
213      public int size()
214      {
215        return rows.size();
216      }
217      
218      //============================================================================
219      // WRITING
220      //============================================================================
221      
222      @Override
223      void writeContentsAsHtml(Writer device, int padding, int depth) throws IOException
224      {
225        //Segregate the rows into headers, data, and footers
226        List<List<HtmlTableRow>> sections = getRowsInSections();
227        
228        boolean haveHeaders      = (sections.get(0).size() > 0);
229        boolean haveData         = (sections.get(1).size() > 0);
230        boolean haveFooters      = (sections.get(2).size() > 0);
231        boolean writeSectionTags = (haveHeaders || haveFooters);
232    
233        int sectionDepth = depth + 1;
234        int dataDepth    = writeSectionTags ? depth + 2 : depth + 1;
235        
236        //New line after <table name=...> tag
237        device.write(StringUtil.EOL);
238        
239        if (haveHeaders)
240        {
241          startSection("thead", device, padding, sectionDepth);
242          
243          for (HtmlTableRow row : sections.get(0))
244            row.writeHtmlTo(device, padding, dataDepth, true);
245    
246          endSection("thead", device, padding, sectionDepth);
247        }
248        
249        if (haveFooters)
250        {
251          startSection("tfoot", device, padding, sectionDepth);
252          
253          for (HtmlTableRow row : sections.get(2))
254            row.writeHtmlTo(device, padding, dataDepth, true);
255    
256          endSection("tfoot", device, padding, sectionDepth);
257        }
258    
259        if (haveData)
260        {
261          if (writeSectionTags)
262            startSection("tbody", device, padding, sectionDepth);
263          
264          for (HtmlTableRow row : sections.get(1))
265            row.writeHtmlTo(device, padding, dataDepth, true);
266    
267          if (writeSectionTags)
268            endSection("tbody", device, padding, sectionDepth);
269        }
270    
271        //Padding before </table>
272        device.write(getPadding(padding, depth));
273      }
274      
275      void startSection(String sectionName, Writer device, int padding, int depth)
276        throws IOException
277      {
278        device.write(getPadding(padding, depth));
279        device.write('<');
280        device.write(sectionName);
281        device.write('>');
282        device.write(StringUtil.EOL);
283      }
284      
285      void endSection(String sectionName, Writer device, int padding, int depth)
286        throws IOException
287      {
288        device.write(getPadding(padding, depth));
289        device.write("</");
290        device.write(sectionName);
291        device.write('>');
292        device.write(StringUtil.EOL);
293      }
294    
295      /**
296       * Writes this table as text.  Each row is separated by 
297       * {@code rowDelimiter}.  Within each row, each column is
298       * separated by {@code columnDelimiter}.
299       * <p>
300       * <b><u>Handling COLSPAN and ROWSPAN</u></b><br/>
301       * In order to make a rectangular grid of rows and columns, this method
302       * processes the <tt>colspan</tt> and <tt>rowspan</tt> attributes of
303       * each {@link HtmlTableCell}.  It does this by repeating a table cell's
304       * information in the appropriate number of rows and columns of the grid.
305       * A hole anywhere in the grid is represented by an empty string, in which
306       * case you will see consecutive {@code columnDelimiter}s. If a table has
307       * been specified erroneously such that two cells overlap in the grid, this
308       * method favors the table cells of the row with the higher index, overwriting
309       * those from previous rows.</p>
310       * <u>Example:</u><br/>
311       * <p>
312       * Given this 4x4 table: <blockquote>
313       * <table border="1" style="border:solid; border-collapse:collapse;">
314       *   <tr>
315       *     <td rowspan="2" width="25%">--A--</td>
316       *     <td colspan="2" width="50%" align="center">--B--</td>
317       *   </tr>
318       *   <tr>
319       *     <td rowspan="2" colspan="2" align="center">--C--</td>
320       *     <td width="25%">--D--</td>
321       *   </tr>
322       *   <tr>
323       *     <td>--E--</td>
324       *     <td rowspan="2">--F--</td>
325       *   </tr>
326       *   <tr>
327       *     <td colspan="2" align="center">--G--</td>
328       *   </tr>
329       * </table></blockquote>
330       * </p><p>
331       * The result of calling this method with ";" for the column delimiter and
332       * new-line for the row delimiter is:<b><pre>
333       *    --A--;--B--;--B--;;
334       *    --A--;--C--;--C--;--D--;
335       *    --E--;--C--;--C--;--F--;
336       *    --G--;--G--;;--F--;</pre></b></p>
337       * 
338       * @param device where the text is written.
339       * @param rowDelimiter used to separate the rows.
340       * @param columnDelimiter used to separate columns within a row.
341       * @throws IOException if anything goes wrong while writing to the device.
342       */
343      public void writeTextTo(Writer device,
344                              String rowDelimiter, String columnDelimiter)
345        throws IOException
346      {
347        HtmlCellGrid grid = HtmlCellGrid.buildFrom(this);
348        
349        StringBuilder buff = new StringBuilder();
350        
351        int rows = grid.getRowCount();
352        int cols = grid.getColumnCount();
353        
354        for (int r=0; r < rows; r++)
355        {
356          for (int c=0; c < cols; c++)
357          {
358            //java's HTML parser converts &nbsp; to character 0xA0; reverse this
359            buff.delete(0, buff.length());
360            buff.append(grid.getCell(r, c).getUnparsedContents());
361            replaceUnicodeNbspWith(buff, NBSP_TEXT);
362            
363            device.write(buff.toString());
364            device.write(columnDelimiter);
365          }
366          device.write(rowDelimiter);
367        }
368      }
369      
370      //============================================================================
371      // READING
372      //============================================================================
373    
374      /**
375       * Returns a list of new HTML tables created by parsing the given source.
376       * <p>
377       * <b>Caveat:</b> This method translates only "outer" tables -- those
378       * not contained in the cells of other tables.  Tables inside table cells
379       * are captured as the unparsed content of
380       * {@link HtmlTableCell HtmlTableCells}.  One way to overcome this is
381       * to subject the unparsed content of the returned tables' cells to this
382       * same method.</p>
383       * 
384       * @param source a source of HTML to be parsed for tables.
385       * 
386       * @return a list of HTML tables created by parsing the given source.
387       * 
388       * @throws IOException if anything goes wrong while reading the source.
389       */
390      public static List<HtmlTable> createFromHtml(Reader source)
391        throws IOException
392      {
393        ArrayList<HtmlTable> tables = new ArrayList<HtmlTable>();
394        
395        if (source != null)
396        {    
397          ParserDelegator parser = new ParserDelegator();
398          parser.parse(source, new HtmlTableTagHandler(tables), true);
399        }
400    
401        return tables;
402      }
403      
404      /**
405       * Adds rows to this table by parsing source.
406       * Only data from the table at position {@code tableIndex}, starting from the
407       * current cursor position in source, is used.  Indexing starts at zero and
408       * starts with the first table open tag found.  If no such tag is found,
409       * this table is unaltered by this method.
410       * <p>
411       * <b>Caveat:</b> This method indexes and translates only "outer" tables --
412       * those not contained in the cells of other tables.  Tables inside table
413       * cells of the parsed table are captured as the unparsed content of
414       * {@link HtmlTableCell HtmlTableCells}.  One way to overcome this is
415       * to subject the unparsed content of the returned tables' cells to 
416       * {@link #createFromHtml(Reader)}.</p>
417       * 
418       * @param source a source of HTML to be parsed for tables.
419       * 
420       * @throws IOException if anything goes wrong while reading the source.
421       */
422      public void readHtmlFrom(Reader source, int tableIndex)
423        throws IOException
424      {
425        parsingErrors.clear();
426    
427        if (source == null)
428        {
429          parsingErrors.add("Tried to read from NULL source.");
430        }
431        else
432        {
433          ParserDelegator parser = new ParserDelegator();
434          parser.parse(source, new HtmlTableTagHandler(tableIndex, this), true);
435        }
436      }
437    
438      /**
439       * Returns a list of the parsing errors reported during the most recent
440       * call to {@link #readHtmlFrom(Reader, int)}.  The returned list is
441       * <i>not</i> held by this table, so manipulating it will have no
442       * effect on this object.
443       * 
444       * @return a list of the parsing errors reported during the most recent
445       *         call to {@link #readHtmlFrom(Reader, int)}.
446       */
447      public List<String> getParsingErrors()
448      {
449        return new ArrayList<String>(parsingErrors);
450      }
451      
452      /**
453       * Appends new rows to this table by parsing the delimited text in
454       * {@code reader}.  Each line presented by {@code reader} is assumed
455       * to be a new row.
456       * 
457       * @param reader a text source of table rows.
458       * 
459       * @param columnSeparator the delimiter separating columns in a row.
460       */
461      public void readTextFrom(Reader reader, String columnSeparator)
462        throws IOException
463      {
464        BufferedReader source = new BufferedReader(reader);
465        String         line;
466    
467        while ((line = source.readLine()) != null)
468        {
469          HtmlTableRow newRow = new HtmlTableRow();
470          
471          String[] columnText = line.split(columnSeparator);
472          
473          for (int c=0; c < columnText.length; c++)
474            newRow.addCell(new HtmlTableCell(columnText[c]));
475          
476          addRow(newRow);
477        }
478      }
479      
480      //============================================================================
481      // 
482      //============================================================================
483    
484      /**
485       * Returns a new table that is an expanded form of this table.
486       * <p>
487       * By "expanded" we mean that any cells that had <tt>colspan</tt> or
488       * <tt>rowspan</tt> values greater than one have been split into multiple
489       * cells with column and row spans of one.
490       * For example, a cell in this table with a column span of three and a
491       * row span of two is present not once, but six times, in the
492       * returned table.  Furthermore, the six occurrences, while
493       * equal to each other, are six distinct instances.</p>
494       * <p>
495       * No cells are shared by this table and the returned table.</p>
496       * <b><u>Example</u></b><br/>
497       * Let the following be the HTML for this table:
498       * <table border="1" style="border:solid; border-collapse:collapse;">
499       *   <tr style="background-color:gray">
500       *     <td rowspan="2" style="color:yellow" width="25%">--A--</td>
501       *     <td colspan="2" style="color:white; text-align:center" width="50%">--B--</td>
502       *   </tr>
503       *   <tr style="background-color:#AACCFF">
504       *     <td rowspan="2" colspan="2" style="font-weight:bold; color:red; text-align:center">--C--</td>
505       *     <td>--D--</td>
506       *   </tr>
507       *   <tr style="background-color:#FFCCAA">
508       *     <td>--E--</td>
509       *     <td rowspan="2">--F--</td>
510       *   </tr>
511       *   <tr style="background-color:#CCFFAA">
   *     <td colspan="2" style="text-align:center">--G--</td>
513       *     <td width="25%"></td>
514       *   </tr>
515       * </table>
516       * <br/>
517       * The table created by this method looks like this:
518       * <table style="border:solid; border-collapse:collapse;" border="1">
519       *   <tr style="background-color:gray">
520       *     <td style="color:yellow" width="25%">--A--</td>
521       *     <td style="color:white; text-align:center" width="25%">--B--</td>
522       *     <td style="color:white; text-align:center" width="25%">--B--</td>
523       *     <td></td>
524       *   </tr>
525       *   <tr style="background-color:#AACCFF">
526       *     <td style="color:yellow" width="25%">--A--</td>
527       *     <td style="font-weight:bold; color:red; text-align:center">--C--</td>
528       *     <td style="font-weight:bold; color:red; text-align:center">--C--</td>
529       *     <td>--D--</td>
530       *   </tr>
531       *   <tr style="background-color:#FFCCAA">
532       *     <td>--E--</td>
533       *     <td style="font-weight:bold; color:red; text-align:center">--C--</td>
534       *     <td style="font-weight:bold; color:red; text-align:center">--C--</td>
535       *     <td>--F--</td>
536       *   </tr>
537       *   <tr style="background-color:#CCFFAA">
538       *     <td style="text-align:center">--G--</td>
539       *     <td style="text-align:center">--G--</td>
540       *     <td width="25%"></td>
541       *     <td>--F--</td>
542       *   </tr>
543       * </table>
544       * Notice one side effect: the upper right corner of the original table
545       * has a missing cell; in the new table there is, instead, an empty cell.
546       * <br/><br/>
547       * If the second table is then recompressed via {@link #toCompressedTable()},
548       * it looks like this:
549       * <table style="border:solid; border-collapse:collapse;" border="1">
550       *    <tr style="background-color:gray">
551       *      <td style="color:yellow" width="25%" rowspan="2">--A--</td>
552       *      <td style="color:white; text-align:center" width="50%" colspan="2">--B--</td>
553       *      <td></td>
554       *    </tr>
555       *    <tr style="background-color:#AACCFF">
556       *      <td style="font-weight:bold; color:red; text-align:center" colspan="2" rowspan="2">--C--</td>
557       *      <td>--D--</td>
558       *    </tr>
559       *    <tr style="background-color:#FFCCAA">
560       *      <td>--E--</td>
561       *      <td rowspan="2">--F--</td>
562       *    </tr>
563       *    <tr style="background-color:#CCFFAA">
564       *      <td style="text-align:center" colspan="2">--G--</td>
565       *      <td width="25%"></td>
566       *    </tr>
567       *  </table>
568       * The original table is nearly reproduced.  The only difference is the empty
569       * cell in place of the missing cell in the upper right corner.
570       * <p>
571       * <b>Caveat:</b> the algorithm used for column widths is only approximate.
572       * It is possible to achieve odd-looking widths for some tables.</p>
573       * 
574       * @return an expanded version of this table.
575       */
576      public HtmlTable toExpandedTable()
577      {
578        return HtmlCellGrid.buildFrom(this).makeExpandedTable();
579      }
580      
581      /**
582       * Returns a new table that is an expanded form of this table.
583       * <p>
584       * By "compressed" we mean that adjacent cells in this table that have
585       * equal contents attributes have been merged, to the extent possible,
586       * into one cell with a column span and/or a row span greater than one.</p>
587       * <p>
588       * No cells are shared by this table and the returned table.</p>
589       * <p>
590       * See the example in {@link #toExpandedTable()}; this method first calls
591       * that method, so any problems mentioned in that method's documentation
592       * may have an effect on the results of this method.</p>
593       * 
594       * @return a compressed version of this table.
595       */
596      public HtmlTable toCompressedTable()
597      {
598        return HtmlCellGrid.buildFrom(this).makeCompressedTable();
599      }
600      
601      //============================================================================
602      // 
603      //============================================================================
604      /*
605      public static void main(String[] args) throws Exception
606      {
607        //URL htmlPage = new URL("file:///export/home/calmer/dharland/JUNK/tableTest01.html"); 
608        URL htmlPage = new URL("file:///export/home/calmer/dharland/jdk1.5.0_07/docs/api/overview-summary.html"); 
609        
610        HtmlTable table = new HtmlTable();
611        table.readHtmlFrom(new InputStreamReader(htmlPage.openStream()), 1); //0);
612        StringWriter writer = new StringWriter();
613        table.writeHtmlTo(writer, 2, true);
614        System.out.println(writer.getBuffer());
615        //List<HtmlTable> tables = HtmlTable.readTables(htmlPage);
616        System.out.println();
617        System.out.println("REPORTED ERRORS = " + table.parsingErrors.size());
618        for (String err : table.parsingErrors)
619          System.out.println("  "+err);
620        System.out.println();
621        
622        writer = new StringWriter();
623        table.writeTextTo(writer, StringUtil.EOL, ";");
624        System.out.println(writer.getBuffer());
625        
626        table = new HtmlTable();
627        URL textPage = new URL("file:///export/home/calmer/dharland/JUNK/charTest.txt");
628        table.readTextFrom(new InputStreamReader(textPage.openStream()), ";");
629        writer = new StringWriter();
630        table.writeHtmlTo(writer, 2, true);
631        System.out.println(writer.getBuffer());
632      }
633      */
634    }