001    package edu.nrao.sss.html;
002    
003    import java.util.Enumeration;
004    import java.util.List;
005    
006    import javax.swing.text.MutableAttributeSet;
007    import javax.swing.text.html.HTML;
008    import javax.swing.text.html.HTMLEditorKit;
009    
010    /**
011     * A handler of table-related HTML tags that may be called by subclasses of
012     * {@link javax.swing.text.html.HTMLEditorKit.Parser}. Example of use:
013     * <pre>
014     *     ParserDelegator parser = new ParserDelegator();
015     *
016     *     parser.parse(new InputStreamReader(htmlPage.openStream()),
017     *                  new HtmlTableTagHandler(myTableList), true);
018     * </pre>
019     * <p>
020     * <b>Version Info:</b>
021     * <table style="margin-left:2em">
022     *   <tr><td>$Revision: 545 $</td></tr>
023     *   <tr><td>$Date: 2007-04-19 10:38:45 -0600 (Thu, 19 Apr 2007) $</td></tr>
024     *   <tr><td>$Author: dharland $</td></tr>
025     * </table></p>
026     * 
027     * @author David M. Harland
028     * @since 2007-03-16
029     */
030    public class HtmlTableTagHandler
031      extends HTMLEditorKit.ParserCallback
032    {
033      //One of these two will be null, the other non-null, depending on whether
034      //the client wants one table or a list of all tables.
035      private HtmlTable       table;
036      private List<HtmlTable> tableList;
037      
038      //Used when client wants the nth table
039      private int desiredIndex;
040      private int currentIndex;
041      
042      //Lets all methods know that we're parsing a desired table
043      private boolean readThisTable;
044      
045      //Keeps track of tables within tables within...
046      private int tableDepth;
047      
048      //Our current location in a table
049      private HtmlTableRow  row;
050      private HtmlTableCell cell;
051      
052      //Indicators for the unsupported table section tags
053      private boolean thead;
054      private boolean tbody;
055      private boolean tfoot;
056      
057      //For enabling reuse of this object
058      private boolean parsingIsDone;
059      
060      /**
061       * Creates a handler that will place the n<sup>th</sup> occurrence of an
062       * HTML table presented to it into {@code destination}, where n is
063       * {@code tableOccurrenceToRead}.  Indexing begins with zero.
064       * <p>
065       * <b>Note:</b> a table inside of another table will <i>not</i> be counted
066       * as a new table for purposes of counting occurrences.</p>
067       * 
068       * @param tableOccurrenceToRead the occurrence of an HTML table to be saved.
069       *                              Indexing begins at zero.
070       *                              
071       * @param destination the table to populate.  This value must not be <i>null</i>.
072       *                    If it is, the parsing process will throw a
073       *                    {@link NullPointerException}.
074       */
075      public HtmlTableTagHandler(int tableOccurrenceToRead, HtmlTable destination)
076      {
077        init();
078        
079        desiredIndex = tableOccurrenceToRead;
080        table        = destination;
081        tableList    = null;
082      }
083      
084      /**
085       * Creates a handler that will append all tables parsed to the given list.
086       * 
087       * @param destination the list to which parsed tables will be appended.
088       *                    This list may or may not be empty, but must not be
089       *                    <i>null</i>.
090       *                    If it is, the parsing process will throw a
091       *                    {@link NullPointerException}.
092       */
093      public HtmlTableTagHandler(List<HtmlTable> destination)
094      {
095        init();
096        
097        desiredIndex = -1;
098        table        = null;
099        tableList    = destination;
100      }
101      
102      private void init()
103      {
104        readThisTable = false;
105        currentIndex  = -1;
106        tableDepth    =  0;
107        row           = null;
108        cell          = null;
109        thead         = false;
110        tbody         = false;
111        tfoot         = false;
112        parsingIsDone = false;
113      }
114      
115      public void reset()
116      {
117        init();
118      }
119      
120      public void reset(int tableOccurrenceToRead, HtmlTable destination)
121      {
122        reset();
123        
124        desiredIndex = tableOccurrenceToRead;
125        table        = destination;
126        tableList    = null;
127      }
128    
129      public void reset(List<HtmlTable> destination)
130      {
131        reset();
132        
133        desiredIndex = -1;
134        table        = null;
135        tableList    = destination;
136      }
137    
138      //============================================================================
139      // START TAGS
140      //============================================================================
141      
142      /**
143       * Handles start tags for table, table row, table data, and table header.
144       * Any other start tag is ignored, unless it is inside a {@link HtmlTableCell
145       * table cell}, in which case it is added to the cell's unparsed content
146       * without being interpreted as an HTML element.
147       */
148      @Override
149      public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int pos)
150      {
151        //Force reset
152        if (parsingIsDone)
153          reset();
154        
155        if (tag.equals(HTML.Tag.TABLE))
156          handleTableStartTag(tag, attrs, pos);
157    
158        else if (tag.equals(HTML.Tag.TR))
159          handleRowStartTag(tag, attrs, pos);
160    
161        else if (tag.equals(HTML.Tag.TD) || tag.equals(HTML.Tag.TH))
162          handleCellStartTag(tag, attrs, pos);
163    
164        else //some other tag
165          handleOtherStartTag(tag, attrs, pos);
166      }
167      
168      /** Handles start tag for table. */
169      private void handleTableStartTag(HTML.Tag tag,
170                                       MutableAttributeSet attrs, int pos)
171      {
172        tableDepth++;
173    
174        //Count all top level tables so that we know when we're at the right one
175        if (tableDepth == 1)
176        {
177          currentIndex++;
178          handleNewTable(attrs);
179        }
180        else
181        {
182          addStartTagToCell(tag, attrs, pos, false);
183        }
184      }
185      
186      /** Handles start tag for table row. */
187      private void handleRowStartTag(HTML.Tag tag,
188                                     MutableAttributeSet attrs, int pos)
189      {
190        if (tableDepth < 1)
191        {
192          throw new RuntimeException(
193            "PROGRAMMER ERROR: table should be open if we're handling a row.");
194        }
195        else if (tableDepth == 1)
196        {
197          if (readThisTable)
198          {
199            if (row == null)
200            {
201              row = new HtmlTableRow();
202              row.setType(getRowType());
203              table.addRow(row);
204            }
205            setAttributes(row, attrs);
206          }
207        }
208        else //tableDepth > 1
209        {
210          addStartTagToCell(tag, attrs, pos, false);
211        }
212      }
213      
214      /** Responds to a request to create a new {@link HtmlTable}. */
215      private void handleNewTable(MutableAttributeSet attrs)
216      {
217        if (tableList != null)        //Client wants to read all tables
218        {
219          table = new HtmlTable();
220          setAttributes(table, attrs);
221          tableList.add(table);
222          readThisTable = true;
223        }
224        else                          //Client wants to read single table
225        {
226          if (currentIndex == desiredIndex)
227          {
228            setAttributes(table, attrs);
229            readThisTable = true;
230          }
231        }
232      }
233    
234      /** Handles start tag for table cell. */
235      private void handleCellStartTag(HTML.Tag tag,
236                                      MutableAttributeSet attrs, int pos)
237      {
238        if (tableDepth < 1)
239          throw new RuntimeException(
240          "PROGRAMMER ERROR: table & row should be open if we're handling a cell.");
241    
242        if (tableDepth == 1)
243        {
244          if (readThisTable)
245          {
246            if (cell == null)
247            {
248              cell = new HtmlTableCell(tag.equals(HTML.Tag.TH) ?
249                                       HtmlTableCell.Type.HEADER :
250                                       HtmlTableCell.Type.DATA);
251              row.addCell(cell);
252            }
253            setAttributes(cell, attrs);
254          }
255        }
256        else if (tableDepth > 1)
257        {
258          addStartTagToCell(tag, attrs, pos, false);
259        }
260      }
261      
262      /** Handles non-table start tags. */
263      private void handleOtherStartTag(HTML.Tag tag,
264                                       MutableAttributeSet attrs, int pos)
265      {
266        addStartTagToCell(tag, attrs, pos, false);
267      }
268      
269      private void addStartTagToCell(HTML.Tag tag,
270                                     MutableAttributeSet attrs, int pos,
271                                     boolean isSimpleTag)
272      {
273        if (cell == null)
274          return;
275        
276        StringBuilder buff = cell.getUnparsedContents();
277        
278        //Tag name
279        buff.append('<').append(tag.toString());
280    
281        //Attributes
282        Enumeration<?> names = attrs.getAttributeNames();
283        
284        while (names.hasMoreElements())
285        {
286          Object attrName = names.nextElement();
287          buff.append(' ').append(attrName.toString()).append("=\"")
288              .append(attrs.getAttribute(attrName).toString()).append("\"");
289        }
290    
291        if (isSimpleTag)
292          buff.append("/>");
293        else
294          buff.append('>');
295      }
296    
297      //============================================================================
298      // END TAGS
299      //============================================================================
300    
301      /**
302       * Handles end tags for table, table row, table data, and table header.
303       * Any other start tag is ignored, unless it is inside a {@link HtmlTableCell
304       * table cell}, in which case it is added to the cell's unparsed content
305       * without being interpreted as an HTML element.
306       */
307      @Override
308      public void handleEndTag(HTML.Tag tag, int pos)
309      {
310        //Force reset
311        if (parsingIsDone)
312          reset();
313        
314        if (tag.equals(HTML.Tag.TABLE))
315          handleTableEndTag(tag, pos);
316    
317        else if (tag.equals(HTML.Tag.TR))
318          handleRowEndTag(tag, pos);
319    
320        else if (tag.equals(HTML.Tag.TD) || tag.equals(HTML.Tag.TH))
321          handleCellEndTag(tag, pos);
322    
323        else //some other tag
324          handleOtherEndTag(tag, pos);
325      }
326      
327      /** Handles end tag for table. */
328      private void handleTableEndTag(HTML.Tag tag, int pos)
329      {
330        if (tableDepth <= 0)
331        {
332          throw new RuntimeException("PROGRAMMER ERROR: table depth will be < 0.");
333        }
334        else if (tableDepth > 1)
335        {
336          handleOtherEndTag(tag, pos);
337        }
338        else //tableDepth == 1
339        {
340          readThisTable = false;
341        }
342        
343        tableDepth--;
344      }
345      
346      /** Handles end tag for table row. */
347      private void handleRowEndTag(HTML.Tag tag, int pos)
348      {
349        //We're now assuming proper closing tags for all non-simple tags,
350        //so we do not force the cell closed.
351        if (tableDepth > 1)
352          handleOtherEndTag(tag, pos);
353        else
354          row = null;
355      }
356      
357      /** Handles end tag for table cell. */
358      private void handleCellEndTag(HTML.Tag tag, int pos)
359      {
360        if (tableDepth > 1)
361          handleOtherEndTag(tag, pos);
362        else
363        {
364          //System.out.println("Table " + currentIndex + ", row " + row.getPositionInTable() + ", col " + cell.getPositionInRow() +
365          //                   ": " + cell.getUnparsedContents());
366          cell = null;
367        }
368      }
369      
370      /** Handles non-table end tags. */
371      private void handleOtherEndTag(HTML.Tag tag, int pos)
372      {
373        if (cell != null)
374          cell.getUnparsedContents().append("</").append(tag.toString())
375                                                 .append(">");
376      }
377      
378      //============================================================================
379      // SIMPLE TAGS
380      //============================================================================
381    
382      /**
383       * If the simple tag is inside a table cell, it is added to that cell's
384       * unparsed contents, otherwise it is ignored.
385       */
386      @Override
387      public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs, int pos)
388      {
389        //Force reset
390        if (parsingIsDone)
391          reset();
392        
393        addStartTagToCell(tag, attrs, pos, true);
394      }
395    
396      //============================================================================
397      // 
398      //============================================================================
399    
400      /**
401       * If the text is inside a table cell, it is added to that cell's
402       * unparsed contents, otherwise it is ignored.
403       */
404      @Override
405      public void handleText(char[] data, int pos)
406      {
407        //Force reset
408        if (parsingIsDone)
409          reset();
410        
411        if (cell != null)
412          cell.getUnparsedContents().append(new String(data));
413      }
414      
415      @Override
416      public void flush()
417      {
418        //The parser does not tell us when parsing begins.  We wish it did, so that
419        //we could reset the state of this handler just-in-time.  Instead, we note
420        //here when parsing finished and then look at this flag in all the handler
421        //methods to determine when a reset is necessary.
422        parsingIsDone = true;
423      }
424      
425      //============================================================================
426      // ERRORS
427      //============================================================================
428      
429      @Override
430      public void handleError(String errMsg, int pos)
431      {
432        //Force reset
433        if (parsingIsDone)
434          reset();
435        
436        boolean putError = false;
437        
438        //Java's HTML parser is stuck on HTML 3.2.  It does have a mechanism
439        //for dealing with unknown tags.  Unfortunately for us, though, they
440        //have code in javax.swing.text.html.parser.Parser.ignoreElement that
441        //short-circuits this mechanism for tags inside the <table> element.
442        //This means the new <thead>, <tfoot>, and <tbody> tags are not treated
443        //like HTML.UnknownTag, but are treated instead as parsing errors.
444        //That's why we have the code below.
445        
446        if (errMsg.startsWith("invalid.tagattclass"))
447        {
448          String text = errMsg.replaceAll("\\?", "");
449          text = text.substring("invalid.tagattclass".length());
450    
451          if (!(text.equalsIgnoreCase("table") ||
452                text.equalsIgnoreCase("tr")    ||
453                text.equalsIgnoreCase("th")    ||
454                text.equalsIgnoreCase("td")))
455            putError = readThisTable;
456        }
457        else if (errMsg.startsWith("tag.ignore"))
458        {
459          String text = errMsg.replaceAll("\\?", "");
460          text = text.substring("tag.ignore".length());
461          
462          //This code assumes odd occurrences are open tags, even are close tags
463          if      (text.equalsIgnoreCase("thead"))  thead = !thead;
464          else if (text.equalsIgnoreCase("tbody"))  tbody = !tbody;
465          else if (text.equalsIgnoreCase("tfoot"))  tfoot = !tfoot;
466          else
467            putError = readThisTable;
468        }
469        else
470        {
471          putError = readThisTable;
472        }
473        
474        if (putError)
475          table.parsingErrors.add(errMsg + " at character " + pos +
476                                  " [source=java's DocumentParser].");
477      }
478      
479      //============================================================================
480      // 
481      //============================================================================
482    
483      /**
484       * Converts {@code attributes} into name/value pairs and adds them to
485       * {@code element}.
486       */
487      private void setAttributes(HtmlElement element, MutableAttributeSet attributes)
488      {
489        Enumeration<?> names = attributes.getAttributeNames();
490        
491        while (names.hasMoreElements())
492        {
493          Object attrName = names.nextElement();
494    
495          //Ignore any poorly formed attributes
496          try {
497            element.addAttribute(
498              new HtmlAttribute(attrName.toString(),
499                                attributes.getAttribute(attrName).toString()));
500          }
501          catch (IllegalArgumentException ex) {
502            if (readThisTable)
503              table.parsingErrors.add(ex.getMessage());
504          }
505        }
506      }
507      
508      private HtmlTableRow.Type getRowType()
509      {
510        HtmlTableRow.Type result = null;
511        
512        int type = 0;
513        
514        if (thead) type += 1;
515        if (tbody) type += 2;
516        if (tfoot) type += 4;
517        
518        switch (type)
519        {
520          case 0: result = HtmlTableRow.Type.DATA;   break;
521          case 1: result = HtmlTableRow.Type.HEADER; break;
522          case 2: result = HtmlTableRow.Type.DATA;   break;
523          case 4: result = HtmlTableRow.Type.FOOTER; break;
524          
525          //This means multiple flags are true.  Note error, but call the row DATA.
526          default:
527            table.parsingErrors.add(
528              "More than one of thead, tbody, and tfoot are true; type value = " +
529              type);
530          
531            result = HtmlTableRow.Type.DATA;
532        }
533        
534        return result;
535      }
536    }