001    package edu.nrao.sss.model.source.parser;
002    
003    import java.io.IOException;
004    import java.io.InputStreamReader;
005    import java.net.MalformedURLException;
006    import java.net.URL;
007    import java.util.ArrayList;
008    import java.util.HashMap;
009    import java.util.List;
010    import java.util.Map;
011    import java.util.TreeMap;
012    
013    import javax.swing.text.html.HTML;
014    
015    import edu.nrao.sss.astronomy.CelestialCoordinateSystem;
016    import edu.nrao.sss.astronomy.Epoch;
017    import edu.nrao.sss.astronomy.SimpleSkyPosition;
018    import edu.nrao.sss.astronomy.StokesParameter;
019    import edu.nrao.sss.html.HtmlAnchor;
020    import edu.nrao.sss.html.HtmlTable;
021    import edu.nrao.sss.html.HtmlTableCell;
022    import edu.nrao.sss.html.HtmlTableRow;
023    import edu.nrao.sss.measure.ArcUnits;
024    import edu.nrao.sss.measure.FluxDensity;
025    import edu.nrao.sss.measure.Frequency;
026    import edu.nrao.sss.measure.FrequencyRange;
027    import edu.nrao.sss.measure.FrequencyUnits;
028    import edu.nrao.sss.measure.Latitude;
029    import edu.nrao.sss.measure.Longitude;
030    import edu.nrao.sss.model.source.BrightnessDistribution;
031    import edu.nrao.sss.model.source.CleanFileBrightness;
032    import edu.nrao.sss.model.source.DescriptiveBrightness;
033    import edu.nrao.sss.model.source.FileBasedBrightness;
034    import edu.nrao.sss.model.source.FitsFileBrightness;
035    import edu.nrao.sss.model.source.Source;
036    import edu.nrao.sss.model.source.SourceBrightness;
037    import edu.nrao.sss.model.source.SourceCatalog;
038    import edu.nrao.sss.model.source.SourceCatalogEntry;
039    import edu.nrao.sss.model.source.SourceImageLink;
040    import edu.nrao.sss.model.source.Subsource;
041    
042    /**
043     * A reader that can turn HTML in the form presented in the
044     * <a href="http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html">
045     * 2007a Astro Catalogue</a>
046     * into {@link Source sources}.
047     * <p>
048     * <b>Warning:</b> This reader is not an industrial strength
049     * class and was intended for use by those who are not overly
050     * concerned with the failure to read the HTML.  The main
051     * problem with this class is that several exceptions are
052     * stifled and do not reach clients of this class.</p>
053     * <p>
054     * <b>Version Info:</b>
055     * <table style="margin-left:2em">
056     *   <tr><td>$Revision$</td></tr>
057     *   <tr><td>$Date$</td></tr>
058     *   <tr><td>$Author$ (last person to modify)</td></tr>
059     * </table></p>
060     * 
061     * @author David M. Harland
062     * @since 2007-06-20
063     */
064    public class Astro2007aHtmlTableReader
065    {
066      private static final int NAME_CELL_INDEX = 2;
067      
068      private static final String DEFAULT_PAGE_NAME = 
069        "http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html";
070      
071      private String htmlPageName;
072      
073      private Map<String, HtmlTableRow> parsedRows;
074      private Map<String, Source>       parsedSources;
075      
076      /**
077       * Creates a new reader of the catalog found at
078       * <a href="http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html">
079       * http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html</a>.
080       * 
081       * @throws MalformedURLException if the above URL is malformed.
082       * @throws IOException if the page at the above URL cannot be read.
083       */
084      public Astro2007aHtmlTableReader()
085        throws MalformedURLException, IOException
086      {
087        this(DEFAULT_PAGE_NAME, 2);
088      }
089      
090      /**
091       * Creates a new reader of the catalog found at the URL 
092       * {@code pageName}.
093       * 
094       * @param pageName
095       *          the URL for a catalog in 2007a Astro form.
096       * @param firstDataRow
097       *          the first row of the HTML table found at {@code pageName}
098       *          that contains source data.  Indexing starts at zero.
099       *          Often the first two (indices 0 and 1) rows contain column
100       *          headers.
101       * @throws MalformedURLException
102       *           if {@code pageName} cannot be turned into a URL.
103       * @throws IOException
104       *           if {@code pageName} cannot be read.
105       */
106      public Astro2007aHtmlTableReader(String pageName, int firstDataRow)
107        throws MalformedURLException, IOException
108      {
109        htmlPageName = pageName;
110        
111        URL webPage = new URL(htmlPageName);
112    
113        HtmlTable table = new HtmlTable();
114    
115        table.readHtmlFrom(new InputStreamReader(webPage.openStream()), 0);
116        
117        table.removeEmptyRows();
118        
119        //Put rows in a map, keyed by source name, for quick retrieval
120        parsedRows = new TreeMap<String, HtmlTableRow>();
121    
122        List<HtmlTableRow> rows = table.getRows();
123        int rowCount = rows.size();
124        
125        for (int r=firstDataRow; r < rowCount; r++)
126        {
127          HtmlTableRow row = rows.get(r);
128          parsedRows.put(parseName(row.getCells().get(NAME_CELL_INDEX)), row);
129        }
130    
131        //Will parse sources just-in-time
132        parsedSources = new TreeMap<String, Source>();
133      }
134    
135      /**
136       * Adds to {@code source} any image links contained in this reader that
137       * are associated with a source of the same name.
138       * 
139       * @param source the source to which links might be added.
140       * 
141       * @return <i>true</i> if one or more links were added.
142       */
143      public boolean addImageLinksTo(Source source)
144      {
145        boolean added = false;
146        
147        Source ourSource = getParsedSource(source.getName());
148        
149        //Add the links
150        if (ourSource != null)
151        {
152          ArrayList<String> existingLinks = new ArrayList<String>();
153          
154          for (SourceImageLink link : source.getImageLinks())
155            existingLinks.add(link.getImageLocation().toString());
156          
157          for (SourceImageLink newLink : ourSource.getImageLinks())
158          {
159            if (!existingLinks.contains(newLink.getImageLocation().toString()))
160            {
161              source.getImageLinks().add(newLink);
162              added = true;
163            }
164          }
165        }
166        
167        return added;
168      }
169      
170      /**
171       * Adds to the sources in {@code catalog} any image links contained in this
172       * reader that are associated with sources of the same name.
173       * This is a convenience method that iterates through the sources of
174       * {@code catalog} and calls {@link #addImageLinksTo(Source)}.
175       * 
176       * @param catalog the catalog holding sources to which links might be added.
177       * 
178       * @return <i>true</i> if one or more links were added.
179       */
180      public boolean addImageLinksTo(SourceCatalog catalog)
181      {
182        boolean addedAny = false;
183        
184        for (SourceCatalogEntry entry : catalog.getItems())
185        {
186          if (entry instanceof Source)
187            addedAny |= addImageLinksTo((Source)entry);
188        }
189        
190        return addedAny;
191      }
192    
193      /**
194       * Adds to {@code source} any informational links contained in this
195       * reader that are associated with a source of the same name.
196       * 
197       * @param source the source to which links might be added.
198       * 
199       * @return <i>true</i> if one or more links were added.
200       */
201      public boolean addLinksTo(Source source)
202      {
203        boolean added = false;
204        
205        Source ourSource = getParsedSource(source.getName());
206        
207        //Add the links
208        if (ourSource != null)
209        {
210          List<URL> existingLinks = source.getLinks();
211          
212          for (URL newLink : ourSource.getLinks())
213          {
214            if (!existingLinks.contains(newLink))
215            {
216              source.getLinks().add(newLink);
217              added = true;
218            }
219          }
220        }
221        
222        return added;
223      }
224      
225      /**
226       * Adds to the sources in {@code catalog} any informational links contained
227       * in this reader that are associated with sources of the same name.
228       * This is a convenience method that iterates through the sources of
229       * {@code catalog} and calls {@link #addImageLinksTo(Source)}.
230       * 
231       * @param catalog the catalog holding sources to which links might be added.
232       * 
233       * @return <i>true</i> if one or more links were added.
234       */
235      public boolean addLinksTo(SourceCatalog catalog)
236      {
237        boolean addedAny = false;
238        
239        for (SourceCatalogEntry entry : catalog.getItems())
240        {
241          if (entry instanceof Source)
242            addedAny |= addLinksTo((Source)entry);
243        }
244        
245        return addedAny;
246      }
247    
248      /**
249       * Adds to {@code source} any file-based brightnesses contained in this
250       * reader that are associated with a source of the same name.
251       * 
252       * @param source the source to which brightnesses might be added.
253       * 
254       * @return <i>true</i> if one or more brightnesses were added.
255       */
256      public boolean addFileBrightnessesTo(Source source)
257      {
258        boolean added = false;
259        
260        Source ourSource = getParsedSource(source.getName());
261    
262        //Add the files
263        if (ourSource != null)
264        {
265          ArrayList<String> existingFiles = new ArrayList<String>();
266          
267          for (SourceBrightness sb : source.getCentralSubsource().getBrightnesses())
268          {
269            if (sb instanceof FileBasedBrightness)
270              existingFiles.add(
271                ((FileBasedBrightness)sb).getBrightnessFile().toString());
272          }
273          
274          for (SourceBrightness sb :
275                 ourSource.getCentralSubsource().getBrightnesses())
276          {
277            if (sb instanceof FileBasedBrightness)
278            {
279              FileBasedBrightness newFile = (FileBasedBrightness)sb;
280              
281              if (!existingFiles.contains(newFile.getBrightnessFile().toString()))
282              {
283                source.getCentralSubsource().addBrightness(newFile);
284                added = true;
285              }
286            }
287          }
288        }
289        
290        return added;
291      }
292      
293      /**
294       * Adds to the sources in {@code catalog} any file-based brightnesses
295       * contained in this
296       * reader that are associated with sources of the same name.
297       * This is a convenience method that iterates through the sources of
298       * {@code catalog} and calls {@link #addFileBrightnessesTo(Source)}.
299       * 
300       * @param catalog the catalog holding sources to which brightnesses
301       *                might be added.
302       * 
303       * @return <i>true</i> if one or more brightnesses were added.
304       */
305      public boolean addFileBrightnessesTo(SourceCatalog catalog)
306      {
307        boolean addedAny = false;
308        
309        for (SourceCatalogEntry entry : catalog.getItems())
310        {
311          if (entry instanceof Source)
312            addedAny |= addFileBrightnessesTo((Source)entry);
313        }
314        
315        return addedAny;
316      }
317      
318      //============================================================================
319      // PARSING THE HTML TABLE ROW
320      //============================================================================
321      
322      private Source sourceBeingParsed;
323    
324      /**
325       * Returns already parsed source or performs just-in-time parsing.
326       * Returns null if page had no such source.
327       */
328      private Source getParsedSource(String sourceName)
329      {
330        //See if we already parsed this source
331        Source source = parsedSources.get(sourceName);
332        
333        //If not, try just in time parsing
334        if (source == null)
335        {
336          source = parseRow(parsedRows.get(sourceName));
337          
338          if (source != null)
339            parsedSources.put(sourceName, source);
340        }
341      
342        return source;
343      }
344    
345      /** Turns an HTML Table Row into a source. */
346      private Source parseRow(HtmlTableRow row)
347      {
348        //Quick exit for null row
349        if (row == null)
350          return null;
351        
352        sourceBeingParsed = new Source();
353    
354        List<HtmlTableCell> cells = row.getCells();
355        
356        //Column 0 has unneeded info
357        
358        //Do name first to help err msgs
359        sourceBeingParsed.setName(parseName(cells.get(2)));
360        parseAlias(cells.get(1));
361        
362        Subsource ss = sourceBeingParsed.getCentralSubsource();
363        
364        SimpleSkyPosition ssp = new SimpleSkyPosition();
365        ssp.setCoordinateSystem(CelestialCoordinateSystem.EQUATORIAL);
366        ssp.setEpoch(Epoch.J2000);
367        if (htmlPageName.equals(DEFAULT_PAGE_NAME))
368          ssp.setOriginOfInformation("L. Petrov, solution 2007a_astro (unpublished, available on the Web at http://vlbi.gsfc.nasa.gov/solutions/2007a_astro)");
369        else
370          ssp.setOriginOfInformation(htmlPageName);
371        ss.setPosition(ssp);
372        
373        parseLink(cells.get(2), sourceBeingParsed);
374        parseRA(cells.get(3), ssp);
375        parseDec(cells.get(4), ssp);
376        parseRAUncertainty(cells.get(5), ssp);
377        parseDecUncertainty(cells.get(6), ssp);
378        
379        //Column 7,8,9 have unneeded info
380        
381        parse10Thru15(cells, ss);
382        
383        sourceBeingParsed.setOriginOfInformation(htmlPageName);
384        
385        return sourceBeingParsed;
386      }
387      
388      //Alias (column 1)
389      private void parseAlias(HtmlTableCell cell)
390      {
391        HtmlAnchor anchor =
392          HtmlAnchor.parse(cell.getUnparsedContents().toString());
393          
394        sourceBeingParsed.getAliases().add(anchor.getDisplayText().trim());
395          
396        String imageName = anchor.getAttributeValue(HTML.Attribute.HREF);
397        if (imageName.length() > 0)
398        {
399          SourceImageLink image = new SourceImageLink();
400          image.setDisplayName("VLBI Spectrum");
401          image.setComments("Flux vs. Frequency");
402          image.setPolarization(StokesParameter.I);
403          try
404          {
405            image.setImageLocation(new URL(imageName));
406            sourceBeingParsed.getImageLinks().add(image);
407          }
408          catch (MalformedURLException ex)
409          {
410            //simply not adding a link
411          }
412        }
413      }
414    
415      //J2000 Name (column 2)
416      private String parseName(HtmlTableCell cell)
417      {
418        HtmlAnchor anchor =
419          HtmlAnchor.parse(cell.getUnparsedContents().toString());
420          
421        return anchor.getDisplayText().trim();
422      }
423      
424      private void parseLink(HtmlTableCell cell, Source src)
425      {
426        HtmlAnchor anchor =
427          HtmlAnchor.parse(cell.getUnparsedContents().toString());
428        
429        try {
430          src.getLinks().add(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
431        }
432        catch (Exception ex) {
433          //skip the link
434        }
435      }
436      
437      //RA (column 3)
438      private void parseRA(HtmlTableCell cell, SimpleSkyPosition ssp)
439      {
440        Longitude ra = Longitude.parse(cell.getUnparsedContents().toString());
441          
442        ssp.setLongitude(ra);
443      }
444      
445      //Dec (column 4)
446      private void parseDec(HtmlTableCell cell, SimpleSkyPosition ssp)
447      {
448        Latitude dec = Latitude.parse(cell.getUnparsedContents().toString());
449          
450        ssp.setLatitude(dec);
451      }
452      
453      //RA error (column 5)
454      private void parseRAUncertainty(HtmlTableCell cell, SimpleSkyPosition ssp)
455      {
456        String mas = cell.getUnparsedContents().toString();
457          
458        ssp.setLongitudeUncertainty(new Longitude(mas, ArcUnits.MILLI_ARC_SECOND));
459      }
460      
461      //Dec error (column 6)
462      private void parseDecUncertainty(HtmlTableCell cell, SimpleSkyPosition ssp)
463      {
464        try
465        {
466          String mas = cell.getUnparsedContents().toString();
467          
468          ssp.setLatitudeUncertainty(new Latitude(mas, ArcUnits.MILLI_ARC_SECOND));
469        }
470        catch (Exception ex)
471        {
472          //A known error
473          if (htmlPageName.equals(DEFAULT_PAGE_NAME) &&
474              sourceBeingParsed.getName().equals("J0116-2052"))
475          {
476            ssp.setLatitudeUncertainty(new Latitude("999.99", ArcUnits.MILLI_ARC_SECOND));
477          }
478        }
479      }
480      
481      //Handles flux, images, and CLEAN/FITS files (columns 10-15)
482      private void parse10Thru15(List<HtmlTableCell> cells, Subsource ss)
483      {
484        Map<String, String> xMap = new HashMap<String, String>();
485        Map<String, String> sMap = new HashMap<String, String>();
486        
487        preParse10Thru15(cells, xMap, sMap);
488        
489        parseFlux("X", xMap, ss);  //columns 10 & 11
490        parseFlux("S", sMap, ss);  //columns 10 & 11
491        
492        parseImage("X", xMap);     //columns 12 & 13
493        parseImage("S", sMap);     //columns 12 & 13
494        
495        parseClean("X", xMap, ss); //column 14
496        parseClean("S", sMap, ss); //column 14
497        
498        parseFits("X", xMap, ss);  //column 15
499        parseFits("S", sMap, ss);  //column 15
500      }
501      
502      //Break the text up into manageable pieces
503      private void preParse10Thru15(List<HtmlTableCell> cells,
504                                    Map<String, String> xMap,
505                                    Map<String, String> sMap)
506      {
507        for (int c=10; c <= 15; c++)
508        {
509          HtmlTableCell cell = cells.get(c);
510          
511          String text = cell.getUnparsedContents().toString();
512          text = text.replaceAll("<tt>", "").replaceAll("</tt>", "");
513          text = text.replaceAll("\u00A0", "");  //0xA0 = &nbsp;
514          
515          String bands[] = text.split("<br/>");
516          String key     = Integer.toString(c);
517          
518          for (int b=0; b < bands.length; b++)
519          {
520            int firstColonPos = bands[b].indexOf(':');
521            
522            String bandId = bands[b].substring(0, firstColonPos);
523            String value  = bands[b].substring(firstColonPos + 1);
524            
525            if      (bandId.equalsIgnoreCase("X"))  xMap.put(key, value);
526            else if (bandId.equalsIgnoreCase("S"))  sMap.put(key, value);
527          }
528        }
529      }
530      
531      //VLBA receiver frequency ranges
532      private static final FrequencyRange X_BAND_RANGE =
533        new FrequencyRange(new Frequency("8.0", FrequencyUnits.GIGAHERTZ),
534                           new Frequency("8.8", FrequencyUnits.GIGAHERTZ));
535      
536      private static final FrequencyRange S_BAND_RANGE =
537        new FrequencyRange(new Frequency("2.15", FrequencyUnits.GIGAHERTZ),
538                           new Frequency("2.35", FrequencyUnits.GIGAHERTZ));
539    
540      //Total & peak flux (columns 10, 11)
541      private void parseFlux(String bandId, Map<String, String> map, Subsource ss)
542      {
543        String textTotal = map.get("10").trim();
544        String textUnres = map.get("11").trim();
545        
546        boolean noTotal = textTotal.equalsIgnoreCase("n/a");
547        boolean noUnres = textUnres.equalsIgnoreCase("n/a");
548        
549        //Quick exit if no flux
550        if (noTotal && noUnres)
551          return;
552        
553        DescriptiveBrightness sb = (DescriptiveBrightness)
554          SourceBrightness.createBrightness(BrightnessDistribution.POINT);
555        
556        ss.addBrightness(sb);
557        
558        if      (bandId.equals("X"))  sb.setValidFrequency(X_BAND_RANGE.clone());
559        else if (bandId.equals("S"))  sb.setValidFrequency(S_BAND_RANGE.clone());
560    
561        sb.setPolarization(StokesParameter.I);
562    
563        //TODO Set valid time?
564        
565        if (!noTotal)
566          setFlux(sb, textTotal, bandId, 10);
567        
568        if (!noUnres)
569          setFlux(sb, textUnres, bandId, 11);
570      }
571      
572      private void setFlux(DescriptiveBrightness sb,
573                           String fluxText, String bandId, int col)
574      {
575        if (col == 10)
576          sb.setTotalFluxDensity(new FluxDensity(fluxText));
577        else if (col == 11)
578          sb.setPeakFluxDensity(new FluxDensity(fluxText));
579      }
580      
581      //"CLEAN Map" & "Rad Plot" (columns 12, 13)
582      private void parseImage(String bandId, Map<String, String> map)
583      {
584        String textClean = map.get("12").trim();
585        String textRad   = map.get("13").trim();
586        
587        boolean noClean = textClean.equalsIgnoreCase("n/a");
588        boolean noRad   = textRad.equalsIgnoreCase("n/a");
589        
590        //Quick images
591        if (noClean && noRad)
592          return;
593        
594        if (!noClean)
595          setImage(textClean, bandId, 12);
596        
597        if (!noRad)
598          setImage(textRad, bandId, 13);
599      }
600      
601      //TODO Talk to BT re: HTML in comments
602      private static final String[] IMAGE_COMMENTS =
603      {
604        "Naturally weighted CLEAN image" /* (" +
605        "<a href=\"http://vlbi.gsfc.nasa.gov/vcs/misc/image_ps_explain.html\">details</a>)." */
606        ,
607        "Correlated flux density vs. projected baseline length" /* (" +
608        "<a href=\"http://vlbi.gsfc.nasa.gov/vcs/misc/rad_plot_explain.html\">details</a>)." */
609      };
610      
611      private static final int COMMENT_COL_OFFSET = 12;
612      
613      private void setImage(String anchorText, String bandId, int col)
614      {
615        String gHz = bandId.equals("X") ? "8.4" : "2.2";
616    
617        HtmlAnchor anchor = HtmlAnchor.parse(anchorText);
618          
619        SourceImageLink image = new SourceImageLink();
620        image.setDisplayName(anchor.getDisplayText());
621        try
622        {
623          image.setImageLocation(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
624          image.setFrequency(new Frequency(gHz));
625          image.setPolarization(StokesParameter.I);
626          image.setComments(IMAGE_COMMENTS[col-COMMENT_COL_OFFSET]);
627          
628          sourceBeingParsed.getImageLinks().add(image);
629        }
630        catch (Exception ex)
631        {
632          //simply not adding a link
633        }
634      }
635      
636      //"CLEAN Map Fits" (column 14)
637      private void parseClean(String bandId, Map<String, String> map, Subsource ss)
638      {
639        String anchorText = map.get("14").trim();
640        
641        if (anchorText.equalsIgnoreCase("n/a"))
642          return;
643        
644        HtmlAnchor anchor = HtmlAnchor.parse(anchorText);
645        CleanFileBrightness cfb = (CleanFileBrightness) 
646          SourceBrightness.createBrightness(BrightnessDistribution.CLEAN_COMPONENTS_FILE);
647        try
648        {
649          cfb.setBrightnessFile(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
650          ss.addBrightness(cfb);
651        }
652        catch (Exception ex)
653        {
654          //simply not adding brightness
655        }
656      }
657      
658      //"UV Data Fits" (column 15)
659      private void parseFits(String bandId, Map<String, String> map, Subsource ss)
660      {
661        String anchorText = map.get("15").trim();
662        
663        if (anchorText.equalsIgnoreCase("n/a"))
664          return;
665        
666        HtmlAnchor anchor = HtmlAnchor.parse(anchorText);
667        FitsFileBrightness ffb = (FitsFileBrightness) 
668          SourceBrightness.createBrightness(BrightnessDistribution.FITS_FILE);
669        try
670        {
671          ffb.setBrightnessFile(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
672          ss.addBrightness(ffb);
673        }
674        catch (Exception ex)
675        {
676          //simply not adding brightness
677        }
678      }
679    }