package edu.nrao.sss.model.source.parser;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import javax.swing.text.html.HTML;

import edu.nrao.sss.astronomy.CelestialCoordinateSystem;
import edu.nrao.sss.astronomy.Epoch;
import edu.nrao.sss.astronomy.SimpleSkyPosition;
import edu.nrao.sss.astronomy.StokesParameter;
import edu.nrao.sss.html.HtmlAnchor;
import edu.nrao.sss.html.HtmlTable;
import edu.nrao.sss.html.HtmlTableCell;
import edu.nrao.sss.html.HtmlTableRow;
import edu.nrao.sss.measure.ArcUnits;
import edu.nrao.sss.measure.FluxDensity;
import edu.nrao.sss.measure.Frequency;
import edu.nrao.sss.measure.FrequencyRange;
import edu.nrao.sss.measure.FrequencyUnits;
import edu.nrao.sss.measure.Latitude;
import edu.nrao.sss.measure.Longitude;
import edu.nrao.sss.model.source.BrightnessDistribution;
import edu.nrao.sss.model.source.CleanFileBrightness;
import edu.nrao.sss.model.source.DescriptiveBrightness;
import edu.nrao.sss.model.source.FileBasedBrightness;
import edu.nrao.sss.model.source.FitsFileBrightness;
import edu.nrao.sss.model.source.Source;
import edu.nrao.sss.model.source.SourceBrightness;
import edu.nrao.sss.model.source.SourceCatalog;
import edu.nrao.sss.model.source.SourceCatalogEntry;
import edu.nrao.sss.model.source.SourceImageLink;
import edu.nrao.sss.model.source.Subsource;

/**
 * A reader that can turn HTML in the form presented in the
 * <a href="http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html">
 * 2007a Astro Catalogue</a>
 * into {@link Source sources}.
 * <p>
 * <b>Warning:</b> This reader is not an industrial strength
 * class and was intended for use by those who are not overly
 * concerned with the failure to read the HTML.  The main
 * problem with this class is that several exceptions are
 * stifled and do not reach clients of this class.</p>
 * <p>
 * Rows are read from the page once, at construction time, and are turned
 * into sources lazily, the first time a source of a given name is asked
 * for.  Parsing keeps per-row state in an instance field, so a single
 * instance should not be shared between threads without external
 * synchronization.</p>
 * <p>
 * <b>Version Info:</b>
 * <table style="margin-left:2em">
 *   <tr><td>$Revision$</td></tr>
 *   <tr><td>$Date$</td></tr>
 *   <tr><td>$Author$ (last person to modify)</td></tr>
 * </table></p>
 *
 * @author David M. Harland
 * @since 2007-06-20
 */
public class Astro2007aHtmlTableReader
{
  private static final int NAME_CELL_INDEX = 2;

  private static final String DEFAULT_PAGE_NAME =
    "http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html";

  //Text used by the catalog page for "no value in this cell/band"
  private static final String NOT_AVAILABLE = "n/a";

  private String htmlPageName;

  private Map<String, HtmlTableRow> parsedRows;
  private Map<String, Source>       parsedSources;

  /**
   * Creates a new reader of the catalog found at
   * <a href="http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html">
   * http://vlbi.gsfc.nasa.gov/solutions/2007a_astro/2007a_astro_cat.html</a>.
   *
   * @throws MalformedURLException if the above URL is malformed.
   * @throws IOException if the page at the above URL cannot be read.
   */
  public Astro2007aHtmlTableReader()
    throws MalformedURLException, IOException
  {
    this(DEFAULT_PAGE_NAME, 2);
  }

  /**
   * Creates a new reader of the catalog found at the URL
   * {@code pageName}.
   *
   * @param pageName
   *   the URL for a catalog in 2007a Astro form.
   * @param firstDataRow
   *   the first row of the HTML table found at {@code pageName}
   *   that contains source data.  Indexing starts at zero.
   *   Often the first two (indices 0 and 1) rows contain column
   *   headers.
   * @throws MalformedURLException
   *   if {@code pageName} cannot be turned into a URL.
   * @throws IOException
   *   if {@code pageName} cannot be read.
   */
  public Astro2007aHtmlTableReader(String pageName, int firstDataRow)
    throws MalformedURLException, IOException
  {
    htmlPageName = pageName;

    URL webPage = new URL(htmlPageName);

    HtmlTable table = new HtmlTable();

    //Always release the network stream, even if reading the table fails
    InputStreamReader pageReader = new InputStreamReader(webPage.openStream());
    try
    {
      table.readHtmlFrom(pageReader, 0);
    }
    finally
    {
      pageReader.close();
    }

    table.removeEmptyRows();

    //Put rows in a map, keyed by source name, for quick retrieval
    parsedRows = new TreeMap<String, HtmlTableRow>();

    List<HtmlTableRow> rows = table.getRows();
    int rowCount = rows.size();

    for (int r=firstDataRow; r < rowCount; r++)
    {
      HtmlTableRow        row   = rows.get(r);
      List<HtmlTableCell> cells = row.getCells();

      //A row too short to hold a name cell cannot describe a source; skip
      //it rather than failing construction of the whole reader.
      if (cells.size() <= NAME_CELL_INDEX)
        continue;

      parsedRows.put(parseName(cells.get(NAME_CELL_INDEX)), row);
    }

    //Will parse sources just-in-time
    parsedSources = new TreeMap<String, Source>();
  }

  /**
   * Adds to {@code source} any image links contained in this reader that
   * are associated with a source of the same name.
   * A link is considered already present if {@code source} holds a link
   * whose image location has the same text form.
   *
   * @param source the source to which links might be added.
   *
   * @return <i>true</i> if one or more links were added.
   */
  public boolean addImageLinksTo(Source source)
  {
    Source ourSource = getParsedSource(source.getName());

    if (ourSource == null)
      return false;

    //Compare locations by their text form
    Set<String> knownLocations = new HashSet<String>();

    for (SourceImageLink link : source.getImageLinks())
      knownLocations.add(String.valueOf(link.getImageLocation()));

    boolean added = false;

    for (SourceImageLink newLink : ourSource.getImageLinks())
    {
      //Set.add is false for a location we have already seen, including
      //one added earlier in this same loop
      if (knownLocations.add(String.valueOf(newLink.getImageLocation())))
      {
        source.getImageLinks().add(newLink);
        added = true;
      }
    }

    return added;
  }

  /**
   * Adds to the sources in {@code catalog} any image links contained in this
   * reader that are associated with sources of the same name.
   * This is a convenience method that iterates through the sources of
   * {@code catalog} and calls {@link #addImageLinksTo(Source)}.
   *
   * @param catalog the catalog holding sources to which links might be added.
   *
   * @return <i>true</i> if one or more links were added.
   */
  public boolean addImageLinksTo(SourceCatalog catalog)
  {
    boolean addedAny = false;

    for (SourceCatalogEntry entry : catalog.getItems())
    {
      if (entry instanceof Source)
        addedAny |= addImageLinksTo((Source)entry);
    }

    return addedAny;
  }

  /**
   * Adds to {@code source} any informational links contained in this
   * reader that are associated with a source of the same name.
   * A link is considered already present if {@code source} holds a link
   * with the same text form.
   *
   * @param source the source to which links might be added.
   *
   * @return <i>true</i> if one or more links were added.
   */
  public boolean addLinksTo(Source source)
  {
    Source ourSource = getParsedSource(source.getName());

    if (ourSource == null)
      return false;

    //Compare the text form of the links, as addImageLinksTo does.
    //URL.equals may perform host name resolution, which can block.
    Set<String> knownLinks = new HashSet<String>();

    for (URL link : source.getLinks())
      knownLinks.add(String.valueOf(link));

    boolean added = false;

    for (URL newLink : ourSource.getLinks())
    {
      if (knownLinks.add(String.valueOf(newLink)))
      {
        source.getLinks().add(newLink);
        added = true;
      }
    }

    return added;
  }

  /**
   * Adds to the sources in {@code catalog} any informational links contained
   * in this reader that are associated with sources of the same name.
   * This is a convenience method that iterates through the sources of
   * {@code catalog} and calls {@link #addLinksTo(Source)}.
   *
   * @param catalog the catalog holding sources to which links might be added.
   *
   * @return <i>true</i> if one or more links were added.
   */
  public boolean addLinksTo(SourceCatalog catalog)
  {
    boolean addedAny = false;

    for (SourceCatalogEntry entry : catalog.getItems())
    {
      if (entry instanceof Source)
        addedAny |= addLinksTo((Source)entry);
    }

    return addedAny;
  }

  /**
   * Adds to {@code source} any file-based brightnesses contained in this
   * reader that are associated with a source of the same name.
   * Only the central subsources of the two sources are examined.
   * A brightness is considered already present if the central subsource of
   * {@code source} holds a file-based brightness whose file has the same
   * text form.
   *
   * @param source the source to which brightnesses might be added.
   *
   * @return <i>true</i> if one or more brightnesses were added.
   */
  public boolean addFileBrightnessesTo(Source source)
  {
    Source ourSource = getParsedSource(source.getName());

    if (ourSource == null)
      return false;

    Set<String> knownFiles = new HashSet<String>();

    for (SourceBrightness sb : source.getCentralSubsource().getBrightnesses())
    {
      if (sb instanceof FileBasedBrightness)
        knownFiles.add(
          String.valueOf(((FileBasedBrightness)sb).getBrightnessFile()));
    }

    boolean added = false;

    for (SourceBrightness sb :
         ourSource.getCentralSubsource().getBrightnesses())
    {
      if (sb instanceof FileBasedBrightness)
      {
        FileBasedBrightness newFile = (FileBasedBrightness)sb;

        if (knownFiles.add(String.valueOf(newFile.getBrightnessFile())))
        {
          source.getCentralSubsource().addBrightness(newFile);
          added = true;
        }
      }
    }

    return added;
  }

  /**
   * Adds to the sources in {@code catalog} any file-based brightnesses
   * contained in this
   * reader that are associated with sources of the same name.
   * This is a convenience method that iterates through the sources of
   * {@code catalog} and calls {@link #addFileBrightnessesTo(Source)}.
   *
   * @param catalog the catalog holding sources to which brightnesses
   *                might be added.
   *
   * @return <i>true</i> if one or more brightnesses were added.
   */
  public boolean addFileBrightnessesTo(SourceCatalog catalog)
  {
    boolean addedAny = false;

    for (SourceCatalogEntry entry : catalog.getItems())
    {
      if (entry instanceof Source)
        addedAny |= addFileBrightnessesTo((Source)entry);
    }

    return addedAny;
  }

  //============================================================================
  // PARSING THE HTML TABLE ROW
  //============================================================================

  //The source currently under construction by parseRow; read by the
  //cell-level helpers that need the source itself (aliases, image links,
  //name for the known-error check).
  private Source sourceBeingParsed;

  /**
   * Returns already parsed source or performs just-in-time parsing.
   * Returns null if page had no such source, or if {@code sourceName}
   * is null.
   */
  private Source getParsedSource(String sourceName)
  {
    //TreeMap does not accept null keys; a nameless source has no match
    if (sourceName == null)
      return null;

    //See if we already parsed this source
    Source source = parsedSources.get(sourceName);

    //If not, try just in time parsing
    if (source == null)
    {
      source = parseRow(parsedRows.get(sourceName));

      if (source != null)
        parsedSources.put(sourceName, source);
    }

    return source;
  }

  /**
   * Turns an HTML Table Row into a source.
   * Returns null if {@code row} is null.
   */
  private Source parseRow(HtmlTableRow row)
  {
    //Quick exit for null row
    if (row == null)
      return null;

    sourceBeingParsed = new Source();

    List<HtmlTableCell> cells = row.getCells();

    //Column 0 has unneeded info

    //Do name first to help err msgs
    sourceBeingParsed.setName(parseName(cells.get(NAME_CELL_INDEX)));
    parseAlias(cells.get(1));

    Subsource ss = sourceBeingParsed.getCentralSubsource();

    SimpleSkyPosition ssp = new SimpleSkyPosition();
    ssp.setCoordinateSystem(CelestialCoordinateSystem.EQUATORIAL);
    ssp.setEpoch(Epoch.J2000);
    if (htmlPageName.equals(DEFAULT_PAGE_NAME))
      ssp.setOriginOfInformation("L. Petrov, solution 2007a_astro (unpublished, available on the Web at http://vlbi.gsfc.nasa.gov/solutions/2007a_astro)");
    else
      ssp.setOriginOfInformation(htmlPageName);
    ss.setPosition(ssp);

    parseLink(cells.get(NAME_CELL_INDEX), sourceBeingParsed);
    parseRA(cells.get(3), ssp);
    parseDec(cells.get(4), ssp);
    parseRAUncertainty(cells.get(5), ssp);
    parseDecUncertainty(cells.get(6), ssp);

    //Column 7,8,9 have unneeded info

    parse10Thru15(cells, ss);

    sourceBeingParsed.setOriginOfInformation(htmlPageName);

    return sourceBeingParsed;
  }

  /**
   * Alias (column 1).  The anchor's display text becomes an alias of the
   * source being parsed; the anchor's HREF, if usable as a URL, becomes
   * an image link named "VLBI Spectrum".
   */
  private void parseAlias(HtmlTableCell cell)
  {
    HtmlAnchor anchor =
      HtmlAnchor.parse(cell.getUnparsedContents().toString());

    sourceBeingParsed.getAliases().add(anchor.getDisplayText().trim());

    String imageName = anchor.getAttributeValue(HTML.Attribute.HREF);

    //No HREF (null or empty) means there is no image to link to
    if (imageName != null && imageName.length() > 0)
    {
      SourceImageLink image = new SourceImageLink();
      image.setDisplayName("VLBI Spectrum");
      image.setComments("Flux vs. Frequency");
      image.setPolarization(StokesParameter.I);
      try
      {
        image.setImageLocation(new URL(imageName));
        sourceBeingParsed.getImageLinks().add(image);
      }
      catch (MalformedURLException ex)
      {
        //simply not adding a link
      }
    }
  }

  /** J2000 Name (column 2).  Returns the trimmed display text of the anchor. */
  private String parseName(HtmlTableCell cell)
  {
    HtmlAnchor anchor =
      HtmlAnchor.parse(cell.getUnparsedContents().toString());

    return anchor.getDisplayText().trim();
  }

  /**
   * Link behind the J2000 Name (column 2).  The anchor's HREF is added to
   * the links of {@code src}; an unusable HREF is silently skipped.
   */
  private void parseLink(HtmlTableCell cell, Source src)
  {
    HtmlAnchor anchor =
      HtmlAnchor.parse(cell.getUnparsedContents().toString());

    try
    {
      src.getLinks().add(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
    }
    catch (Exception ex)
    {
      //skip the link
    }
  }

  /** RA (column 3). */
  private void parseRA(HtmlTableCell cell, SimpleSkyPosition ssp)
  {
    Longitude ra = Longitude.parse(cell.getUnparsedContents().toString());

    ssp.setLongitude(ra);
  }

  /** Dec (column 4). */
  private void parseDec(HtmlTableCell cell, SimpleSkyPosition ssp)
  {
    Latitude dec = Latitude.parse(cell.getUnparsedContents().toString());

    ssp.setLatitude(dec);
  }

  /** RA error (column 5); the cell text is taken to be in milliarcseconds. */
  private void parseRAUncertainty(HtmlTableCell cell, SimpleSkyPosition ssp)
  {
    String mas = cell.getUnparsedContents().toString();

    ssp.setLongitudeUncertainty(new Longitude(mas, ArcUnits.MILLI_ARC_SECOND));
  }

  /**
   * Dec error (column 6); the cell text is taken to be in milliarcseconds.
   * A cell that cannot be turned into a latitude leaves the uncertainty
   * unset, except for one known bad entry on the default page, which is
   * given the value 999.99.
   */
  private void parseDecUncertainty(HtmlTableCell cell, SimpleSkyPosition ssp)
  {
    try
    {
      String mas = cell.getUnparsedContents().toString();

      ssp.setLatitudeUncertainty(new Latitude(mas, ArcUnits.MILLI_ARC_SECOND));
    }
    catch (Exception ex)
    {
      //A known error
      if (htmlPageName.equals(DEFAULT_PAGE_NAME) &&
          sourceBeingParsed.getName().equals("J0116-2052"))
      {
        ssp.setLatitudeUncertainty(new Latitude("999.99", ArcUnits.MILLI_ARC_SECOND));
      }
    }
  }

  /** Handles flux, images, and CLEAN/FITS files (columns 10-15). */
  private void parse10Thru15(List<HtmlTableCell> cells, Subsource ss)
  {
    Map<String, String> xMap = new HashMap<String, String>();
    Map<String, String> sMap = new HashMap<String, String>();

    preParse10Thru15(cells, xMap, sMap);

    parseFlux("X", xMap, ss);  //columns 10 & 11
    parseFlux("S", sMap, ss);  //columns 10 & 11

    parseImage("X", xMap);     //columns 12 & 13
    parseImage("S", sMap);     //columns 12 & 13

    parseClean("X", xMap, ss); //column 14
    parseClean("S", sMap, ss); //column 14

    parseFits("X", xMap, ss);  //column 15
    parseFits("S", sMap, ss);  //column 15
  }

  /**
   * Breaks the text of columns 10-15 up into manageable pieces.
   * Each cell is expected to hold fragments of the form
   * <tt>band:value</tt> separated by <tt>&lt;br/&gt;</tt>.  The value of
   * an "X" fragment is stored in {@code xMap}, that of an "S" fragment in
   * {@code sMap}, each keyed by the column number as text ("10".."15").
   * Fragments without a colon, fragments for other bands, and columns the
   * row does not have are ignored.
   */
  private void preParse10Thru15(List<HtmlTableCell> cells,
                                Map<String, String> xMap,
                                Map<String, String> sMap)
  {
    //Do not run past the end of a short row
    int lastColumn = Math.min(15, cells.size() - 1);

    for (int c=10; c <= lastColumn; c++)
    {
      HtmlTableCell cell = cells.get(c);

      String text = cell.getUnparsedContents().toString();
      text = text.replace("<tt>", "").replace("</tt>", "");
      text = text.replace("\u00A0", ""); //0xA0 = non-breaking space (&nbsp;)

      String[] bands = text.split("<br/>");
      String   key   = Integer.toString(c);

      for (int b=0; b < bands.length; b++)
      {
        int firstColonPos = bands[b].indexOf(':');

        //No band identifier in this fragment; nothing to record
        if (firstColonPos < 0)
          continue;

        String bandId = bands[b].substring(0, firstColonPos);
        String value  = bands[b].substring(firstColonPos + 1);

        if      (bandId.equalsIgnoreCase("X"))  xMap.put(key, value);
        else if (bandId.equalsIgnoreCase("S"))  sMap.put(key, value);
      }
    }
  }

  /**
   * Returns the trimmed text stored for {@code column}, or "n/a" if the
   * map holds nothing for that column.
   */
  private static String columnText(Map<String, String> map, String column)
  {
    String text = map.get(column);

    return (text == null) ? NOT_AVAILABLE : text.trim();
  }

  //VLBA receiver frequency ranges
  private static final FrequencyRange X_BAND_RANGE =
    new FrequencyRange(new Frequency("8.0", FrequencyUnits.GIGAHERTZ),
                       new Frequency("8.8", FrequencyUnits.GIGAHERTZ));

  private static final FrequencyRange S_BAND_RANGE =
    new FrequencyRange(new Frequency("2.15", FrequencyUnits.GIGAHERTZ),
                       new Frequency("2.35", FrequencyUnits.GIGAHERTZ));

  /**
   * Total & peak flux (columns 10, 11).  Adds a point brightness to
   * {@code ss} unless both columns are "n/a" (or missing) for this band.
   */
  private void parseFlux(String bandId, Map<String, String> map, Subsource ss)
  {
    String textTotal = columnText(map, "10");
    String textUnres = columnText(map, "11");

    boolean noTotal = textTotal.equalsIgnoreCase(NOT_AVAILABLE);
    boolean noUnres = textUnres.equalsIgnoreCase(NOT_AVAILABLE);

    //Quick exit if no flux
    if (noTotal && noUnres)
      return;

    DescriptiveBrightness sb = (DescriptiveBrightness)
      SourceBrightness.createBrightness(BrightnessDistribution.POINT);

    ss.addBrightness(sb);

    if      (bandId.equals("X"))  sb.setValidFrequency(X_BAND_RANGE.clone());
    else if (bandId.equals("S"))  sb.setValidFrequency(S_BAND_RANGE.clone());

    sb.setPolarization(StokesParameter.I);

    //TODO Set valid time?

    if (!noTotal)
      setFlux(sb, textTotal, bandId, 10);

    if (!noUnres)
      setFlux(sb, textUnres, bandId, 11);
  }

  /**
   * Stores {@code fluxText} in {@code sb} as the total flux density
   * (column 10) or the peak flux density (column 11).  Other columns are
   * ignored.
   */
  private void setFlux(DescriptiveBrightness sb,
                       String fluxText, String bandId, int col)
  {
    if (col == 10)
      sb.setTotalFluxDensity(new FluxDensity(fluxText));
    else if (col == 11)
      sb.setPeakFluxDensity(new FluxDensity(fluxText));
  }

  /**
   * "CLEAN Map" & "Rad Plot" (columns 12, 13).  Adds an image link to the
   * source being parsed for each column that is not "n/a" (or missing).
   */
  private void parseImage(String bandId, Map<String, String> map)
  {
    String textClean = columnText(map, "12");
    String textRad   = columnText(map, "13");

    boolean noClean = textClean.equalsIgnoreCase(NOT_AVAILABLE);
    boolean noRad   = textRad.equalsIgnoreCase(NOT_AVAILABLE);

    //Quick exit if no images
    if (noClean && noRad)
      return;

    if (!noClean)
      setImage(textClean, bandId, 12);

    if (!noRad)
      setImage(textRad, bandId, 13);
  }

  //TODO Talk to BT re: HTML in comments
  //Indexed by (column - COMMENT_COL_OFFSET): column 12, then column 13
  private static final String[] IMAGE_COMMENTS =
  {
    "Naturally weighted CLEAN image" /* (" +
    "<a href=\"http://vlbi.gsfc.nasa.gov/vcs/misc/image_ps_explain.html\">details</a>)." */
    ,
    "Correlated flux density vs. projected baseline length" /* (" +
    "<a href=\"http://vlbi.gsfc.nasa.gov/vcs/misc/rad_plot_explain.html\">details</a>)." */
  };

  private static final int COMMENT_COL_OFFSET = 12;

  /**
   * Builds an image link from {@code anchorText} and adds it to the source
   * being parsed.  Any failure (for example an unusable HREF) results in
   * no link being added.
   */
  private void setImage(String anchorText, String bandId, int col)
  {
    String gHz = bandId.equals("X") ? "8.4" : "2.2";

    HtmlAnchor anchor = HtmlAnchor.parse(anchorText);

    SourceImageLink image = new SourceImageLink();
    image.setDisplayName(anchor.getDisplayText());
    try
    {
      image.setImageLocation(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
      //NOTE(review): no units are passed here; presumably Frequency(String)
      //defaults to GHz -- confirm against the Frequency class.
      image.setFrequency(new Frequency(gHz));
      image.setPolarization(StokesParameter.I);
      image.setComments(IMAGE_COMMENTS[col-COMMENT_COL_OFFSET]);

      sourceBeingParsed.getImageLinks().add(image);
    }
    catch (Exception ex)
    {
      //simply not adding a link
    }
  }

  /**
   * "CLEAN Map Fits" (column 14).  Adds a CLEAN-components-file brightness
   * to {@code ss} unless the column is "n/a" (or missing) or its HREF is
   * unusable.
   */
  private void parseClean(String bandId, Map<String, String> map, Subsource ss)
  {
    String anchorText = columnText(map, "14");

    if (anchorText.equalsIgnoreCase(NOT_AVAILABLE))
      return;

    HtmlAnchor anchor = HtmlAnchor.parse(anchorText);
    CleanFileBrightness cfb = (CleanFileBrightness)
      SourceBrightness.createBrightness(BrightnessDistribution.CLEAN_COMPONENTS_FILE);
    try
    {
      cfb.setBrightnessFile(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
      ss.addBrightness(cfb);
    }
    catch (Exception ex)
    {
      //simply not adding brightness
    }
  }

  /**
   * "UV Data Fits" (column 15).  Adds a FITS-file brightness to {@code ss}
   * unless the column is "n/a" (or missing) or its HREF is unusable.
   */
  private void parseFits(String bandId, Map<String, String> map, Subsource ss)
  {
    String anchorText = columnText(map, "15");

    if (anchorText.equalsIgnoreCase(NOT_AVAILABLE))
      return;

    HtmlAnchor anchor = HtmlAnchor.parse(anchorText);
    FitsFileBrightness ffb = (FitsFileBrightness)
      SourceBrightness.createBrightness(BrightnessDistribution.FITS_FILE);
    try
    {
      ffb.setBrightnessFile(new URL(anchor.getAttributeValue(HTML.Attribute.HREF)));
      ss.addBrightness(ffb);
    }
    catch (Exception ex)
    {
      //simply not adding brightness
    }
  }
}