package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.PageStats;
import java.util.Vector;
import java.io.Serializable;
import java.net.URL;

/**
 * When a news article is downloaded from a {@code URL}, its contents are parsed, and the
 * information-HTML is stored in this class.
 *
 * <BR /><BR />Every field is {@code public final} and is assigned exactly once, by the
 * constructor.  The constructor stores the references it receives as-is (no copies are made),
 * so the {@code Vector}'s and the {@code int[]} remain shared with the caller.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
 */
public class Article implements Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;

    /**
     * This should inform the user that an error occurred when downloading an article.  If this
     * field, after instantiation, is {@code TRUE}, all other fields in this class should be
     * thought of as "irrelevant."
     *
     * <BR /><BR />The constructor declared in this class always assigns {@code FALSE} to this
     * field.
     */
    public final boolean wasErrorDownload;

    /** This is the article's URL from the news website. */
    public final URL url;

    /**
     * This is the title that was scraped from the main page.  The title is the content of the
     * {@code <TITLE>...</TITLE>} element on the article HTML page.
     */
    public final String titleElement;

    /**
     * This is the original, and complete, HTML vectorized-page download.  It contains the
     * original, un-modified, article download.
     */
    public final Vector<HTMLNode> originalPage;

    /**
     * This is the pared down article-body.  It is what is retrieved from
     * {@code class ArticleGet}.
     */
    public final Vector<HTMLNode> articleBody;

    /**
     * The image-URL's that were found in the news-article.  The easiest way to think about this
     * field is that the following instructions were called on the article-body after downloading
     * the article:
     *
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * Vector<TagNode>  imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
     * Vector<URL>      imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
     *
     * // The results of the above call are stored in this field / Vector<URL>.
     * }</DIV>
     */
    public final Vector<URL> imageURLs;

    /**
     * This list contains the "Image Positions" inside the vectorized-article for each image that
     * was found inside the article.  The easiest way to think about this field is that the
     * following instructions were called on the article-body after downloading that article:
     *
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     * }</DIV>
     */
    public final int[] imagePosArr;

    /**
     * This contains an instance of {@code class PageStats} that has been generated out of an
     * original Newspaper Article Page.  This field is {@code null} whenever
     * {@link #originalPage} is {@code null}.
     *
     * <DIV CLASS="LOC">{@code
     * this.originalPageStats = new PageStats(originalPage);
     * }</DIV>
     */
    public final PageStats originalPageStats;

    /**
     * This contains an instance of {@code class PageStats} that has been generated from the
     * post-processed Newspaper Article.  This field is {@code null} whenever
     * {@link #articleBody} is {@code null}.
     *
     * <DIV CLASS="LOC">{@code
     * this.processedArticleStats = new PageStats(articleBody);
     * }</DIV>
     */
    public final PageStats processedArticleStats;


    /**
     * Builds an instance of this class.  Field {@link #wasErrorDownload} is always set to
     * {@code FALSE} by this constructor.
     *
     * @param url The web-address from whence this news-article was downloaded / retrieved.
     *
     * @param titleElement The contents of the HTML {@code <TITLE>} tag, as a {@code String}.
     *
     * @param originalPage Vectorized-HTML of the original article web-page, in its entirety.
     * This parameter may be {@code null}, in which case {@link #originalPageStats} will also be
     * {@code null}.
     *
     * @param articleBody Vectorized-HTML of the body of the article's page, as extracted by the
     * {@code ArticleGet} function-pointer.  This parameter may be {@code null}, in which case
     * {@link #processedArticleStats} will also be {@code null}.
     *
     * @param imageURLs A list of the {@code URL's} of the HTML {@code <IMG>} elements found
     * inside the {@code 'articleBody'}
     *
     * @param imagePosArr The {@code Vector}-indices where the images (if any) were found in the
     * article.
     */
    public Article(
            URL url,
            String titleElement,
            Vector<HTMLNode> originalPage,
            Vector<HTMLNode> articleBody,
            Vector<URL> imageURLs,
            int[] imagePosArr
        )
    {
        this.wasErrorDownload   = false;
        this.url                = url;
        this.titleElement       = titleElement;
        this.originalPage       = originalPage;
        this.articleBody        = articleBody;
        this.imageURLs          = imageURLs;
        this.imagePosArr        = imagePosArr;

        // Statistics are only computed for a page that is actually present.  Both Vector's are
        // guarded identically, so a missing page yields a null stats-instance.
        this.originalPageStats =
            (originalPage == null) ? null : new PageStats(originalPage);

        this.processedArticleStats =
            (articleBody == null) ? null : new PageStats(articleBody);
    }
}