package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.PageStats;

import java.util.Vector;
import java.io.Serializable;
import java.net.URL;

/**
 * Immutable holder for the results of downloading and parsing a single news article.  Once an
 * article has been fetched from its {@code URL}, the parsed HTML and the data derived from it
 * are kept in the public, final fields of this class.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
 */
public class Article implements Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1L;

    /**
     * Signals that the article download failed.  When this field is {@code TRUE}, every other
     * field of the instance should be regarded as "irrelevant."
     *
     * <BR /><BR />The public constructor of this class always assigns {@code FALSE} here.
     */
    public final boolean wasErrorDownload;

    /** The address of the article on the news website. */
    public final URL url;

    /**
     * The article's title, i.e. the text found between {@code <TITLE>...</TITLE>} on the
     * article's HTML page.
     */
    public final String titleElement;

    /**
     * The complete, vectorized HTML of the downloaded page, exactly as it was retrieved and
     * before any post-processing.
     */
    public final Vector<HTMLNode> originalPage;

    /**
     * The trimmed-down body of the article.  This is the value produced by
     * {@code class ArticleGet}.
     */
    public final Vector<HTMLNode> articleBody;

    /**
     * The {@code URL's} of the images located in the news-article.  Conceptually, this is the
     * outcome of running the statements below against the article-body once the download has
     * finished:
     *
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * Vector<TagNode>  imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
     * Vector<URL>      imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
     *
     * // The results of the above call are stored in this field / Vector<URL>.
     * }</DIV>
     */
    public final Vector<URL> imageURLs;

    /**
     * The "Image Positions": for each image found in the article, its index within the
     * vectorized-article.  Conceptually, this is the outcome of running the statement below
     * against the article-body once the download has finished:
     *
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     * }</DIV>
     */
    public final int[] imagePosArr;

    /**
     * A {@code PageStats} instance computed from the original Newspaper Article Page.  This
     * field is {@code null} whenever {@link #originalPage} is {@code null}.
     *
     * <DIV CLASS="LOC">{@code
     * this.originalPageStats = new PageStats(originalPage);
     * }</DIV>
     */
    public final PageStats originalPageStats;

    /**
     * A {@code PageStats} instance computed from the post-processed Newspaper Article.
     *
     * <DIV CLASS="LOC">{@code
     * this.processedArticleStats = new PageStats(articleBody);
     * }</DIV>
     */
    public final PageStats processedArticleStats;

    /**
     * Creates an {@code Article} whose {@link #wasErrorDownload} flag is {@code FALSE}.  The
     * references passed in are stored as-is; no copies are made.
     *
     * @param url The web-address from which this news-article was retrieved.
     * @param titleElement The text of the HTML {@code <TITLE>} tag, as a {@code String}.
     * @param originalPage Vectorized-HTML of the entire, original article web-page.  May be
     * {@code null}, in which case {@link #originalPageStats} is also {@code null}.
     * @param articleBody Vectorized-HTML of the article's body, as extracted by the
     * {@code ArticleGet} function-pointer.  It is handed directly to the {@code PageStats}
     * constructor, without a {@code null}-check.
     * @param imageURLs The {@code URL's} of the HTML {@code <IMG>} elements found inside the
     * {@code 'articleBody'}
     * @param imagePosArr The {@code Vector}-indices at which the images (if any) were found in
     * the article.
     */
    public Article(
            URL url, String titleElement, Vector<HTMLNode> originalPage,
            Vector<HTMLNode> articleBody, Vector<URL> imageURLs, int[] imagePosArr
        )
    {
        // Plain reference hand-offs: the instance simply records what it was given.
        this.url            = url;
        this.titleElement   = titleElement;
        this.originalPage   = originalPage;
        this.articleBody    = articleBody;
        this.imageURLs      = imageURLs;
        this.imagePosArr    = imagePosArr;

        // An instance built through this constructor never represents a failed download.
        this.wasErrorDownload = false;

        // Statistics for the original page are only computed when a page was supplied.
        if (originalPage == null)
        {
            this.originalPageStats = null;
        }
        else
        {
            this.originalPageStats = new PageStats(originalPage);
        }

        // Statistics for the processed body are always computed from 'articleBody'.
        this.processedArticleStats = new PageStats(articleBody);
    }
}