Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.HTMLNode;
004import Torello.HTML.PageStats;
005import java.util.Vector;
006import java.io.Serializable;
007import java.net.URL;
008
009/**
010 * When a news article is downloaded from a {@code URL}, its contents are parsed, and the
011 * information-HTML is stored in this class.
012 * 
013 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
014 */
015public class Article implements Serializable
016{
017    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
018    protected static final long serialVersionUID = 1;
019
020    /**
021     * This should inform the user that an error occurred when downloading an article. If this
022     * field,  after instantiation is {@code TRUE}, all other fields in this class should be thought
023     * of as "irrelevant."
024     */
025    public final boolean wasErrorDownload;
026
027    /** This is the article's URL from the news website. */
028    public final URL url;
029
030    /**
031     * This is the title that was scraped from the main page.  The title is the content of the
032     * {@code <TITLE>...</TITLE>} element on the article HTML page.
033     */
034    public final String titleElement;
035
036    /**
037     * This is the original, and complete, HTML vectorized-page download.  It contains the
038     * original, un-modified, article download.
039     */
040    public final Vector<HTMLNode> originalPage;
041
042    /**
043     * This is the pared down article-body.  It is what is retrieved from {@code class ArticleGet}
044     */
045    public final Vector<HTMLNode> articleBody;
046
047    /**
048     * The image-URL's that were found in the news-article.  The easiest way to think about this
049     * field is that the following instructions were called on the article-body after downloading
050     * the article:
051     * 
052     * <BR /><BR /><DIV CLASS="SNIP">{@code
053     * Vector<TagNode> imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
054     * Vector<URL>     imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
055     * 
056     * // The results of the above call are stored in this field / Vector<URL>.
057     * }</DIV>
058     */
059    public final Vector<URL> imageURLs;
060
061    /**
062     * This list contains the "Image Positions" inside the vectorized-article for each image that
063     * was found inside the article.  The easiest way to think about this field is that the
064     * following instructions were called on the article-body after downloading that article:
065     * 
066     * <BR /><BR /><DIV CLASS="SNIP">{@code
067     *  int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
068     * }</DIV>
069     */
070    public final int[] imagePosArr;
071
072    /**
073     * This contains an instance of {@code class PageStats} that has been generated out of an
074     * original Newspaper Article Page.
075     * 
076     * <DIV CLASS="LOC">{@code 
077     * this.originalPageStats = new PageStats(originalPage);
078     * }</DIV>
079     */
080    public final PageStats originalPageStats;
081
082    /**
083     * This contains an instance of {@code class PageStats} that has been generated from the
084     *  post-processed Newspaper Article.
085     * <DIV CLASS="LOC">{@code 
086     * this.processedArticleStats = new PageStats(articleBody);
087     * }</DIV>
088     */
089    public final PageStats processedArticleStats;
090
091
092    /**
093     * Builds an instance of this class.
094     * 
095     * @param url The web-address from whence this news-article was downloaded / retrieved.
096     * @param titleElement The contents of the HTML {@code <TITLE>} tag, as a {@code String}.
097     * @param originalPage Vectorized-HTML of the original article web-page, in its entirety.
098     * @param articleBody Vectorized-HTML of the body of the article's page, as extracted by the
099     * {@code ArticleGet} function-pointer.
100     * @param imageURLs A list of all HTML {@code <IMG>} elements found inside the
101     * {@code 'articleBody'}
102     * @param imagePosArr The {@code Vector}-indices where the images (if any) were found in the
103     * article.
104     */
105    public Article(
106            URL                 url,
107            String              titleElement,
108            Vector<HTMLNode>    originalPage,
109            Vector<HTMLNode>    articleBody,
110            Vector<URL>         imageURLs,
111            int[]               imagePosArr
112        )
113    {
114        this.wasErrorDownload       = false;
115        this.url                    = url;
116        this.titleElement           = titleElement;
117        this.originalPage           = originalPage;
118        this.articleBody            = articleBody;
119        this.imageURLs              = imageURLs;
120        this.imagePosArr            = imagePosArr;
121        this.originalPageStats      = (originalPage == null) ? null : new PageStats(originalPage);
122        this.processedArticleStats  = new PageStats(articleBody);
123    }
124}