1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.PageStats;
import java.util.Vector;
import java.io.Serializable;
import java.net.URL;

/**
 * Holds the result of downloading and parsing one news article.  After an article is retrieved
 * from its {@code URL}, the parsed information-HTML is kept in the fields of this class.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
 */
public class Article implements Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
    protected static final long serialVersionUID = 1;

    /**
     * Flag telling the user whether an error occurred while downloading the article.  When this
     * field is {@code TRUE} after instantiation, every other field of this class should be
     * regarded as "irrelevant."
     *
     * <BR /><BR />Note that the constructor declared in this class always assigns {@code FALSE}
     * to this field.
     */
    public final boolean wasErrorDownload;

    /** The {@code URL} of this article on the news website. */
    public final URL url;

    /**
     * The title that was scraped from the main page; that is, the text found inside the
     * {@code <TITLE>...</TITLE>} element of the article's HTML page.
     */
    public final String titleElement;

    /**
     * The complete vectorized-HTML page exactly as it was downloaded - the original article
     * download, with no modifications.
     */
    public final Vector<HTMLNode> originalPage;

    /**
     * The pared down body of the article.  This is what {@code class ArticleGet} retrieves.
     */
    public final Vector<HTMLNode> articleBody;

    /**
     * The image-URL's found in the news-article.  A simple way to picture this field is to
     * imagine that the instructions below were run on the article-body once the article had
     * been downloaded:
     * 
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * Vector<TagNode> imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
     * Vector<URL>     imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
     * 
     * // The results of the above call are stored in this field / Vector<URL>.
     * }</DIV>
     */
    public final Vector<URL> imageURLs;

    /**
     * The "Image Positions" inside the vectorized-article, one for each image found in the
     * article.  A simple way to picture this field is to imagine that the instruction below was
     * run on the article-body once the article had been downloaded:
     * 
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     *  int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     * }</DIV>
     */
    public final int[] imagePosArr;

    /**
     * An instance of {@code class PageStats} built from the original Newspaper Article Page.
     * This field is {@code null} whenever {@link #originalPage} is {@code null}.
     * 
     * <DIV CLASS="LOC">{@code 
     * this.originalPageStats = new PageStats(originalPage);
     * }</DIV>
     */
    public final PageStats originalPageStats;

    /**
     * An instance of {@code class PageStats} built from the post-processed Newspaper Article.
     *
     * <DIV CLASS="LOC">{@code 
     * this.processedArticleStats = new PageStats(articleBody);
     * }</DIV>
     */
    public final PageStats processedArticleStats;


    /**
     * Creates an {@code Article} for a download that is recorded as having succeeded:
     * {@link #wasErrorDownload} is always set to {@code FALSE} by this constructor.  Every
     * argument is stored by reference; no copies are made.
     * 
     * @param url The web-address from which this news-article was downloaded / retrieved.
     * @param titleElement The contents of the HTML {@code <TITLE>} tag, as a {@code String}.
     * @param originalPage Vectorized-HTML of the original article web-page, in its entirety.
     * When {@code null} is passed, {@link #originalPageStats} is also left {@code null}.
     * @param articleBody Vectorized-HTML of the body of the article's page, as extracted by the
     * {@code ArticleGet} function-pointer.
     * @param imageURLs The list of image {@code URL's} for the HTML {@code <IMG>} elements found
     * inside the {@code 'articleBody'}
     * @param imagePosArr The {@code Vector}-indices where the images (if any) were found in the
     * article.
     */
    public Article(
        URL url, String titleElement, Vector<HTMLNode> originalPage,
        Vector<HTMLNode> articleBody, Vector<URL> imageURLs, int[] imagePosArr
    )
    {
        // This constructor never marks the article as a failed download.
        this.wasErrorDownload = false;

        this.url = url;
        this.titleElement = titleElement;
        this.originalPage = originalPage;
        this.articleBody = articleBody;
        this.imageURLs = imageURLs;
        this.imagePosArr = imagePosArr;

        // Statistics for the original page are only computed when that page was provided.
        if (originalPage != null)
        {
            this.originalPageStats = new PageStats(originalPage);
        }
        else
        {
            this.originalPageStats = null;
        }

        // NOTE(review): unlike 'originalPage', 'articleBody' is handed to PageStats without a
        // null-check.  Whether PageStats accepts null is not visible here - confirm with callers.
        this.processedArticleStats = new PageStats(articleBody);
    }
}