1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
package Torello.HTML.Tools.NewsSite;

import Torello.Java.*;

import Torello.HTML.URLFilter;
import Torello.Java.Additional.RemoveUnsupportedIterator;
import Torello.Languages.LC;

import java.util.*;
import java.net.*;

/**
 * The 'data flow' encapsulation class that contains most of the salient features of a news
 * oriented web-site.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITE>
 * 
 * <BR /><BR />All fields of this class are declared {@code final}.  The list of section
 * {@code URL's} is copied when an instance is built, and copied again whenever it is requested
 * as a {@code Vector}; the internal list itself is never returned to a caller.
 */
public class NewsSite implements java.io.Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
    public static final long serialVersionUID = 1;

    /** A Simple Name for the news-site */
    public final String siteName;

    /** Country of origin for the news-site in question */
    public final Country country;

    /**
     * {@code URL} of the main-page for the news web-site.  This is built by the constructor from
     * the {@code String} parameter {@code 'siteURLAsStr'}.
     */
    public final URL siteURL;

    /** A Language Code instance for the web-site, if needed. */
    public final LC languageCode;

    /** A simple text description of the news web-site */
    public final String description;

    /**
     * Should contain the complete list of news-section {@code URL's}.  This is a private copy of
     * the {@code Vector} that was passed to the constructor.
     * 
     * @see ScrapeURLs#get(Vector, URLFilter, LinksGet, Appendable)
     * @see #sectionURLsIter()
     * @see #sectionURLsVec()
     */
    private final Vector<URL> sectionURLs;

    /**
     * An instance of {@link URLFilter} for filtering out certain {@code URL's} from the list of
     * scraped article-{@code URL's}.
     * @see ScrapeURLs
     */
    public final URLFilter filter;

    /**
     * An instance of {@code LinksGet} for retrieving Article-{@code URL} links from a section
     * page
     * @see ScrapeURLs
     */
    public final LinksGet linksGetter;

    /**
     * An instance of {@code ArticleGet} used to retrieve news-articles from this site.
     * @see ScrapeArticles
     */
    public final ArticleGet articleGetter;

    /**
     * An instance of {@code StrFilter} for finding banner's or ad's
     * @see ScrapeArticles
     */
    public final StrFilter bannerAndAddFinder;

    /**
     * Convenience Constructor.
     * <BR />May pass a {@link StrFilter} to the {@link URLFilter} parameter instead.
     * <BR />The {@code StrFilter} is converted using {@code URLFilter.fromStrFilter(filter)}, and
     * all other parameters are forwarded unchanged.
     * <BR />Invokes: {@link #NewsSite(String, Country, String, LC, String, Vector, URLFilter,
     *  LinksGet, ArticleGet, StrFilter)}
     * 
     * @throws NewsSiteException If {@code 'siteURLAsStr'} cannot be converted into a {@code URL}.
     * @throws NullPointerException If {@code 'sectionURLs'} is null.
     */
    public NewsSite(
            String      siteName,
            Country     country,
            String      siteURLAsStr,
            LC          languageCode,
            String      description,
            Vector<URL> sectionURLs,
            StrFilter   filter,
            LinksGet    linksGetter,
            ArticleGet  articleGetter,
            StrFilter   bannerAndAddFinder
        )
    {
        this(
            siteName, country, siteURLAsStr, languageCode, description, sectionURLs,
            URLFilter.fromStrFilter(filter), linksGetter, articleGetter, bannerAndAddFinder
        );
    }

    /**
     * Simple constructor for this data-class.
     * 
     * @param siteName This site's name
     * @param country The <I>country-of-origin</I> for this news web-site.
     * @param siteURLAsStr The primary {@code URL} for the news web-site.
     * 
     * @param languageCode If this site uses a non-English system, the {@code 'languageCode'}
     * parameter can keep track of the language.
     * 
     * @param description Brief Description of the site.
     * 
     * @param sectionURLs This should list the primary news-sections on the web-site.  News
     * sections include lists such as "Life", "Health", "Business", "World News", "Sports" - but
     * this list could actually include just about anything.  The contents of this {@code Vector}
     * are copied; later changes to the caller's {@code Vector} do not affect this instance.
     * 
     * @param filter If, when scraping a section, there are {@code URL's} that need to be filtered,
     * this parameter can help filtering non-Article, non-news links.  As explained in the
     * {@code class ScrapeURL's}, this is often a simple one-lined lambda-expression that
     * identifies which {@code URL's} match a Regular-Expression {@code Pattern}.
     * 
     * @param linksGetter This is a 'getter', which also is often just a one line
     * regular-expression lambda for retrieving the links from a section web-page.
     * 
     * @param articleGetter This should implement the {@code ArticleGet} interface.
     * @param bannerAndAddFinder Filter for finding repetitive ads or banners.
     * 
     * @throws NewsSiteException If {@code 'siteURLAsStr'} cannot be converted into a {@code URL}.
     * The original {@code MalformedURLException} is available as the cause of this exception.
     * 
     * @throws NullPointerException If {@code 'sectionURLs'} is null.
     */
    public NewsSite(
            String      siteName,
            Country     country,
            String      siteURLAsStr,
            LC          languageCode,
            String      description,
            Vector<URL> sectionURLs,
            URLFilter   filter,
            LinksGet    linksGetter,
            ArticleGet  articleGetter,
            StrFilter   bannerAndAddFinder
        )
    {
        this.siteName           = siteName;
        this.country            = country;
        this.languageCode       = languageCode;
        this.description        = description;

        // Defensive copy using the copy-constructor rather than clone().  Vector may be
        // sub-classed, and an overridden clone() is not guaranteed to produce an independent
        // copy.  The copy-constructor always builds a new, plain Vector and needs no cast.
        this.sectionURLs        = new Vector<URL>(sectionURLs);

        this.filter             = filter;
        this.linksGetter        = linksGetter;
        this.articleGetter      = articleGetter;
        this.bannerAndAddFinder = bannerAndAddFinder;

        try
            { this.siteURL = new URL(siteURLAsStr); }

        catch (MalformedURLException e)
        {
            // Re-thrown as the package's own exception type, with the original kept as the cause
            throw new NewsSiteException(
                "Unable to instantiate the parameter 'siteURLAsStr'.  There was a Malformed URL " +
                "Exception thrown.  Please see this Exceptions Throwable.getCause() for more " +
                "details.", e
            );
        }
    }

    /**
     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
     * news-site
     * 
     * @return An {@code Iterator<URL>} of the different sections for a particular news-site.  The
     * iterator of the internal list is wrapped in a {@code RemoveUnsupportedIterator} before it
     * is returned.
     */
    public Iterator<URL> sectionURLsIter()
    { return new RemoveUnsupportedIterator<URL>(sectionURLs.iterator()); }

    /**
     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
     * news-site
     * 
     * @return A {@code Vector<URL>} of the different sections for a particular news-site.  This
     * is a newly created copy on every invocation, so modifying the returned {@code Vector} has
     * no effect on this instance.
     */
    public Vector<URL> sectionURLsVec()
    { return new Vector<URL>(sectionURLs); }
}