Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.Java.*;
004
005import Torello.HTML.URLFilter;
006import Torello.Java.Additional.RemoveUnsupportedIterator;
007import Torello.Languages.LC;
008
009import java.util.*;
010import java.net.*;
011
012/**
013 * The 'data flow' encapsulation class that contains most of the salient features of a news
014 * oriented web-site.
015 * 
016 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITE>
017 */
018public class NewsSite implements java.io.Serializable
019{
020    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
021    public static final long serialVersionUID = 1;
022
023    /** A Simple Name for the news-site */
024    public final String siteName;
025
026    /** Country of origin for the news-site in question */
027    public final Country country;
028
029    /** {@code URL} of the main-page for the news web-site */
030    public final URL siteURL;
031
032    /** A Language Code instance for the web-site, if needed. */
033    public final LC languageCode;
034
035    /** A simple text description of the news web-site */
036    public final String description;
037
038    /**
039     * Should contain the complete list of news-section {@code URL's}
040     * @see ScrapeURLs#get(Vector, URLFilter, LinksGet, Appendable)
041     */
042    private final Vector<URL> sectionURLs;
043
044    /**
045     * An instance of {@link URLFilter} for filtering out certain {@code URL's} from the list of
046     * scraped article-{@code URL's}.
047     * @see ScrapeURLs
048     */
049    public final URLFilter filter;
050
051    /**
052     * An instance of {@code LinksGet} for retrieving Article-{@code URL} links from a section
053     * page
054     * @see ScrapeURLs
055     */
056    public final LinksGet linksGetter;
057
058    /**
059     * An instance of {@code ArticleGet} used to retrieve news-articles from this site.
060     * @see ScrapeArticles
061     */
062    public final ArticleGet articleGetter;
063
064    /**
065     * An instance of {@code StrFilter} for finding banner's or ad's
066     * @see ScrapeArticles
067     */
068    public final StrFilter bannerAndAddFinder;
069
070    /**
071     * Convenience Constructor.
072     * <BR />May pass a {@link StrFilter} to the {@link URLFilter} parameter instead.
073     * <BR />Invokes: {@link #NewsSite(String, Country, String, LC, String, Vector, URLFilter,
074     *  LinksGet, ArticleGet, StrFilter)}
075     */
076    public NewsSite(
077            String      siteName,
078            Country     country,
079            String      siteURLAsStr,
080            LC          languageCode,
081            String      description,
082            Vector<URL> sectionURLs,
083            StrFilter   filter,
084            LinksGet    linksGetter,
085            ArticleGet  articleGetter,
086            StrFilter   bannerAndAddFinder
087        )
088    {
089        this(
090            siteName, country, siteURLAsStr, languageCode, description, sectionURLs,
091            URLFilter.fromStrFilter(filter), linksGetter, articleGetter, bannerAndAddFinder
092        );
093    }
094
095    /**
096     * Simple constructor for this data-class.
097     * 
098     * @param siteName This site's name
099     * @param country The <I>country-of-origin</I> for this news web-site.
100     * @param siteURLAsStr The primary {@code URL} for the news web-site.
101     * 
102     * @param languageCode If this site uses a non-English system, the {@code 'languageCode'}
103     * parameter can keep track of the language.
104     * 
105     * @param description Brief Description of the site.
106     * 
107     * @param sectionURLs This should list the primary news-sections on the web-site.  News
108     * sections include lists such as "Life", "Health", "Business", "World News", "Sports" - but
109     * this list could actually include just about anything.
110     * 
111     * @param filter If, when scraping a section, there are {@code URL's} that need to be filtered,
112     * this parameter can help filtering non-Article, non-news links.  As explained in the
113     * {@code class ScrapeURL's}, this is often a simple one-lined lambda-expression that
114     * identifies which {@code URL's} match a Regular-Expression {@code Pattern}.
115     * 
116     * @param linksGetter This is a 'getter', which also is often just a one line
117     * regular-expression lambda for retrieving the links from a section web-page.
118     * 
119     * @param articleGetter This should implement the {@code ArticleGet} interface.
120     * @param bannerAndAddFinder Filter for finding repetitive ads or banners.
121     */
122    @SuppressWarnings("unchecked")
123    public NewsSite(
124            String      siteName,
125            Country     country,
126            String      siteURLAsStr,
127            LC          languageCode,
128            String      description,
129            Vector<URL> sectionURLs,
130            URLFilter   filter,
131            LinksGet    linksGetter,
132            ArticleGet  articleGetter,
133            StrFilter   bannerAndAddFinder
134        )
135    {
136        this.siteName           = siteName;
137        this.country            = country;
138        this.languageCode       = languageCode;
139        this.description        = description;
140        this.sectionURLs        = (Vector<URL>) sectionURLs.clone();
141        this.filter             = filter;
142        this.linksGetter        = linksGetter;
143        this.articleGetter      = articleGetter;
144        this.bannerAndAddFinder = bannerAndAddFinder;
145
146        try
147            { this.siteURL = new URL(siteURLAsStr); }
148
149        catch (MalformedURLException e)
150        {
151            throw new NewsSiteException(
152                "Unable to instantiate the parameter 'siteURLAsStr'.  There was a Malformed URL " +
153                "Exception thrown.  Please see this Exceptions Throwable.getCause() for more " +
154                "details.", e
155            );
156        }
157    }
158
159    /**
160     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
161     * news-site
162     * 
163     * @return An {@code Iterator<URL>} of the different sections for a particular news-site.
164     */
165    public Iterator<URL> sectionURLsIter()
166    { return new RemoveUnsupportedIterator<URL>(sectionURLs.iterator()); }
167
168    /**
169     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
170     * news-site
171     * 
172     * @return A {@code Vector<URL>} of the different sections for a particular news-site.
173     */
174    @SuppressWarnings("unchecked")
175    public Vector<URL> sectionURLsVec()
176    { return (Vector<URL>) sectionURLs.clone(); }
177}