// NewsSite.java.html

package Torello.HTML.Tools.NewsSite;

import Torello.Java.*;

import Torello.HTML.URLFilter;
import Torello.Java.Additional.RemoveUnsupportedIterator;
import Torello.Languages.LC;

import java.util.*;
import java.net.*;

/**
 * The 'data flow' encapsulation class that contains most of the salient features of a news
 * oriented web-site.
 * 
 * <BR /><BR />All fields are {@code final}.  The list of section-{@code URL's} is copied on the
 * way in (constructor) and on the way out ({@link #sectionURLsVec()}), so the internally stored
 * {@code Vector} is never shared with a caller.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITE>
 */
public class NewsSite implements java.io.Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
    public static final long serialVersionUID = 1;

    /** A simple name for the news-site */
    public final String siteName;

    /** Country of origin for the news-site in question */
    public final Country country;

    /** {@code URL} of the main-page for the news web-site */
    public final URL siteURL;

    /** A Language Code instance for the web-site, if needed. */
    public final LC languageCode;

    /** A simple text description of the news web-site */
    public final String description;

    /**
     * Should contain the complete list of news-section {@code URL's}.  This is a private copy
     * of the {@code Vector} that was passed to the constructor; it is only handed out through
     * {@link #sectionURLsIter()} and {@link #sectionURLsVec()}.
     * 
     * @see ScrapeURLs#get(Vector, URLFilter, LinksGet, Appendable)
     */
    private final Vector<URL> sectionURLs;

    /**
     * An instance of {@link URLFilter} for filtering out certain {@code URL's} from the list of
     * scraped article-{@code URL's}.
     * @see ScrapeURLs
     */
    public final URLFilter filter;

    /**
     * An instance of {@code LinksGet} for retrieving Article-{@code URL} links from a section
     * page
     * @see ScrapeURLs
     */
    public final LinksGet linksGetter;

    /**
     * An instance of {@code ArticleGet} used to retrieve news-articles from this site.
     * @see ScrapeArticles
     */
    public final ArticleGet articleGetter;

    /**
     * An instance of {@code StrFilter} for finding banners or ads
     * @see ScrapeArticles
     */
    public final StrFilter bannerAndAddFinder;

    /**
     * Convenience Constructor.
     * <BR />May pass a {@link StrFilter} to the {@link URLFilter} parameter instead.
     * <BR />Invokes: {@link #NewsSite(String, Country, String, LC, String, Vector, URLFilter,
     *  LinksGet, ArticleGet, StrFilter)}
     * 
     * @param siteName This site's name
     * @param country The <I>country-of-origin</I> for this news web-site.
     * @param siteURLAsStr The primary {@code URL} for the news web-site.
     * @param languageCode A Language Code instance for the web-site, if needed.
     * @param description Brief Description of the site.
     * @param sectionURLs This should list the primary news-sections on the web-site.
     * 
     * @param filter A {@code StrFilter} which is converted using
     * {@code URLFilter.fromStrFilter(filter)} before being passed to the primary constructor.
     * 
     * @param linksGetter A 'getter' for retrieving the links from a section web-page.
     * @param articleGetter This should implement the {@code ArticleGet} interface.
     * @param bannerAndAddFinder Filter for finding repetitive ads or banners.
     * 
     * @throws NewsSiteException If {@code 'siteURLAsStr'} is not a well-formed {@code URL}.
     * @throws NullPointerException If {@code 'sectionURLs'} is null.
     */
    public NewsSite(
            String      siteName,
            Country     country,
            String      siteURLAsStr,
            LC          languageCode,
            String      description,
            Vector<URL> sectionURLs,
            StrFilter   filter,
            LinksGet    linksGetter,
            ArticleGet  articleGetter,
            StrFilter   bannerAndAddFinder
        )
    {
        this(
            siteName, country, siteURLAsStr, languageCode, description, sectionURLs,
            URLFilter.fromStrFilter(filter), linksGetter, articleGetter, bannerAndAddFinder
        );
    }

    /**
     * Simple constructor for this data-class.
     * 
     * @param siteName This site's name
     * @param country The <I>country-of-origin</I> for this news web-site.
     * @param siteURLAsStr The primary {@code URL} for the news web-site.
     * 
     * @param languageCode If this site uses a non-English system, the {@code 'languageCode'}
     * parameter can keep track of the language.
     * 
     * @param description Brief Description of the site.
     * 
     * @param sectionURLs This should list the primary news-sections on the web-site.  News
     * sections include lists such as "Life", "Health", "Business", "World News", "Sports" - but
     * this list could actually include just about anything.  The contents are copied; later
     * changes to the passed {@code Vector} are not seen by this instance.
     * 
     * @param filter If, when scraping a section, there are {@code URL's} that need to be filtered,
     * this parameter can help filtering non-Article, non-news links.  As explained in the
     * {@code ScrapeURLs} class, this is often a simple one-line lambda-expression that
     * identifies which {@code URL's} match a Regular-Expression {@code Pattern}.
     * 
     * @param linksGetter This is a 'getter', which also is often just a one line
     * regular-expression lambda for retrieving the links from a section web-page.
     * 
     * @param articleGetter This should implement the {@code ArticleGet} interface.
     * @param bannerAndAddFinder Filter for finding repetitive ads or banners.
     * 
     * @throws NewsSiteException If {@code 'siteURLAsStr'} is not a well-formed {@code URL}.  The
     * original {@code MalformedURLException} is available through {@code getCause()}.
     * 
     * @throws NullPointerException If {@code 'sectionURLs'} is null.
     */
    public NewsSite(
            String      siteName,
            Country     country,
            String      siteURLAsStr,
            LC          languageCode,
            String      description,
            Vector<URL> sectionURLs,
            URLFilter   filter,
            LinksGet    linksGetter,
            ArticleGet  articleGetter,
            StrFilter   bannerAndAddFinder
        )
    {
        this.siteName           = siteName;
        this.country            = country;
        this.languageCode       = languageCode;
        this.description        = description;

        // Copy-constructor rather than clone(): clone() is dispatched on the run-time class of
        // the argument, so a Vector sub-class would get to manufacture the "copy".  The
        // copy-constructor always produces a plain java.util.Vector, and needs no unchecked cast.
        this.sectionURLs = new Vector<>(Objects.requireNonNull(sectionURLs, "sectionURLs"));

        this.filter             = filter;
        this.linksGetter        = linksGetter;
        this.articleGetter      = articleGetter;
        this.bannerAndAddFinder = bannerAndAddFinder;

        try
            { this.siteURL = new URL(siteURLAsStr); }

        catch (MalformedURLException e)
        {
            // Re-thrown as the package's own exception type, keeping 'e' as the cause.
            throw new NewsSiteException(
                "Unable to instantiate the parameter 'siteURLAsStr'.  There was a Malformed URL " +
                "Exception thrown.  Please see this Exceptions Throwable.getCause() for more " +
                "details.", e
            );
        }
    }

    /**
     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
     * news-site
     * 
     * @return An {@code Iterator<URL>} of the different sections for a particular news-site.
     * The iterator over the internal list is wrapped in a {@code RemoveUnsupportedIterator}
     * (presumably so that {@code remove()} cannot be used to alter the internal list - see that
     * class for its exact contract).
     */
    public Iterator<URL> sectionURLsIter()
    { return new RemoveUnsupportedIterator<URL>(sectionURLs.iterator()); }

    /**
     * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this
     * news-site
     * 
     * @return A {@code Vector<URL>} of the different sections for a particular news-site.  This
     * is a fresh copy on every call; modifying it does not affect this {@code NewsSite}.
     */
    public Vector<URL> sectionURLsVec()
    { return new Vector<>(sectionURLs); }
}