ArticleGet.java.html

package Torello.HTML.Tools.NewsSite;

import java.util.function.*;
import java.util.*;
import java.util.regex.*;

import java.net.URL;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import Torello.Java.ParallelArrayException;

/**
 * A function-pointer / lambda target for extracting an article's content from the web-page
 * from whence it was downloaded; including several {@code static}-builder methods for the
 * most common means of finding the HTML-Tags that wrap article-HTML on news-media websites.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
 */
@FunctionalInterface
public interface ArticleGet extends java.io.Serializable
{
    // Serialization version number; this interface extends java.io.Serializable.  As a field of
    // an interface, this constant is implicitly public, static and final.
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI>  */
    public static final long serialVersionUID = 1;

    // ******************************************************************************************
    // Standard Functional Interface Method
    // ******************************************************************************************

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
     *
     * <BR /><BR />This method's purpose is to take a "Scraped HTML Page" (stored as a
     * Vectorized-HTML Web-Page), and return an HTML {@code Vector} that contains only the
     * "Article Content" - <I>which is usually just called the "Article Body."</I>  Perhaps it
     * seems daunting, but <I>the usual way</I> to get the actual article-body of an HTML
     * News-Website Page is to simply identify an {@code HTML <DIV ID="..." CLASS="...">}
     * surrounding element.
     *
     * <BR /><BR />This class has <I>several different static-methods called "usual"</I> which
     * automatically create a page-getter.  The example at the top of this class should highlight
     * how this works.  Extracting news-content from a page that has already been downloaded is
     * usually trivial.  The point really becomes identifying the {@code <DIV>}'s {@code class=...}
     * or {@code id=...} attributes &amp; page-structure to find the article-body.  Generally, in
     * your browser just click {@code View Source} and look at the HTML manually to find the
     * attributes used.  Using the myriad Get methods from {@code Torello.HTML.NodeSearch} usually
     * boils down to code that looks suspiciously like Java-Script:
     *
     * <BR /><DIV CLASS="JAVASCRIPT">{@code
     *  var articleHTML = document.getElementById("article-body").innerHTML;
     *
     *  // or...
     *  var articleHTML = document.getElementsByClassName("article-body")[0].innerHTML;
     * }</DIV>
     *
     * <BR />Using the {@code NodeSearch} package, the above DOM-Tree Java-Script is easily written
     * in Java as below:
     *
     * <DIV CLASS="SNIP">{@code
     *  // For articles with HTML divider elements having an "ID" attribute to specify the article
     *  // body, get the article using the code below.  In this example, the particular newspaper
     *  // web-site has articles whose content ("Article Body") is simply wrapped in an HTML
     *  // HTML Divider Element: <DIV ID="article-body"> ... </DIV>
     * 
     *  // For extracting that content use the NodeSearch Package Class: InnerTagGetInclusive
     *
     *  Vector<HTMLNode> articleBody = InnerTagGetInclusive.first
     *      (page, "div", "id", TextComparitor.EQ_CI, "article-body");
     *
     *  // To use this NodeSearch Package Class with the NewsSite Package, simply use one of the
     *  // 'usual' methods in class ArticleGet, and the lambda Functional Interface "ArticleGet"
     *  // will be built automatically as such:
     *
     *  ArticleGet getter = ArticleGet.usual("div", "id", TextComparitor.EQ_CI, "article-body");
     *
     *  // For articles with HTML divider elements having a "CLASS" attribute to specify
     *  // the article body, get the article with the following code.  Note that in this example
     *  // the article body is wrapped in an HTML Divider Element that has the characteristics
     *  // <DIV CLASS="article-body"> ... </DIV>.  The content of a Newspaper Article can be easily
     *  // extracted with just one line of code using the methods in the NodeSearch Package as
     *  // follows:
     *
     *  Vector<HTMLNode> articleBody = InnerTagGetInclusive.first
     *      (page, "div", "class", TextComparitor.C, "article-body");
     *
     *  // which, for use with the ScrapeArticles class, should be written using the 'usual'
     *  // methods in ArticleGet as such:
     *
     *  ArticleGet getter = ArticleGet.usual(TextComparitor.C, "article-body");
     * }</DIV>
     *
     * <BR /><BR /><B>NOTE:</B> For all examples above, the text-string "article-body" will be
     * a tag-value that (was) decided/chosen by the HTML news-website, or content-website you want
     * to scrape.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red">ALSO:</SPAN></B> One might have to be careful about
     * modifying the {@code page} that is passed to this method.  Each and every one of the
     * NodeSearch classes retrieves a copy (read: <B><I>a clone</I></B>) of the input
     * {@code Vector} (other than the classes that actually use the term "remove.")  However, if
     * you were to write an Article Get lambda of your own (rather than using the "usual"
     * methods), make sure you know whether you are going to <I>intentionally</I> modify the
     * input-page, and if so, remember you have.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red">FURTHERMORE:</SPAN></B> There are many content-based
     * web-sites that have some (even "a lot") of spurious HTML information inside the primary
     * article body, even after the header &amp; footer information has been eliminated.  It may be
     * necessary to do some vector-cleaning later on.  For example: getting rid of "Post to
     * Facebook", "Post to Twitter" or "E-Mail Link" buttons.
     */
    // The single abstract method of this functional interface.  It receives the URL and the
    // vectorized-HTML of a scraped page, and returns the Vector holding the article-body; the
    // lambdas built by the static 'usual' factory methods of this interface implement it.
    // Failure is reported through the checked ArticleGetException.
    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;

    // ******************************************************************************************
    // Filter Factory / Filter-Generator  static-methods
    // ******************************************************************************************

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This builds an "Article Getter" based on a parameter-specified HTML Tag.  Two
     * or three common HTML "semantic elements" used for wrapping newspaper article-content
     * include these:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code <ARTICLE ...> article-body </ARTICLE>}</LI>
     * <LI>{@code <MAIN ...> article-body </MAIN>}</LI>
     * <LI>{@code <SECTION ...> article-body </SECTION>}</LI>
     * </UL> 
     *
     * <BR />Identifying which tag to use can be accomplished by going to the main-page of an
     * internet news web-site, selecting a news-article, and then using the {@code "View Source"}
     * or the {@code "View Page Source"} depending upon which browser you are using, and then
     * scanning the HTML to find what elements are used to wrap the article-body.
     *
     * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the
     * {@code class NewsSiteScrape}.  As long as the news or content website that you are scraping
     * has its page-body wrapped inside of the HTML element <I><B>whose tag-name is the one you
     * have uncovered by inspecting the page manually</B></I>, the {@code ArticleGet} produced by
     * this factory-method will retrieve your page content appropriately.
     *
     * @param htmlTag This should be the HTML element that is used to wrap the actual news-content
     * article-body of an HTML news web-site page.
     * 
     * @return This returns an "Article Getter" that just picks out the part of a news-website
     * article that lies between the open and closed version of the specified htmlTag.
     */
    public static ArticleGet usual(String htmlTag)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // A null tag would otherwise surface as a message-less NullPointerException thrown by the
        // toLowerCase call below.  The exception-type is unchanged; only a message is supplied.
        if (htmlTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'htmlTag', but this is not allowed here.");

        // HTML element names are ASCII identifiers.  Lower-casing with Locale.ROOT keeps the
        // result independent of the JVM's default locale.  (With a Turkish default locale, the
        // no-argument toLowerCase() turns "DIV" into "d\u0131v" - dotless i - rather than "div".)
        final String htmlTagLC = htmlTag.toLowerCase(java.util.Locale.ROOT);

        // This 'final String' is merely used for proper error reporting in any potential
        // exception-messages, nothing else.
        final String functionNameStr = "TagNodeGetInclusive.first(page, \"" + htmlTagLC + "\");";

        // Check for valid HTML Token
        HTMLTokException.check(htmlTagLC);

        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
        InclusiveException.check(htmlTagLC);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            Vector<HTMLNode> ret;

            // Any exception thrown by the search itself is wrapped, with the original exception
            // preserved as the cause.
            try
                { ret = TagNodeGetInclusive.first(page, htmlTagLC); }

            catch (Exception e)
            {
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
            }

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag was
            // not found, and produced a null news-article page, or an empty news-article page.

            if (ret == null) throw new ArticleGetException
                (ArticleGetException.RET_NULL, functionNameStr);

            if (ret.size() == 0) throw new ArticleGetException
                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);

            return ret;
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This builds an "Article Getter" for you, using the most common way to get
     * an article - specifically via the {@code HTML <DIV CLASS="...">} element and its
     * {@code CSS 'class'} selector.
     *
     * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the
     * {@code class NewsSiteScrape}.  As long as the news or content website that you are scraping
     * has its page-body wrapped inside of an {@code HTML <DIV>} element <I><B>whose
     * {@code CSS 'class'} specifier is one you have uncovered by inspecting the
     * page manually</B></I> then the {@code ArticleGet} produced by this factory-method will
     * retrieve your page content appropriately.
     * 
     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
     * TextComparitor is just a {@code String} compare function like: {@code equals, contains,
     * StrCmpr.containsIgnoreCase(...)}, etc...
     * 
     * @param cssClassCompareStrings These are the values to be used by the 
     * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"}
     * from the list of {@code DIV} elements on the page.
     * 
     * @return This returns an "Article Getter" that just picks out the part of a news-website
     * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by
     * the "CSS (Cascading Style Sheets) {@code 'class'} identifier,  and the
     * {@code TextComparitor} parameter that you have chosen.
     */
    public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings)
    {
        // ------------------------------------------------------------------------------------
        // Argument validation.  This runs once, at the time the getter is built, so that bad
        // arguments are reported to the caller immediately rather than during a scrape.
        // ------------------------------------------------------------------------------------

        // The compare-strings are validated first, and the comparitor second.
        TCCompareStrException.check(cssClassCompareStrings);

        if (tc == null)
        {
            throw new NullPointerException(
                "Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here."
            );
        }

        // A textual description of the search this getter performs.  It only ever appears inside
        // of exception-messages.
        final String searchDescription =
            "InnerTagGetInclusive.first(page, \"div\", \"class\", "
            + STR_FORMAT_TC_PARAMS(tc, cssClassCompareStrings)
            + ")";

        // ------------------------------------------------------------------------------------
        // The getter itself
        // ------------------------------------------------------------------------------------

        return (url, page) ->
        {
            // Performed on each and every invocation: per the checks' contract, the inputs must
            // be non-null and the page must be non-empty.  ArticleGetException is a checked
            // exception, and is used to halt a scrape that has been handed empty page-data.
            ArticleGetException.check(url, page);

            Vector<HTMLNode> articleBody;

            // Whatever the search throws is wrapped, with the original kept as the cause.
            try
            {
                articleBody = InnerTagGetInclusive.first
                    (page, "div", "class", tc, cssClassCompareStrings);
            }
            catch (Exception cause)
            {
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, searchDescription, cause);
            }

            // A null or an empty result means that no <DIV CLASS=...> matching the comparitor
            // and compare-strings was found, i.e. the article-body could not be retrieved.
            if (articleBody == null)
            {
                throw new ArticleGetException(ArticleGetException.RET_NULL, searchDescription);
            }

            if (articleBody.isEmpty())
            {
                throw new ArticleGetException
                    (ArticleGetException.RET_EMPTY_VECTOR, searchDescription);
            }

            return articleBody;
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML DIV element,
     * and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
     * <I><B>However,</B></I> This factory method allows a programmer to select article content
     * that handles other cases than the {@code 95%}, where you specify the HTML-token,
     * attribute-<B STYLE='color: red;'>name</B> and use the usual {@code TextComparitor} to find
     * the article.
     * 
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     * 
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     * 
     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
     * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 
     * contains, StrCmpr.containsIgnoreCase(...)}.
     * 
     * @param attributeValueCompareStrings These are the {@code String's} that are compared
     * against the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}.
     * 
     * @return This returns an "Article Getter" that picks out the part of a news-website article
     * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id,
     * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified
     * {@code inner-tag} can be matched by the {@code TextComparitor} and the 
     * compare-{@code String's}.
     */
    public static ArticleGet usual
        (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        TCCompareStrException.check(attributeValueCompareStrings);

        if (tc == null) throw new NullPointerException
            ("Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here.");

        // A null tag or attribute-name would otherwise surface as a message-less
        // NullPointerException thrown by the toLowerCase calls below.  The exception-type and the
        // order in which the parameters are validated are both unchanged.
        if (htmlTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'htmlTag', but this is not allowed here.");

        if (innerTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'innerTag', but this is not allowed here.");

        // HTML element and attribute names are ASCII identifiers.  Lower-casing with Locale.ROOT
        // keeps the result independent of the JVM's default locale.  (With a Turkish default
        // locale, the no-argument toLowerCase() turns "DIV" / "ID" into dotless-i variants.)
        final String htmlTagLC  = htmlTag.toLowerCase(java.util.Locale.ROOT);
        final String innerTagLC = innerTag.toLowerCase(java.util.Locale.ROOT);

        // This 'final String' is merely used for proper error reporting in any potential
        // exception-messages, nothing else.  It shows the tag and attribute as the caller
        // spelled them.

        final String functionNameStr =
            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
            STR_FORMAT_TC_PARAMS(tc, attributeValueCompareStrings) + ")";

        // Check for valid HTML Tag.
        HTMLTokException.check(htmlTagLC);

        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
        InclusiveException.check(htmlTagLC);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            Vector<HTMLNode> ret;

            // Any exception thrown by the search itself is wrapped, with the original exception
            // preserved as the cause.
            try
            {
                ret = InnerTagGetInclusive.first
                    (page, htmlTagLC, innerTagLC, tc, attributeValueCompareStrings);
            }
            catch (Exception e) // unlikely
            {
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
            }

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
            // attribute produced a null news-article page, or an empty news-article page.

            if (ret == null) throw new ArticleGetException
                (ArticleGetException.RET_NULL, functionNameStr);

            if (ret.size() == 0) throw new ArticleGetException
                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);

            return ret;
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML DIV element, and
     * the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.  
     * <I><B>However,</B></I> This factory method allows a programmer to select article content
     * that handles other cases than the {@code 95%}.  Here, you may specify the HTML-token,
     * attribute-<B STYLE='color: red;'>name</B> and use a Java Regular-Expression handler to
     * test the <B STYLE='color: red;'>value</B> of the attribute - no matter how complicated or
     * bizarre.
     *
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     *
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param innerTagValuePattern Any regular-expression.  It will be used to <B>PASS</B> or
     * <B>FAIL</B> the attribute-<B STYLE='color: red;'>value</B> <I>(a name that is used
     * interchangeably in this scrape/search package for
     * "inner-tag-<B STYLE='color: red;'>value</B>")</I> when compared against this
     * regular-expression parameter.
     *
     * <BR /><BR /><B>HELP:</B> This would be like saying:
     * <DIV CLASS="SNIP">{@code
     * // Pick some random HTML TagNode
     * TagNode aTagNode        = (TagNode) page.elementAt(index_to_test);
     *
     * // Gets the attribute value of "innerTag"
     * String  attributeValue  = aTagNode.AV(innerTag);
     *
     * // Make sure the HTML-token is as specified
     * // calls to: java.util.regex.*;
     * boolean passFail = aTagNode.tok.equals(htmlTag) &&
     *      innerTagValuePattern.matcher(attributeValue).find();
     * }</DIV>
     *
     * @return This returns an "Article Getter" that picks out the part of a news-website article
     * that lies between the HTML element which matches the htmlTag, innerTag and value-testing
     * regex {@code Pattern "innerTagValuePattern"}.
     */
    public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Each of these nulls would otherwise surface as a message-less NullPointerException
        // (from toLowerCase, or from innerTagValuePattern.pattern() below).  The exception-type
        // is unchanged; only a message is supplied.
        if (htmlTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'htmlTag', but this is not allowed here.");

        if (innerTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'innerTag', but this is not allowed here.");

        if (innerTagValuePattern == null) throw new NullPointerException(
            "Null has been passed to parameter 'innerTagValuePattern', but this is not allowed " +
            "here."
        );

        // HTML element and attribute names are ASCII identifiers.  Lower-casing with Locale.ROOT
        // keeps the result independent of the JVM's default locale.  (With a Turkish default
        // locale, the no-argument toLowerCase() turns "DIV" / "ID" into dotless-i variants.)
        final String htmlTagLC  = htmlTag.toLowerCase(java.util.Locale.ROOT);
        final String innerTagLC = innerTag.toLowerCase(java.util.Locale.ROOT);

        // This 'final String' is merely used for proper error reporting in any potential
        // exception-messages, nothing else.  It shows the tag and attribute as the caller
        // spelled them, followed by the regular-expression's source text.

        final String functionNameStr =
            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
            innerTagValuePattern.pattern() + ")";

        // Check for valid HTML Tag.
        HTMLTokException.check(htmlTagLC);

        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
        InclusiveException.check(htmlTagLC);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            Vector<HTMLNode> ret;

            // Any exception thrown by the search itself is wrapped, with the original exception
            // preserved as the cause.
            try
            {
                ret = InnerTagGetInclusive.first
                    (page, htmlTagLC, innerTagLC, innerTagValuePattern);
            }
            catch (Exception e) // unlikely
            {
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
            }

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
            // attribute produced a null news-article page, or an empty news-article page.

            if (ret == null) throw new ArticleGetException
                (ArticleGetException.RET_NULL, functionNameStr);

            if (ret.size() == 0) throw new ArticleGetException
                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);

            return ret;
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML {@code 'DIV'}
     * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
     * <I><B>However,</B></I> This factory method allows a programmer to select article content
     * that handles other cases than the {@code 95%}, where you specify the HTML-token, 
     * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the
     * page-body.
     *
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     *
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param p This java "lambda {@code Predicate}" will just receive the 
     * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer.
     *
     * @return This returns an "Article Getter" that matches an HTML element specified by
     * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter
     * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag.
     */
    public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // A null tag or attribute-name would otherwise surface as a message-less
        // NullPointerException thrown by the toLowerCase calls below.  The exception-type is
        // unchanged; only a message is supplied.
        if (htmlTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'htmlTag', but this is not allowed here.");

        if (innerTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'innerTag', but this is not allowed here.");

        // HTML element and attribute names are ASCII identifiers.  Lower-casing with Locale.ROOT
        // keeps the result independent of the JVM's default locale.  (With a Turkish default
        // locale, the no-argument toLowerCase() turns "DIV" / "ID" into dotless-i variants.)
        final String htmlTagLC  = htmlTag.toLowerCase(java.util.Locale.ROOT);
        final String innerTagLC = innerTag.toLowerCase(java.util.Locale.ROOT);

        // This 'final' String is merely used for proper error reporting in any potential
        // exception-messages, nothing else.  A Predicate has no printable form, so only its
        // type is shown.

        final String functionNameStr =
            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
            "Predicate<String>)";

        // Check for valid HTML Tag.
        HTMLTokException.check(htmlTagLC);

        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
        InclusiveException.check(htmlTagLC);

        if (p == null) throw new NullPointerException
            ("Null has been passed to Predicate parameter 'p'.  This is not allowed here.");


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            Vector<HTMLNode> ret;

            // Any exception thrown by the search (or by the user-supplied Predicate) is wrapped,
            // with the original exception preserved as the cause.
            try
                { ret = InnerTagGetInclusive.first(page, htmlTagLC, innerTagLC, p); }

            catch (Exception e)
            {
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
            }

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
            // attribute produced a null news-article page, or an empty news-article page.
            //
            // No exception was caught on these two paths, so there is no cause to report: the
            // cause-less constructor is used, rather than passing an explicit null cause.

            if (ret == null) throw new ArticleGetException
                (ArticleGetException.RET_NULL, functionNameStr);

            if (ret.size() == 0) throw new ArticleGetException
                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);

            return ret;
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article
     * body-content based on a "start-tag" and an "end-tag."  It is <B><I>very</I></B> important to
     * note that the text can only match a single text-node, and not span multiple text-nodes, or
     * be within {@code TagNode's} at all!  This should be easy to find: print out the HTML page as
     * a {@code Vector}, and inspect it!
     * 
     * @param startTextTag This must be text from an HTML {@code TextNode} that is
     * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page.
     * 
     * @param endTextTag This must be text from an HTML {@code TextNode} that is also
     * <I><B>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page.
     * 
     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in
     * the article, specified by the text-tag parameters, and gets it.
     */
    public static ArticleGet usual(String startTextTag, String endTextTag)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        if (startTextTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'startTextTag', but this is not allowed here.");

        if (endTextTag == null) throw new NullPointerException
            ("Null has been passed to parameter 'endTextTag', but this is not allowed here.");


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            final int   size    = page.size();
            HTMLNode    n       = null;

            // Locate the first TextNode containing the start-text.  The index is tested against
            // 'size' BEFORE the element is read, so a page without the text leaves start == size
            // rather than reading one element past the end of the Vector.

            int start = 0;

            for (; start < size; start++)
                if ((n = page.elementAt(start)) instanceof TextNode)
                    if (n.str.contains(startTextTag))
                        break;

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case it is because the start/end tags were not found
            // in the text of the vectorized-html news-article web-page.

            if (start == size) throw new ArticleGetException(
                "Start Text Tag [" + startTextTag + "], was not found on the News Article HTML " +
                "page."
            );

            // Locate the first TextNode, at or after 'start', containing the end-text.  Beginning
            // the scan at 'start' guarantees end >= start, so the range handed to cloneRange is
            // never inverted by an earlier occurrence of the end-text on the page.

            int end = start;

            for (; end < size; end++)
                if ((n = page.elementAt(end)) instanceof TextNode)
                    if (n.str.contains(endTextTag))
                        break;

            if (end == size) throw new ArticleGetException(
                "End Text Tag [" + endTextTag + "], was not found on the News Article HTML " +
                "page."
            );

            // 'end + 1' is passed so that the TextNode holding the end-text is part of the copy
            // (presumably cloneRange treats its last argument as exclusive - confirm in Util).

            return Util.cloneRange(page, start, end + 1);
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * This factory method generates an "ArticleGet" that will retrieve news-article body-content
     * based on starting and ending regular-expressions.  The matches performed by the Regular
     * Expression checker will be performed on {@code TextNode's}, not on the {@code TagNode's}, or
     * the page itself.  It is <B><I>very</I></B> important to note that the text can only match a
     * single {@code TextNode}, and not span multiple {@code TextNode's}, or be within
     * {@code TagNode's} at all!  This should be easy to find: print up the HTML page as a
     * {@code Vector}, and inspect it!
     * 
     * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML
     * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of
     * the vectorized-HTML page.
     * 
     * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML
     * {@code TextNode} that is also <I><B>contained</B> in a single {@code TextNode}</I> of the
     * vectorized-HTML page.
     * 
     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B>
     * in the article, specified by the regular-expression pattern-matching parameters, and gets it.
     */
    public static ArticleGet usual(Pattern startPattern, Pattern endPattern)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        if (startPattern == null) throw new NullPointerException
            ("Null has been passed to parameter 'startPattern', but this is not allowed here.");

        if (endPattern == null) throw new NullPointerException
            ("Null has been passed to parameter 'endPattern', but this is not allowed here.");


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This exception-check is done on every invocation of this Lambda-Function.
            // It is merely checking that these inputs are not-null, and page is of non-zero size.
            // ArticleGetException is a compile-time, checked exception.  It is important to halt
            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
            // NOTE: This would imply an internal-error with class Download has occurred.

            ArticleGetException.check(url, page);

            final int   size    = page.size();
            HTMLNode    n       = null;

            // Locate the first TextNode in which the start-regex finds a match.  The index is
            // tested against 'size' BEFORE the element is read, so a page without a match leaves
            // start == size rather than reading one element past the end of the Vector.

            int start = 0;

            for (; start < size; start++)
                if ((n = page.elementAt(start)) instanceof TextNode)
                    if (startPattern.matcher(n.str).find())
                        break;

            // These error-checks are used to deduce whether the "Article Get" was successful.
            // When this exception is thrown, it means that the user-specified means of "Retrieving
            // an Article Body" FAILED.  In this case it is because the start or end regex failed to
            // match.

            if (start == size) throw new ArticleGetException(
                "Start Pattern [" + startPattern.toString() + "], was not found on the HTML " +
                "page."
            );

            // Locate the first TextNode, at or after 'start', in which the end-regex finds a
            // match.  Beginning the scan at 'start' guarantees end >= start, so the range handed
            // to cloneRange is never inverted by an earlier match of the end-regex on the page.

            int end = start;

            for (; end < size; end++)
                if ((n = page.elementAt(end)) instanceof TextNode)
                    if (endPattern.matcher(n.str).find())
                        break;

            if (end == size) throw new ArticleGetException
                ("End Pattern [" + endPattern.toString() + "], was not found on the HTML page.");

            // 'end + 1' is passed so that the TextNode matched by the end-regex is part of the
            // copy (presumably cloneRange treats its last argument as exclusive - confirm in Util).

            return Util.cloneRange(page, start, end + 1);
        };
    }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * This is just a way to put a list of article-parse objects into a single "branching"
     * article-parse {@code Object}.  The two parameters must be equal-length arrays, with non-null
     * elements.  Each {@code 'urlSelector'} will be tested, and when a selector passes, the
     * {@code ArticleGet} that is created will use the "parallel getter" from the parallel array
     * "getters."
     *
     * <BR /><BR /><B>LAY-SPEAK:</B> The best way to summarize this is if a programmer is going to
     * use the {@code NewsSiteScrape} class, and planning to scrape a site that has different types
     * of news-articles, he will need differing {@code "ArticleGet"} methods.  This class will take
     * two {@code array's} that match the {@code URL} from which the article was retrieved with the
     * particular "getter" method you have provided.  When I scrape the address:
     * {@code http://www.baidu.com/} - a Chinese News Web-Site, it links to at least three primary
     * domains:
     *
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI>{@code http://...chinesenews.com/director.../article...}</LI>
     * <LI>{@code http://...xinhuanet.com/director.../article...}</LI>
     * <LI>{@code http://...cctv.com/director.../article...}</LI>
     * </OL>
     *
     * <BR />Results from each of these sites need to be "handled" just ever-so-slightly different.
     * 
     * @param urlSelectors This is a list of {@code Predicate<URL>} elements.  When one of these
     * returns {@code TRUE} for a particular {@code URL}, then the index of that
     * {@code URL}-selector in its {@code array} will be used to call the appropriate getter from
     * the parallel-{@code array} input-parameter {@code 'getters'}.
     * 
     * @param getters This is a list of getter elements.  These should be tailored to the
     * particular news-website source that are chosen/selected by the {@code 'urlSelectors'}
     * parallel {@code array}.
     * 
     * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."
     * All it does is simply traverse the first {@code array} looking for a
     * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the
     * parallel {@code array}.
     *
     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
     * <B><I>"dispatch"</I></B> or rather <B><I>"branch"</I></B> is called by {@code class 
     * NewsSiteScrape}, the function/getter that is returned will throw an 
     * {@code ArticleGetException}.  It is important that the programmer only allow article
     * {@code URL's} that he can capably handle to pass to {@code class NewsSiteScrape}.
     *
     * @throws IllegalArgumentException Will throw this exception if:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>Either of the arrays has zero elements.</LI>
     * <LI>If they are not parallel, with differing lengths.</LI>
     * <LI>If either contain a null value.</LI>
     * </UL>
     *
     * @throws NullPointerException If either of these parameters are null.
     */
    public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters)
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // A null array would otherwise surface as a message-less NullPointerException from the
        // '.length' reads below.  The exception type is kept; only the message is added.

        if (urlSelectors == null) throw new NullPointerException
            ("Null has been passed to parameter 'urlSelectors', but this is not allowed here.");

        if (getters == null) throw new NullPointerException
            ("Null has been passed to parameter 'getters', but this is not allowed here.");

        // Defensive copies: the returned lambda reads these arrays on every invocation, so it
        // must not observe modifications the caller makes to the originals after validation.

        final URLFilter[]   selectors   = urlSelectors.clone();
        final ArticleGet[]  articleGets = getters.clone();

        if (selectors.length == 0) throw new IllegalArgumentException
            ("parameter 'urlSelectors' had zero-elements.");

        if (articleGets.length == 0) throw new IllegalArgumentException
            ("parameter 'getters' had zero-elements.");

        // Verifies that the two arrays are parallel (the 'true' flags presumably request a
        // check for null elements as well - confirm against ParallelArrayException.check).

        ParallelArrayException.check(selectors, "urlSelectors", true, articleGets, "getters", true);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // The first selector that accepts the URL decides which getter handles the page.
            for (int i=0; i < selectors.length; i++)
                if (selectors[i].test(url))
                    return articleGets[i].apply(url, page);

            // No selector accepted this URL, so there is no getter to dispatch to.
            throw new ArticleGetException(
                "None of the urlSelectors you have provided matched the URL sent to this " +
                "instance of ArticleGet."
            );
        };
    }


    // ******************************************************************************************
    // Other Methods
    // ******************************************************************************************

    /**
     * This is the standard-java {@code Function 'andThen'} method.
     *
     * @param after This is the {@code ArticleGet} that will be (automatically) applied after
     * {@code 'this'} function. 
     *
     * @return A new, composite {@code ArticleGet} that performs both operations. It will:
     *
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI> Run {@code 'this'} function's {@code 'apply'} method to a
     *      {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}.
     *      <BR /><BR />
     *      </LI>
     * <LI> Then it will run the {@code 'after'} function's {@code 'apply'} method to the
     *      results of {@code 'this.apply(...)'} and return the result.
     *      <BR />
     *      </LI>
     * </OL>
     */
    default ArticleGet andThen(ArticleGet after)
    {
        // FAIL-FAST: reject a null 'after' when the composite is built, rather than letting it
        // surface later as a NullPointerException from inside the returned getter.

        if (after == null) throw new NullPointerException
            ("Null has been passed to parameter 'after', but this is not allowed here.");

        // 'this' runs first; its result becomes the page that is handed to 'after'.
        return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page));
    }

    /**
     * This is the standard-java {@code Function 'compose'} method.
     * 
     * @param before This is the {@code ArticleGet} that is performed first, whose results are
     * sent to {@code 'this'} function.
     * 
     * @return A new composite {@code ArticleGet} that performs both operations.
     * It will:
     * 
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI> Run the {@code 'before'} function's {@code 'apply'} method to a
     *      {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}.
     *      </LI>
     * <LI> Then it will run {@code 'this'} function's {@code 'apply'} method to the
     *      results of the {@code before.apply(...)} and return the result.
     *      </LI>
     * </OL>
     */
    default ArticleGet compose(ArticleGet before)
    {
        // FAIL-FAST: reject a null 'before' when the composite is built, rather than letting it
        // surface later as a NullPointerException from inside the returned getter.

        if (before == null) throw new NullPointerException
            ("Null has been passed to parameter 'before', but this is not allowed here.");

        // 'before' runs first; its result becomes the page that is handed to 'this'.
        return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page));
    }

    /**
     * The identity function will always return the same {@code Vector<HTMLNode>} as output that
     * it receives as input.  This is one of the {@code default} Java's lambda-methods.
     * 
     * @return a new {@code ArticleGet} which (it should be obvious) is of type:
     * {@code java.util.function.Function<Vector<HTMLNode>, Vector<HTMLNode>>}
     * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to
     * the input {@code Vector}.</I>
     */
    static ArticleGet identity()
    {
        // A pass-through getter: the inputs are run through ArticleGetException.check, and the
        // very same Vector instance that was received is handed back to the caller.

        final ArticleGet passThrough = (URL articleURL, Vector<HTMLNode> articlePage) ->
        {
            ArticleGetException.check(articleURL, articlePage);

            return articlePage;
        };

        return passThrough;
    }

    // Internally used "Helper Method"
    /** Internally Used.  */
    static String STR_FORMAT_TC_PARAMS(TextComparitor tc, String... compareStrings)
    {
        // Builds a printable fragment naming the TextComparitor followed by the quoted
        // compare-strings.  Once appending the next compare-string would push the text past
        // MAX_LEN characters, that string is cut short, "..." is appended, and the loop stops.

        final int MAX_LEN = 120;

        String tcName = TextComparitor.getName(tc);

        StringBuilder sb = new StringBuilder(
            (tcName != null)
                ? "TextComparitor." + tcName + ", "
                : "TextComparitor.(Anonymous-TC), "
        );

        for (int i=0; i < compareStrings.length; i++)
        {
            String str = compareStrings[i];

            if ((sb.length() + str.length()) > MAX_LEN)
            {
                // Keep only as many characters as there is room left for.  The quotes and
                // separators already appended may have pushed the text past MAX_LEN, in which
                // case the room is clamped to zero instead of producing an invalid index.

                int room = Math.max(0, MAX_LEN - sb.length());

                sb.append('"').append(str, 0, room).append("...");
                break;
            }

            sb.append('"').append(str);

            // Every string but the last is followed by a separator; the last closes the list.
            sb.append((i < (compareStrings.length - 1)) ? "\", " : "\")");
        }

        return sb.toString();
    }
}