001package Torello.HTML.Tools.NewsSite;
002
003import java.util.function.*;
004import java.util.*;
005import java.util.regex.*;
006
007import java.net.URL;
008
009import Torello.HTML.*;
010import Torello.HTML.NodeSearch.*;
011
012import Torello.Java.ParallelArrayException;
013
014/**
015 * A function-pointer / lambda target for extracting an article's content from the web-page
016 * from whence it was downloaded; including several {@code static}-builder methods for the
 * most common means of finding the HTML-Tags that wrap article-HTML on news-media websites.
018 * 
019 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
020 */
021@FunctionalInterface
022public interface ArticleGet extends java.io.Serializable
023{
024    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI>  */
025    public static final long serialVersionUID = 1;
026
027    // ******************************************************************************************
028    // Standard Functional Interface Method
029    // ******************************************************************************************
030
031    /**
032     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
033     *
034     * <BR /><BR />This method's purpose is to take a "Scraped HTML Page" (stored as a
035     * Vectorized-HTML Web-Page), and return an HTML {@code Vector} that contains only the
036     * "Article Content" - <I>which is usually just called the "Article Body."</I>  Perhaps it
037     * seems daunting, but <I>the usual way</I> to get the actual article-body of an HTML
038     * News-Website Page is to simply identify an {@code HTML <DIV ID="..." CLASS="...">}
039     * surrounding element.
040     *
041     * <BR /><BR />This class has <I>several different static-methods called "usual"</I> which
     * automatically create a page-getter.  The example at the top of this class should highlight
043     * how this works.  Extracting news-content from a page that has already been downloaded - is
044     * usually trivial.  The point really becomes identifying the {@code <DIV>}'s {@code class=...}
045     * or {@code id=...} attributes &amp; page-structure to find the article-body.  Generally, in
     * your browser just click the {@code View Source} and look at it manually to find the attributes
047     * used.  Using the myriad Get methods from {@code Torello.HTML.NodeSearch} usually boils down
048     * to code that looks surreptitiously like Java-Script:
049     *
050     * <BR /><DIV CLASS="JAVASCRIPT">{@code
051     *  var articleHTML = document.getElementById("article-body").innerHTML;
052     *
053     *  // or...
     *  var articleHTML = document.getElementsByClassName("article-body")[0].innerHTML;
055     * }</DIV>
056     *
057     * <BR />Using the {@code NodeSearch} package, the above DOM-Tree Java-Script is easily written
058     * in Java as below:
059     *
060     * <DIV CLASS="SNIP">{@code
061     *  // For articles with HTML divider elements having an "ID" attribute to specify the article
062     *  // body, get the article using the code below.  In this example, the particular newspaper
063     *  // web-site has articles whose content ("Article Body") is simply wrapped in an HTML
064     *  // HTML Divider Element: <DIV ID="article-body"> ... </DIV>
065     * 
066     *  // For extracting that content use the NodeSearch Package Class: InnerTagGetInclusive
067     *
068     *  Vector<HTMLNode> articleBody = InnerTagGetInclusive
069     *      (page, "div", "id", TextComparitor.EQ_CI, "article-body");
070     *
071     *  // To use this NodeSearch Package Class with the NewsSite Package, simply use one of the
072     *  // 'usual' methods in class ArticleGet, and the lambda Functional Interface "ArticleGet"
073     *  // will be built automatically as such:
074     *
075     *  ArticleGet getter = ArticleGet.usual("div", "id", TextComparitor.EQ_CI, "article-body");
076     *
077     *  // For articles with HTML divider elements having an "CLASS" attribute to specify
078     *  // the article body, get the article with the following code.  Note that in this example
079     *  // the article body is wrapped in an HTML Divider Element that has the characteristics
080     *  // <DIV CLASS="article-body"> ... </DIV>.  The content of a Newspaper Article can be easily
081     *  // extracted with just one line of code using the methods in the NodeSearch Package as
082     *  // follows: 
083     *
084     *  Vector<HTMLNode> articleBody = InnerTagGetInclusive
085     *      (page, "div", "class", TextComparitor.C, "article-body");
086     *
087     *  // which should be written for use with the ScrapeArticles class as using the 'usual'
088     *  // methods in ArticleGet as such:
089     *
     *  ArticleGet getter = ArticleGet.usual(TextComparitor.C, "article-body");
091     * }</DIV>
092     *
093     * <BR /><BR /><B>NOTE:</B> For all examples above, the text-string "article-body" will be
094     * a tag-value that (was) decided/chosen by the HTML news-website, or content-website you want
095     * to scrape.
096     *
097     * <BR /><BR /><B><SPAN STYLE="color: red">ALSO:</SPAN></B> One might have to be careful about
     * modifying the input to this method.  Each and every one of the NodeSearch classes
099     * retrieves a copy (read: <B><I>a clone</I></B>) of the input {@code Vector} (other than the
100     * classes that actually use the term "remove.")  However, if you were to write an Article
101     * Get lambda of your own (rather than using the "usual" methods), make sure you know whether
102     * you are going to <I>intentionally</I>, modify the input-page, and if so, remember you have.
103     *
104     * <BR /><BR /><B><SPAN STYLE="color: red">FURTHERMORE:</SPAN></B> There are many content-based
105     * web-sites that have some (even "a lot") of spurious HTML information inside the primary
106     * article body, even after the header &amp; footer information has been eliminated.  It may be
107     * necessary to do some vector-cleaning later on.  For example: getting rid of "Post to
108     * Facebook", "Post to Twitter" or "E-Mail Link" buttons.
109     */
110    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;
111
112    // ******************************************************************************************
113    // Filter Factory / Filter-Generator  static-methods
114    // ******************************************************************************************
115
116    /**
117     * <I>This is a static, factory method for building ArticleGet.</I>
118     *
119     * <BR /><BR />This builds an "Article Getter" based on a parameter-specified HTML Tag.  Two
120     * or three common HTML "semantic elements" used for wrapping newspaper article-content
121     * include these:
122     *
123     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code <ARTICLE ...> article-body </ARTICLE>}</LI>
125     * <LI>{@code <MAIN ...> article-body </MAIN>}</LI>
126     * <LI>{@code <SECTION ...> article-body </SECTION>}</LI>
127     * </UL> 
128     *
129     * <BR />Identifying which tag to use can be accomplished by going to the main-page of an
130     * internet news web-site, selecting a news-article, and then using the {@code "View Source"}
     * or the {@code "View Page Source"} depending upon which browser you are using, and then
132     * scanning the HTML to find what elements are used to wrap the article-body.
133     *
     * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the
     * {@code class NewsSiteScrape}.  As long as the news or content website that you are scraping
     * has its page-body wrapped inside of the HTML element <I><B>whose tag-name is one you have
     * uncovered by inspecting the page manually</B></I>, then the {@code ArticleGet} produced by
     * this factory-method will retrieve your page content appropriately.  Note that only the
     * tag-name is tested here; no attribute of the element is inspected.
140     *
141     * @param htmlTag This should be the HTML element that is used to wrap the actual news-content
142     * article-body of an HTML news web-site page.
143     * 
144     * @return This returns an "Article Getter" that just picks out the part of a news-website
145     * article that lies between the open and closed version of the specified htmlTag.
146     */
147    public static ArticleGet usual(String htmlTag)
148    {
149        final String htmlTagLC = htmlTag.toLowerCase();
150
151        // This 'final String' is merely used for proper error reporting in any potential
152        // exception-messages, nothing else.
153        final String functionNameStr = "TagNodeGetInclusive.first(page, \"" + htmlTagLC + "\");";
154
155
156        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
157        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
158        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
159
160        // Check for valid HTML Token
161        HTMLTokException.check(htmlTagLC);
162
163        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
164        InclusiveException.check(htmlTagLC);
165
166
167        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
168        // Build the instance, using a lambda-expression
169        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
170
171        return (URL url, Vector<HTMLNode> page) ->
172        {
173            // This exception-check is done on every invocation of this Lambda-Function.
174            // It is merely checking that these inputs are not-null, and page is of non-zero size.
175            // ArticleGetException is a compile-time, checked exception.  It is important to halt
176            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
177            // NOTE: This would imply an internal-error with class Download has occurred.
178
179            ArticleGetException.check(url, page);   
180
181            Vector<HTMLNode> ret;
182
183            try
184                { ret = TagNodeGetInclusive.first(page, htmlTagLC); }
185
186            catch (Exception e)
187            {
188                throw new ArticleGetException
189                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
190            }
191
192            // These error-checks are used to deduce whether the "Article Get" was successful.
193            // When this exception is thrown, it means that the user-specified means of "Retrieving
194            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag was
195            // not found, and produced a null news-article page, or an empty news-article page.
196
197            if (ret == null) throw new ArticleGetException
198                (ArticleGetException.RET_NULL, functionNameStr);
199
200            if (ret.size() == 0) throw new ArticleGetException
201                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);
202
203            return ret;
204        };
205    }
206
207    /**
208     * <I>This is a static, factory method for building ArticleGet.</I>
209     *
210     * <BR /><BR />This builds an "Article Getter" for you, using the most common way to get
     * an article - specifically via the {@code HTML <DIV CLASS="...">} element and its
212     * {@code CSS 'class'} selector.  
213     *
214     * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the
215     * {@code class NewsSiteScrape}.  As long as the news or content website that you are scraping
     * has its page-body wrapped inside of an {@code HTML <DIV>} element <I><B>whose
217     * {@code CSS 'class'} specifier is one you have uncovered by inspecting the
218     * page-manually</B></I> then {@code ArticleGet} produced by this factory-method will retrieve
219     * your page content appropriately.
220     * 
221     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
222     * TextComparitor is just a {@code String} compare function like: {@code equals, contains,
223     * StrCmpr.containsIgnoreCase(...)}, etc...
224     * 
225     * @param cssClassCompareStrings These are the values to be used by the 
226     * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"}
227     * from the list of {@code DIV} elements on the page.
228     * 
229     * @return This returns an "Article Getter" that just picks out the part of a news-website
230     * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by
231     * the "CSS (Cascading Style Sheets) {@code 'class'} identifier,  and the
232     * {@code TextComparitor} parameter that you have chosen.
233     */
234    public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings)
235    {
236        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
237        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
238        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
239
240        // Check for valid compareStrings
241        TCCompareStrException.check(cssClassCompareStrings);
242
243        if (tc == null) throw new NullPointerException
244            ("Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here.");
245
246        // This 'final' String is merely used for proper error reporting in any potential 
247        // exception-messages, nothing else.
248
249        final String functionNameStr =
250            "InnerTagGetInclusive.first(page, \"div\", \"class\", " +
251            STR_FORMAT_TC_PARAMS(tc, cssClassCompareStrings) + ")";
252
253        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
254        // Build the instance, using a lambda-expression
255        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
256
257        return (URL url, Vector<HTMLNode> page) ->
258        {
259            // This exception-check is done on every invocation of this Lambda-Function.
260            // It is merely checking that these inputs are not-null, and page is of non-zero size.
261            // ArticleGetException is a compile-time, checked exception.  It is important to halt
262            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
263            // NOTE: This would imply an internal-error with class Download has occurred.
264
265            ArticleGetException.check(url, page);
266
267            Vector<HTMLNode> ret;
268
269            try
270            {
271                ret = InnerTagGetInclusive.first
272                    (page, "div", "class", tc, cssClassCompareStrings);
273            }
274            catch (Exception e) 
275            { 
276                throw new ArticleGetException
277                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
278            }
279
280            // These error-checks are used to deduce whether the "Article Get" was successful.
281            // When this exception is thrown, it means that the user-specified means of "Retrieving
282            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmltag and
283            // class of the <DIV CLASS=...> produced a null news-article page, or an empty
284            // news-article page.
285
286            if (ret == null) throw new ArticleGetException
287                (ArticleGetException.RET_NULL, functionNameStr);
288
289            if (ret.size() == 0) throw new ArticleGetException
290                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);
291
292            return ret;
293        };
294    }
295
296    /**
297     * <I>This is a static, factory method for building ArticleGet.</I>
298     *
299     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML DIV element,
301     * and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
302     * <I><B>However,</B></I> This factory method allows a programmer to select article content
303     * that handles other cases than the {@code 95%}, where you specify the HTML-token,
304     * attribute-<B STYLE='color: red;'>name</B> and use the usual {@code TextComparitor} to find
305     * the article.
306     * 
307     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
308     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
309     * or {@code <FRAME>}, then you may.
310     * 
311     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
312     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
313     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
314     * 
315     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
316     * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 
317     * contains, StrCmpr.containsIgnoreCase(...)}.
318     * 
319     * @param attributeValueCompareStrings These are the {@code String's} compared with using
320     * the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}.
321     * 
322     * @return This returns an "Article Getter" that picks out the part of a news-website article
323     * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id,
324     * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified
325     * {@code inner-tag} can be matched by the {@code TextComparitor} and the 
326     * compare-{@code String's}.
327     */
328    public static ArticleGet usual
329        (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings)
330    {
331        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
332        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
333        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
334
335        TCCompareStrException.check(attributeValueCompareStrings);
336
337        if (tc == null) throw new NullPointerException
338            ("Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here.");
339
340        final String htmlTagLC  = htmlTag.toLowerCase();
341        final String innerTagLC = innerTag.toLowerCase();
342
343        // This 'final String' is merely used for proper error reporting in any potential
344        // exception-messages, nothing else.
345
346        final String functionNameStr =
347            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
348            STR_FORMAT_TC_PARAMS(tc, attributeValueCompareStrings) + ")";
349
350        // Check for valid HTML Tag.
351        HTMLTokException.check(htmlTagLC);
352
353        // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations.
354        InclusiveException.check(htmlTagLC);
355
356
357        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
358        // Build the instance, using a lambda-expression
359        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
360
361        return (URL url, Vector<HTMLNode> page) ->
362        {
363            // This exception-check is done on every invocation of this Lambda-Function.
364            // It is merely checking that these inputs are not-null, and page is of non-zero size.
365            // ArticleGetException is a compile-time, checked exception.  It is important to halt
366            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
367            // NOTE: This would imply an internal-error with class Download has occurred.
368
369            ArticleGetException.check(url, page);   
370
371            Vector<HTMLNode> ret;
372
373            try
374            { 
375                ret = InnerTagGetInclusive.first
376                    (page, htmlTagLC, innerTagLC, tc, attributeValueCompareStrings);
377            }
378            catch (Exception e) // unlikely
379            { 
380                throw new ArticleGetException
381                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
382            }
383
384            // These error-checks are used to deduce whether the "Article Get" was successful.
385            // When this exception is thrown, it means that the user-specified means of "Retrieving
386            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
387            // attribute produced a null news-article page, or an empty news-article page.
388
389            if (ret == null) throw new ArticleGetException
390                (ArticleGetException.RET_NULL, functionNameStr);
391
392            if (ret.size() == 0) throw new ArticleGetException
393                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);
394
395            return ret;
396        };
397    }
398
399    /**
400     * <I>This is a static, factory method for building ArticleGet.</I>
401     *
402     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML DIV element, and
404     * the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.  
405     * <I><B>However,</B></I> This factory method allows a programmer to select article content
406     * that handles other cases than the {@code 95%}.  Here, you may specify the HTML-token,
407     * attribute-<B STYLE='color: red;'>name</B> and use a Java Regular-Expression handler to
408     * test the <B STYLE='color: red;'>value</B> of the attribute - no matter how complicated or
409     * bizarre.
410     *
411     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
412     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
413     * or {@code <FRAME>}, then you may.
414     *
415     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
416     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
417     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
418     *
419     * @param innerTagValuePattern Any regular-expression.  It will be used to <B>PASS</B> or
420     * <B>FAIL</B> the attribute-<B STYLE='color: red;'>value</B> <I>(a name that is used
421     * interchangeably in this scrape/search package for
422     * "inner-tag-<B STYLE='color: red;'>value</B>")</I> when compared against this
423     * regular-expression parameter.
424     *
425     * <BR /><BR /><B>HELP:</B> This would be like saying:
426     * <DIV CLASS="SNIP">{@code
427     * // Pick some random HTML TagNode
428     * TagNode aTagNode        = (TagNode) page.elementAt(index_to_test);
429     *
430     * // Gets the attribute value of "innerTag"
431     * String  attributeValue  = aTagNode.AV(innerTag);
432     *
433     * // Make sure the HTML-token is as specified
434     * // calls to: java.util.regex.*;
435     * boolean passFail = aTagNode.tok.equals(htmlTag) &&
436     *      innerTagValuePattern.matcher(attributeValue).find();
437     * }</DIV>
438     *
439     * @return This returns an "Article Getter" that picks out the part of a news-website article
     * that lies between the HTML element which matches the htmlTag, innerTag and value-testing
441     * regex {@code Pattern "innerTagValuePattern"}.
442     */
443    public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern)
444    {
445        final String htmlTagLC  = htmlTag.toLowerCase();
446        final String innerTagLC = innerTag.toLowerCase();
447
448        // This 'final String' is merely used for proper error reporting in any potential
449        // exception-messages, nothing else.
450
451        final String functionNameStr =
452            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
453            innerTagValuePattern.pattern() + ")";
454
455
456        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
457        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
458        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
459
460        HTMLTokException.check(htmlTagLC);
461        InclusiveException.check(htmlTagLC);
462
463
464        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
465        // Build the instance, using a lambda-expression
466        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
467
468        return (URL url, Vector<HTMLNode> page) ->
469        {
470            // This exception-check is done on every invocation of this Lambda-Function.
471            // It is merely checking that these inputs are not-null, and page is of non-zero size.
472            // ArticleGetException is a compile-time, checked exception.  It is important to halt
473            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
474            // NOTE: This would imply an internal-error with class Download has occurred.
475
476            ArticleGetException.check(url, page);
477
478            Vector<HTMLNode> ret;
479
480            try
481            { 
482                ret = InnerTagGetInclusive.first
483                    (page, htmlTagLC, innerTagLC, innerTagValuePattern);
484            }
485            catch (Exception e) // unlikely
486            { 
487                throw new ArticleGetException
488                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
489            }
490
491            // These error-checks are used to deduce whether the "Article Get" was successful.
492            // When this exception is thrown, it means that the user-specified means of "Retrieving
493            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
494            // attribute produced a null news-article page, or an empty news-article page.
495
496            if (ret == null) throw new ArticleGetException
497                (ArticleGetException.RET_NULL, functionNameStr);
498
499            if (ret.size() == 0) throw new ArticleGetException
500                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);
501
502            return ret;            
503        };
504    }
505
506    /**
507     * <I>This is a static, factory method for building ArticleGet.</I>
508     *
509     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML {@code 'DIV'}
511     * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
512     * <I><B>However,</B></I> This factory method allows a programmer to select article content
513     * that handles other cases than the {@code 95%}, where you specify the HTML-token, 
514     * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the
515     * page-body.
516     *
517     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
518     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
519     * or {@code <FRAME>}, then you may.
520     *
521     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
522     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
523     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
524     *
525     * @param p This java "lambda {@code Predicate}" will just receive the 
526     * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer.
527     *
528     * @return This returns an "Article Getter" that matches an HTML element specified by
529     * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter
530     * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag.
531     */
532    public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p)
533    {
534        final String htmlTagLC  = htmlTag.toLowerCase();
535        final String innerTagLC = innerTag.toLowerCase();
536
537        // This 'final' String is merely used for proper error reporting in any potential
538        // exception-messages, nothing else.
539
540        final String functionNameStr =
541            "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " +
542            "Predicate<String>)";
543
544
545        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
546        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
547        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
548
549        HTMLTokException.check(htmlTagLC);
550        InclusiveException.check(htmlTagLC);
551
552        if (p == null) throw new NullPointerException
553            ("Null has been passed to Predicate parameter 'p'.  This is not allowed here.");
554
555
556        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
557        // Build the instance, using a lambda-expression
558        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
559
560        return (URL url, Vector<HTMLNode> page) ->
561        {
562            // This exception-check is done on every invocation of this Lambda-Function.
563            // It is merely checking that these inputs are not-null, and page is of non-zero size.
564            // ArticleGetException is a compile-time, checked exception.  It is important to halt
565            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
566            // NOTE: This would imply an internal-error with class Download has occurred.
567
568            ArticleGetException.check(url, page);
569
570            Vector<HTMLNode> ret;
571
572            try
573                { ret = InnerTagGetInclusive.first(page, htmlTagLC, innerTagLC, p); }
574
575            catch (Exception e)
576            { 
577                throw new ArticleGetException
578                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
579            }
580
581            // These error-checks are used to deduce whether the "Article Get" was successful.
582            // When this exception is thrown, it means that the user-specified means of "Retrieving
583            // an Article Body" FAILED.  In this case, the "innerHTML" of the specified htmlTag and
584            // attribute produced a null news-article page, or an empty news-article page.
585
586            if (ret == null) throw new ArticleGetException
587                (ArticleGetException.RET_NULL, functionNameStr, null);
588
589            if (ret.size() == 0) throw new ArticleGetException
590                (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr, null);
591
592            return ret;
593        };
594    }
595
596    /**
597     * <I>This is a static, factory method for building ArticleGet.</I>
598     *
599     * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article
600     * body-content based on a "start-tag" and an "end-tag."  It is <B><I>very</I></B> to note,
601     * that the text can only match a single text-node, and not span multiple text-nodes, or be
602     * within {@code TagNode's} at all!  This should be easy to find, print up the HTML page as a
603     * {@code Vector}, and inspect it!
604     * 
605     * @param startTextTag This must be text from an HTML {@code TextNode} that is
606     * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page.
607     * 
608     * @param endTextTag This must be text from an HTML {@code TextNode} that is also
609     * <B><I>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page.
610     * 
611     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in
612     * the article, specified by the text-tag parameters, and gets it.
613     */
614    public static ArticleGet usual(String startTextTag, String endTextTag)
615    {
616        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
617        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
618        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
619
620        if (startTextTag == null) throw new NullPointerException
621            ("Null has been passed to parameter 'startTextTag', but this is not allowed here.");
622
623        if (endTextTag == null) throw new NullPointerException
624            ("Null has been passed to parameter 'endTextTag', but this is not allowed here.");
625
626
627        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
628        // Build the instance, using a lambda-expression
629        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
630
631        return (URL url, Vector<HTMLNode> page) ->
632        {
633            // This exception-check is done on every invokation of this Lambda-Function.
634            // It is merely checking that these inputs are not-null, and page is of non-zero size.
635            // ArticleGetException is a compile-time, checked exception.  It is important to halt
636            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
637            // NOTE: This would imply an internal-error with class Download has occured.
638
639            ArticleGetException.check(url, page);
640
641            int         start   = -1;
642            int         end     = -1;
643            HTMLNode    n       = null;
644
645            while (start++ < page.size())
646                if ((n = page.elementAt(start)) instanceof TextNode)
647                    if (n.str.contains(startTextTag))
648                        break;
649
650            while (end++ < page.size())
651                if ((n = page.elementAt(end)) instanceof TextNode)
652                    if (n.str.contains(endTextTag))
653                        break;
654
655            // These error-checks are used to deduce whether the "Article Get" was successful.
656            // When this exception is thrown, it means that the user-specified means of "Retrieving
657            // an Article Body" FAILED.  In this case it is because the start/end tags were not found
658            // in the text of the vectorized-html news-article web-page.
659
660            if (start == page.size()) throw new ArticleGetException(
661                "Start Text Tag [" + startTextTag + "], was not found on the News Article HTML " +
662                "page."
663            );
664
665            if (end == page.size()) throw new ArticleGetException(
666                "End Text Tag [" + endTextTag + "], was not found on the News Article HTML " +
667                "page."
668            );
669
670            return Util.cloneRange(page, start, end + 1);
671        };
672    }
673
674    /**
675     * <I>This is a static, factory method for building ArticleGet.</I>
676     *
677     * This factory method generates an "ArticleGet" that will retrieve news-article body-content
678     * based on starting and ending regular-expressions.  The matches performed by the Regular
679     * Expression checker will be performed on {@code TextNode's}, not on the {@code TagNode's}, or
680     * the page itself.  It is <B><I>very</I></B> to note, that the text can only match a single
681     * {@code TextNode}, and not span multiple {@code TextNode's}, or be within {@code TagNode's}
682     * at all!  This should be easy to find, print up the HTML page as a {@code Vector}, and
683     * inspect it!
684     * 
685     * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML
686     * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of
687     * the vectorized-HTML page.
688     * 
689     * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML
690     * {@code TextNode} that is also <B><I>contained</B> in a single  {@code TextNode}</I> of the
691     * vectorized-HTML page.
692     * 
693     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B>
694     * in the article, specified by the regular-expression pattern-matching parameters, and gets it.
695     */
696    public static ArticleGet usual(Pattern startPattern, Pattern endPattern)
697    {
698        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
699        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
700        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
701
702        if (startPattern == null) throw new NullPointerException
703            ("Null has been passed to parameter 'startPattern', but this is not allowed here.");
704
705        if (endPattern == null) throw new NullPointerException
706            ("Null has been passed to parameter 'endPattern', but this is not allowed here.");
707
708
709        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
710        // Build the instance, using a lambda-expression
711        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
712
713        return (URL url, Vector<HTMLNode> page) ->
714        {
715            // This exception-check is done on every invokation of this Lambda-Function.
716            // It is merely checking that these inputs are not-null, and page is of non-zero size.
717            // ArticleGetException is a compile-time, checked exception.  It is important to halt
718            // News-Site Scrape Progress when "Empty News-Page Data" is being passed here.
719            // NOTE: This would imply an internal-error with class Download has occured.
720
721            ArticleGetException.check(url, page);
722            int         start   = -1;
723            int         end     = -1;
724            HTMLNode    n       = null;
725
726            while (start++ < page.size())
727                if ((n = page.elementAt(start)) instanceof TextNode)
728                    if (startPattern.matcher(n.str).find())
729                        break;
730
731            while (end++ < page.size())
732                if ((n = page.elementAt(end)) instanceof TextNode)
733                    if (endPattern.matcher(n.str).find())
734                        break;
735
736            // These error-checks are used to deduce whether the "Article Get" was successful.
737            // When this exception is thrown, it means that the user-specified means of "Retrieving
738            // an Article Body" FAILED.  In this case it is because the start or end regex failed to
739            // match.
740
741            if (start == page.size()) throw new ArticleGetException(
742                "Start Pattern [" + startPattern.toString() + "], was not found on the HTML " +
743                "page."
744            );
745
746            if (end == page.size()) throw new ArticleGetException
747                ("End Pattern [" + endPattern.toString() + "], was not found on the HTML page.");
748
749            return Util.cloneRange(page, start, end + 1);
750        };
751    }
752
753    /**
754     * <I>This is a static, factory method for building ArticleGet.</I>
755     *
756     * This is just a way to put a list of article-parse objects into a single "branching"
757     * article-parse {@code Object}.  The two parameters must be equal-length arrays, with non-null
758     * elements.  Each {@code 'urlSelector'} will be tested, and when a selector passes, the
759     * {@code ArticleGet} that is created will use the "parallel getter" from the parallel array
760     * "getters."
761     *
762     * <BR /><BR /><B>LAY-SPEAK:</B> The best way to summarize this is if a programmer is going to
763     * use the {@code NewsSiteScrape} class, and planning to scrape a site that has different types
764     * of news-articles, he will need differing {@code "ArticleGet"} methods.  This class will take
765     * two {@code array's} that match the {@code URL} from which the article was retrieved with the
766     * particular "getter" method you have provided.  When I scrape the address:
767     * {@code http://www.baidu.com/} - a Chinese News Web-Site, it links to at least three primary
768     * domains:
769     *
770     * <BR /><BR /><OL CLASS=JDOL>
771     * <LI>{@code http://...chinesenews.com/director.../article...}</LI>
772     * <LI>{@code http://...xinhuanet.com/director.../article...}</LI>
773     * <LI>{@code http://...cctv.com/director.../article...}</LI>
774     * </OL>
775     *
776     * <BR />Results from each of these sites need to be "handled" just ever-so-slightly different.
777     * 
778     * @param urlSelectors This is a list of {@code Predicate<URL>} elements.  When one of these
779     * returns {@code TRUE} for a particular {@code URL}, then the index of that
780     * {@code URL}-selector in it's {@code array} will be used to call the appropriate getter from
781     * the parallel-{@code array} input-parameter {@code 'getters'}.
782     * 
783     * @param getters This is a list of getter elements.  These should be tailored to the
784     * particular news-website source that are chosen/selected by the {@code 'urlSelectors'}
785     * parallel {@code array}.
786     * 
787     * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."
788     * All it does is simply traverse the first {@code array} looking for a
789     * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the
790     * parallel {@code array}.
791     *
792     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
793     * <B><I>"dispatch"</B></I> or rather <B><I>"branch"</I></B> is called by {@code class 
794     * NewsSiteScrape}, the function/getter that is returned will throw an 
795     * {@code ArticleGetException}.  It is important that the programmer only allow article
796     * {@code URL's} that he can capably handled to pass to {@code class NewsSiteScrape}.
797     *
798     * @throws IllegalArgumentException Will throw this exception if:
799     * 
800     * <BR /><BR /><UL CLASS=JDUL>
801     * <LI>Either of these parameters are null</LI>
802     * <LI>If they are not parallel, with differing lengths.</LI>
803     * <LI>If either contain a null value.</LI>
804     * </UL>
805     */
806    public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters)
807    {
808        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
809        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
810        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
811
812        if (urlSelectors.length == 0) throw new IllegalArgumentException
813            ("parameter 'urlSelectors' had zero-elements.");
814
815        if (getters.length == 0) throw new IllegalArgumentException
816            ("parameter 'getters' had zero-elements.");
817
818        ParallelArrayException.check(urlSelectors, "urlSelectors", true, getters, "getters", true);
819
820
821        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
822        // Build the instance, using a lambda-expression
823        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
824
825        return (URL url, Vector<HTMLNode> page) ->
826        {
827            for (int i=0; i < urlSelectors.length; i++)
828                if (urlSelectors[i].test(url))
829                    return getters[i].apply(url, page);
830
831            throw new ArticleGetException(
832                "None of the urlSelecctors you have provided matched the URL sent to this " +
833                "instance of ArticleGet."
834            );
835        };            
836    }
837
838
839    // ******************************************************************************************
840    // Other Methods
841    // ******************************************************************************************
842
843    /**
844     * This is the standard-java {@code Function 'andThen'} method.
845     *
846     * @param after This is the {@code ArticleGet} that will be (automatically) applied after
847     * {@code 'this'} function. 
848     *
849     * @return A new, composite {@code ArticleGet} that performs both operations. It will:
850     *
851     * <BR /><BR /><OL CLASS=JDOL>
852     * <LI> Run {@code 'this'} function's {@code 'apply'} method to a
853     *      {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}.
854     *      <BR /><BR />
855     *      </LI>
856     * <LI> Then it will run the {@code 'after'} function's {@code 'apply'} method to the
857     *      results of {@code 'this.apply(...)'} and return the result.
858     *      <BR />
859     *      </LI>
860     * </OL>
861     */
862    default ArticleGet andThen(ArticleGet after)
863    { return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page)); }
864
865    /**
866     * This is the standard-java {@code Function 'compose'} method.
867     * 
868     * @param before This is the {@code ArticleGet} that is performed first, whose results are
869     * sent to {@code 'this'} function.
870     * 
871     * @return A new composite {@code ArticleGet} that performs both operations.
872     * It will:
873     * 
874     * <BR /><BR /><OL CLASS=JDOL>
875     * <LI> Run the {@code 'before'} function's {@code 'apply'} method to a
876     *      {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}.
877     *      </LI>
878     * <LI> Then it will run {@code 'this'} function's {@code 'apply'} method to the
879     *      results of the {@code before.apply(...)} and return the result.
880     *      </LI>
881     * </OL>
882     */
883    default ArticleGet compose(ArticleGet before)
884    { return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page)); }
885
886    /**
887     * The identity function will always return the same {@code Vector<HTMLNode>} as output that
888     * it receives as input.  This is one of the {@code default} Java's lambda-methods.
889     * 
890     * @return a new {@code ArticleGet} which (it should be obvious) is of type:
891     * {@code java.util.function.Function<Vector<HTMLNode>, Vector<HTMLNode>>}
892     * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to
893     * the input {@code Vector}.</I>
894     */
895    static ArticleGet identity()
896    {
897        return (URL url, Vector<HTMLNode> page) ->
898        {
899            ArticleGetException.check(url, page);
900            return page;
901        };
902    }
903
904    // Internally used "Helper Method"
905    /** Internally Used.  */
906    static String STR_FORMAT_TC_PARAMS(TextComparitor tc, String... compareStrings)
907    {
908        String tcName = TextComparitor.getName(tc);
909
910        String ret = (tcName != null)
911            ? "TextComparitor." + tcName + ", "
912            : "TextComparitor.(Anonymous-TC), ";
913
914        for (int i=0; i < compareStrings.length; i++)
915        {
916            String str = compareStrings[i];
917
918            if ((ret.length() + str.length()) > 120)
919            {
920                ret += "\"" + (str.substring(0, str.length() - (120 - ret.length())) + "...");
921                break;
922            }
923            else ret += "\"" + str;
924
925            ret += (i < (compareStrings.length - 1)) ? "\", " : "\")";
926        }
927
928        return ret;
929    }
930}