package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.NewsSite.*;
import Torello.Java.*;

import Torello.Languages.LC;

import java.util.*;
import java.util.regex.*;

import java.net.URL;
import java.io.*;

/**
 * This class is nothing more than an 'Example Class' that contains some foreign-language
 * news web-pages, from both overseas and from Latin America.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES>
 */
public class NewsSites
{
    private NewsSites() { }

    @SuppressWarnings("unchecked")
    private static final Hashtable<String, Vector<URL>> newsPaperSections =
        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
            (NewsSite.class, "data-files/SectionURLs.htdat", true, Hashtable.class);


    /**
     * This example will run the news-site scrape on the Chinese Government News Article
     * Carousel.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This method will
     * create a directory called <B>"cnb"</B> on your file-system where it will write the
     * contents of (most likely) 15 news-paper articles to disk as HTML files.
     *
     * The output log generated by this method may be viewed here:
     * <BR /><BR /><B><CODE><A HREF='doc-files/Logs/Gov.CN.log.html'>
     * Gov.CN.log.html</A></CODE></B>
     *
     * @throws IOException This throws for IO errors that may occur when reading the
     * web-server, or when saving the web-pages or images to the file-system.
     *
     * @see FileRW#delTree(String, boolean, Appendable)
     * @see NewsSite
     * @see FileRW#writeFile(CharSequence, String)
     * @see C#toHTML(String, boolean, boolean, boolean)
     */
    public static void runExample() throws IOException
    {
        StorageWriter log = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.

        final String dataFilesDir = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String htmlFilesDir = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all of its
        // contents). The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData

        FileRW.delTree(dataFilesDir, true, log);

        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML

        FileRW.delTree(htmlFilesDir, true, log);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Previous Download Data Erased (if any), Start today's News-Site Scrape
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles
        // on the Java-Script "Article Carousel". Again, the "Article Carousel" is just the
        // little widget at the top of the page that rotates (usually) five highlighted /
        // emphasized news-article links for today.

        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // The "Pause" mechanism is usually not very important if only a small number of
        // articles are being scraped. When downloading hundreds of articles, being able to
        // pause (and restart) if there is a web-site IOError is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the
        // file-system for saving the "Download State" while downloading...

        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();

        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir".
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data. Writing a customized "ScrapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base, up to and including e-mailing article
        // data, is also possible.

        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);
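        // As a point of illustration only: a custom receiver could forward each article
        // anywhere (a Data-Base, an e-mail, etc.) instead of to the file-system. The
        // commented sketch below is HYPOTHETICAL - the actual method name and signature of
        // "ScrapedArticleReceiver" must be checked against the library's JavaDoc before use.
        //
        // ScrapedArticleReceiver custom = (int sectionNum, int articleNum, Article article) ->
        //     System.out.println("Section #" + sectionNum + ", Article #" + articleNum);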
        // This will download each of the articles from its web-page URL. The web-page
        // article URL's were retrieved by "ScrapeURLs". The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step). These news
        // articles are saved as ".dat" files, since they are serialized java-objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true:  [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that
        //        do not include at least one photo. Photos usually help when reading foreign
        //        news articles.
        // null:  [bannerAndAdFinder] Some sites include images for Facebook links or
        //        advertising. Gov.CN usually doesn't have these, but occasionally there are
        //        extraneous links. For the purposes of this example, this parameter is
        //        ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - the content present before the
        //        Article Body is extracted from the Article Web-Page - is not saved. Keeping
        //        it can occasionally be useful if the HTML <HEAD>...</HEAD> has JSON or
        //        React-JS data to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and it will
        // also download the pictures / images included in each article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML removal operations. The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class
        //       "ToHTML".
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - like advertising or links to other articles - is usually
        //       necessary. Anywhere between 1 and 10 lines of NodeSearch Removal Operations
        //       will get rid of the unnecessary HTML. For the purposes of this example, such
        //       a cleaning operation is not done here - although the final articles do
        //       include some "links to other articles" content that is not "CLEANED" like it
        //       should be. (A minimal removal sketch appears in the helper method directly
        //       after this one.)

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed via the
        // link in the JavaDoc Comments at the top of this method. If this method is run in
        // an MS-DOS or Windows Environment, there will be no screen colors available to
        // view.

        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
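    /**
     * A minimal sketch of the kind of "NodeSearch Removal Operations" mentioned in
     * {@link #runExample()} above. This helper is <I>not</I> part of the original pipeline;
     * it only illustrates clearing unwanted page-ranges (here, advertising assumed to be
     * wrapped in {@code <DIV CLASS="ad">...</DIV>}) out of a page-{@code Vector}. The CSS
     * class name {@code "ad"} is a hypothetical placeholder; a real cleaner would name the
     * classes actually used by the target site.
     */
    private static void exampleRemovalSketch(Vector<HTMLNode> page)
    {
        // Collect every <DIV CLASS="ad"> ... </DIV> range on the page.
        ArrayList<DotPair> ads = new ArrayList<>();

        for (DotPair dp : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "ad"))

            ads.add(dp);

        // Clear the ranges in reverse order, so that earlier DotPair indices remain valid
        // while later ranges are removed. This assumes DotPair.end is an inclusive index,
        // matching its use with TagNodeGet.first(page, article.start, article.end, ...)
        // elsewhere in this file.
        for (int i = ads.size() - 1; i >= 0; i--)
        {
            DotPair dp = ads.get(i);
            page.subList(dp.start, dp.end + 1).clear();
        }
    }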
    /**
     * Prints the contents of the Data File. Invoking this command allows a programmer to see
     * which "sub-sections" are ascribed to each of the different news-paper definitions in
     * this class. Each "sub-section" is nothing more than a {@code URL}-branch of the primary
     * web site {@code URL}.
     *
     * <DIV CLASS="HTML">{@code
     * <!-- If the following were the primary news-site -->
     * http://news.baidu.com
     *
     * <!-- This would be a "sub-section" of the primary site -->
     * http://news.baidu.com/sports
     * }</DIV>
     *
     * <BR /><BR />This method can be called from the command line.
     * <BR /><BR />If a single command-line argument is passed to {@code "argv[0]"}, the
     * contents of the "Sections URL Data File" will be output to a text-file that is named
     * using the {@code String} passed to {@code "argv[0]"}.
     *
     * @param argv These are the command line arguments passed by the JRE to this method.
     * @throws IOException If there are any problems while attempting to save the output to
     * the output file (if one was named / requested).
     */
    public static void main(String[] argv) throws IOException
    {
        // Uncomment this line to run the example code (instead of the section-data print)
        // runExample(); System.exit(0);

        // The data-file is loaded into private field "newsPaperSections".
        // This private field is a Hashtable<String, Vector<URL>>. Convert each of these
        // sections so that they may be printed to terminal, and possibly to a text file.

        StringBuilder sb = new StringBuilder();

        for (String newspaper : newsPaperSections.keySet())
        {
            sb.append(newspaper + '\n');
            for (URL section : newsPaperSections.get(newspaper))
                sb.append(section.toString() + '\n');
            sb.append("\n\n***************************************************\n\n");
        }

        String s = sb.toString();
        System.out.println(s);

        // If there is a command-line parameter, it shall be interpreted as a file-name.
        // The contents of the "sections data-file" (as text) will be written to a file on
        // the file-system using the String-value of "argv[0]" as the output-filename.

        if (argv.length == 1) FileRW.writeFile(s, argv[0]);
    }

    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$"))
    // ArticleGet.usual(TextComparitor.CN_CI, "article-content")

    /**
     * The News Site at address: <CODE><A HREF="https://www.abc.es/" TARGET=_blank>
     * "https://www.abc.es/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<ARTICLE>...</ARTICLE>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code article a}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'Article'} Elements.
     *
     * @see TagNodeFindL1Inclusive#all(Vector, String)
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        TagNode tn;
        String urlStr;

        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a"))
                != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.abc.es/" TARGET=_blank>https://www.abc.es/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>ABC España</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Spain</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://abc.es}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code 'HREF'} must end with {@code '.html'}
     *          <BR />See: {@link StrFilter#comparitor(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#EW_CI}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #ABC_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <MAIN>...</MAIN>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}
     * instance.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ABC.ES-ScrapeURLs.html'>
     *      ABC.ES ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> {@code ScrapeArticles}
     *      <BR /><B>IMPORTANT NOTE:</B> Though the {@code ScrapeURLs} code <I>will check for
     *      duplicate {@code URL's}</I> that may be returned <I>within any given section</I>,
     *      {@code Article URL's} may be repeated among the different sections of the
     *      newspaper. Since the {@code URL}-scrape returned nearly 3,000 articles, the log
     *      of an {@code Article} scrape is not included here. Proper duplicate-{@code URL}
     *      checking code has obviously been written, but would be too complicated to show in
     *      this example. (A minimal cross-section de-duplication sketch appears after this
     *      {@code NewsSite} definition.)
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ABCES = new NewsSite
    (
        "ABC España", Country.Spain, "https://www.abc.es/", LC.ES,
        "ABC is a Spanish national daily newspaper. It is the third largest general-interest " +
        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
        newsPaperSections.get("ABCES"),
        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
        NewsSites::ABC_LINKS_GETTER,
        ArticleGet.usual("main"),
        null /* bannerAndAdFinder */
    );
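    /**
     * A minimal sketch (not part of the original example) of the cross-section duplicate-URL
     * filtering referenced in the {@link #ABCES} notes above. {@code ScrapeURLs.get(...)}
     * returns one {@code Vector<String>} of Article-Links per section; the helper below drops
     * any link already seen in an earlier section, while preserving the section structure.
     */
    private static Vector<Vector<String>> removeCrossSectionDuplicates
        (Vector<Vector<String>> articleURLs)
    {
        Set<String>            seen = new HashSet<>();
        Vector<Vector<String>> ret  = new Vector<>();

        for (Vector<String> section : articleURLs)
        {
            Vector<String> kept = new Vector<>();

            for (String url : section)
                if (seen.add(url))  // add(..) returns false if the URL was already present
                    kept.add(url);

            ret.add(kept);
        }

        return ret;
    }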
    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elpulso.mx/" TARGET=_blank>
     * https://www.elpulso.mx/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Pulso, México</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>México</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elpulso.mx}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code http://some.domain/YYYY/MM/DD/<article-name>/}</TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>. Retrieves <B><I>all</I></B> Anchor-Links on a
     *          Section-Page. Note that {@code URL's} must still pass the previous
     *          {@code StrFilter} (above) in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="entry-content">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     */
    public static final NewsSite Pulso = new NewsSite
    (
        "El Pulso, México", Country.Mexico, "https://elpulso.mx", LC.ES,
        "El Pulso is a Spanish-language newspaper in Mexico. It covers breaking news, " +
        "headlines, kids' news, tourism, entertainment, education, industry, the economy, " +
        "health & beauty, crime, careers, travel, diet & fitness, top stories, special " +
        "reports, and celebrity news.",
        newsPaperSections.get("PULSO"),
        StrFilter.regExKEEP(Pattern.compile(
            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "entry-content"),
        null /* bannerAndAdFinder */
    );
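    /**
     * A quick, standalone illustration (not part of the original example) of the
     * {@link #Pulso} Article-Link pattern above. The {@code URL's} mentioned in the comment
     * below are hypothetical; they simply do (or do not) have the
     * {@code /YYYY/MM/DD/<article-name>/} shape that the filter keeps.
     */
    private static boolean matchesPulsoPattern(String urlStr)
    {
        // Same Regular-Expression as in the "Pulso" definition above. A URL such as
        // "https://elpulso.mx/2021/05/14/some-article-name/" matches, while a section page
        // such as "https://elpulso.mx/deportes/" does not (no date, no article-name token).
        return Pattern.matches
            ("^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$", urlStr);
    }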
    /**
     * The News Site at address: <CODE><A HREF="https://www.ElNacional.com/" TARGET=_blank>
     * "https://www.ElNacional.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="td-module-thumb">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div.td-module-thumb a}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'DIV'} Elements where said
     * Divider's CSS {@code CLASS} contains {@code 'td-module-thumb'}.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        TagNode tn;
        String urlStr;

        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main /
        // section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "td-module-thumb"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first
                (page, article.start, article.end, TC.OpeningTags, "a")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elnacional.com/" TARGET=_blank>
     * https://www.elnacional.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Nacional</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Venezuela</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elnacional.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link URLFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD><B>{@code null}</B>. The {@code LinksGet} provided here will only return
     *          valid {@code Article URL's}, so there is no need for a {@code URLFilter}.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_NACIONAL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeURLs.html'>
     *      El Nacional ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeArticles.html'>
     *      El Nacional ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElNacional = new NewsSite
    (
        "El Nacional", Country.Venezuela, "https://elnacional.com", LC.ES,
        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial " +
        "El Nacional, most widely known for its El Nacional newspaper and website. It, " +
        "along with Últimas Noticias and El Universal, are the most widely read and " +
        "circulated daily national newspapers in the country, and it has an average of more " +
        "than 80,000 papers distributed daily and 170,000 copies on weekends.",
        newsPaperSections.get("ElNacional"),
        (URLFilter) null, /* The LinksGetter will only return valid Anchor's */
        NewsSites::EL_NACIONAL_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElEspectador.com/" TARGET=_blank>
     * "https://www.ElEspectador.com/"</A></CODE> is slightly more complicated when
     * retrieving News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="Card ...">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div.Card a.card-link}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements whose CSS {@code CLASS} contains {@code 'card-link'} and
     * which are descendants of {@code 'DIV'} Elements where said Divider's CSS {@code CLASS}
     * contains {@code 'Card'}.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        TagNode tn;
        String urlStr;

        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "Card"))

            // Now find the <A CLASS="card-link" HREF=...> ... </A>
            if ((tn = InnerTagGet.first
                (page, article.start, article.end, "a", "class", TextComparitor.C,
                 "card-link")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elespectador.com/" TARGET=_blank>
     * https://www.elespectador.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Espectador</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Colombia</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elespectador.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must end with a forward-slash {@code '/'} character.
     *          <BR />See: {@link TextComparitor#ENDS_WITH}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_ESPECTADOR_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeURLs.html'>
     *      El Espectador ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeArticles.html'>
     *      El Espectador ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElEspectador = new NewsSite
    (
        "El Espectador, Colombia", Country.Colombia, "https://elespectador.com", LC.ES,
        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation " +
        "within Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and " +
        "published since 1915 in Bogotá. It changed from a daily to a weekly edition in " +
        "2001, following a financial crisis, and became a daily again on 11 May 2008, a " +
        "comeback which had been long rumoured, in tabloid format (28 x 39.5 cm). From 1997 " +
        "to 2011 its main shareholder was Julio Mario Santo Domingo.",
        newsPaperSections.get("ElEspectador"),
        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );
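    /**
     * A condensed sketch (not part of the original example) showing how <I>any</I> of the
     * {@code NewsSite} definitions in this class could be plugged into the same pipeline
     * used by {@link #runExample()}. It re-uses the exact calls from that method,
     * parameterized by news-site and output directory. The directory layout and log
     * file-name used here are illustrative choices, not library requirements.
     *
     * <BR /><BR />For instance: {@code scrapeNewsSite(NewsSites.ElEspectador, "ees");}
     */
    private static void scrapeNewsSite(NewsSite ns, String baseDir) throws IOException
    {
        StorageWriter log = new StorageWriter();

        final String dataFilesDir = baseDir + File.separator + "articleData" + File.separator;
        final String htmlFilesDir = baseDir + File.separator + "articleHTML" + File.separator;

        // Clear any previously downloaded data, exactly as runExample() does.
        FileRW.delTree(dataFilesDir, true, log);
        FileRW.delTree(htmlFilesDir, true, log);

        // Retrieve the Article-Links for every section of the news-site.
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // File-System based "Download State", for pausing & restarting long scrapes.
        Pause pause = Pause.getFSInstance(baseDir + File.separator + "state.dat");
        pause.initialize();

        // Save each scraped article to the file-system, as in runExample().
        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // Same unnamed parameters as runExample(): skipArticlesWithoutPhotos=true,
        // bannerAndAdFinder=null, keepOriginalPageHTML=false.
        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        FileRW.writeFile
            (C.toHTML(log.getString(), true, true, true),
             baseDir + File.separator + "log.html");
    }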
    /**
     * The News Site at address: <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * "https://www.gov.cn/"</A></CODE> has a Java-Script "Links Carousel". Essentially,
     * there is a section with "Showcased News Articles" that is intended to emphasize
     * anywhere between four and eight primary articles.
     *
     * <BR /><BR />This Links-Carousel is wrapped in an HTML Divider Element as below:
     * {@code <DIV CLASS="slider-carousel">}.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div[class=slider-carousel] a}</B>. Specifically, it says to find
     * all {@code 'Anchor'} elements that are descendants of
     * {@code '<DIV CLASS="slider-carousel">'} Elements.
     *
     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#all(Vector, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        String urlStr;

        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");

        // If the carousel is not present on the page, there are no links to return.
        if (carouselDIV == null) return ret;

        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
        // Divider.

        for (TagNode tn : TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
            if ((urlStr = tn.AV("href")) != null)
                ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />The "Carousels" are just the emphasized or "Highlighted" links that appear
     * on three separate pages. There is a complete-link {@code NewsSite} definition that
     * will retrieve all links - <I>not just the links highlighted by the carousel.</I>
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #GOVCN_CAROUSEL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeURLs.html'>
     *      Gov.CN Carousel ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeArticles.html'>
     *      Gov.CN Carousel ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCNCarousel = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCNCarousel"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
            "content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />This version of the "Gov.CN" definition will scour a larger set of section
     * {@code URL's}, and will not limit the returned Article-Links to just those found on
     * the Java-Script carousel. The Java-Script Carousel will almost always have a total of
     * five news-article links available; this definition of {@code 'NewsSite'} may return
     * thirty to forty different articles per news-section.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>. Retrieves <B><I>all</I></B> Anchor-Links on a
     *          Section-Page. Note that {@code URL's} must still pass the previous
     *          {@code StrFilter} (above) in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeURLs.html'>
     *      Gov.CN ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeArticles.html'>
     *      Gov.CN ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCN = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCN"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
            "content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );
}