package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.NewsSite.*;
import Torello.Java.*;

import Torello.Languages.LC;

import java.util.*;
import java.util.regex.*;

import java.net.URL;
import java.io.*;

/**
 * This class is nothing more than an 'Example Class' containing {@code NewsSite} definitions
 * for several foreign-language news web-sites, from both overseas and from Latin America.
 * 
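 * <BR /><BR />A quick way to inspect the data-file that backs these definitions is to run this
 * class' {@link #main(String[])} method, which prints the Section {@code URL}-lists for every
 * newspaper, and (optionally) writes that listing to a text-file named by the first
 * command-line argument.  A sketch of the invocation, assuming this library's JAR is already
 * on the class-path, and where the output file-name {@code 'Sections.txt'} is arbitrary:
 * 
 * <BR /><PRE>{@code
 * java Torello.HTML.Tools.NewsSite.NewsSites Sections.txt
 * }</PRE>
 * 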
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES>
 */
public class NewsSites
{
    private NewsSites() { }

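    // This table is loaded from a serialized data-file inside the JAR.  Each key is a short
    // newspaper identifier (for instance "ABCES", "PULSO" or "GovCN"), and each value is the
    // list of Section-Page URL's that will be scraped for Article-Links.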
    @SuppressWarnings("unchecked")
    private static final Hashtable<String, Vector<URL>> newsPaperSections =
        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
        (Torello.Data.DataFileLoader.class, "data17.htdat", true, Hashtable.class);

    /**
     * This example will run the news-site scrape on the Chinese Government News Article
     * Carousel.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This method will
     * create a directory called <B>"cnb"</B> on your file-system where it will write the
     * contents of (most likely) 15 news-paper articles to disk as HTML files.
     *
     * The output log generated by this method may be viewed here:
     * <BR /><BR /><B><CODE><A HREF='doc-files/Logs/Gov.CN.log.html'>
     * Gov.CN.log.html</A></CODE></B>
     *
     * @throws IOException This throws for IO errors that may occur when reading from the
     * web-server, or when saving the web-pages or images to the file-system.
     * 
     * @see FileRW#delTree(String, boolean, Appendable)
     * @see NewsSite
     * @see FileRW#writeFile(CharSequence, String)
     * @see C#toHTML(String, boolean, boolean, boolean)
     */
    public static void runExample() throws IOException
    {
        StorageWriter   log             = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.
        final String    dataFilesDir    = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String    htmlFilesDir    = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all its contents)
        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData
        FileRW.delTree(dataFilesDir, true, log);

        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML
        FileRW.delTree(htmlFilesDir, true, log);

        // *****************************************
        // Previous Download Data Erased (if any)
        // Start today's News-Site Scrape
        // *****************************************

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles
        // on the Java-Script "Article Carousel".  Again, the "Article Carousel" is just this
        // little widget at the top of the page that rotates (usually) five highlighted /
        // emphasized news-article links for today
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // This is usually not very important if only a small number of articles are being
        // scraped.  When downloading hundreds of articles - being able to pause if there is a
        // web-site IOError (and restart) is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the file-system
        // for saving the "Download State" while downloading...

        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();

        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data.  Writing a customized "ScrapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base up to and including e-mailing article data
        // is possible using a self-written "ScrapedArticleReceiver"
        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // This will download each of the articles from their web-page URL.  The web-page
        // article URL's were retrieved by "ScrapeURLs".  The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step).  These news articles
        // are saved as ".dat" files, since they are serialized java-objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that do
        //       not include at least one photo.  Photos usually help when reading foreign news
        //       articles.
        // null: [bannerAndAdFinder] Some sites include images for Facebook links or advertising.
        //       Gov.CN usually doesn't have these, but occasionally there are extraneous links.
        //       For the purposes of this example, this parameter is ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - the content as it was before the
        //        Article-Body was extracted from the Article Web-Page - is not saved.  This can
        //        occasionally be useful if the HTML <HEAD>...</HEAD> has JSON or React-JS data
        //        to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and it will also
        // download the pictures / images included in the article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML remove operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class "ToHTML"
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - like advertising or links to other articles - is usually
        //       necessary.  Anywhere between 1 and 10 lines of NodeSearch Removal Operations
        //       will get rid of unnecessary HTML.  For the purposes of this example, such a
        //       cleaning operation is not done here - although the final articles do include
        //       some "links to other articles" that are not "CLEANED" as they should be.

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed in the
        // JavaDoc Comments at the top of this method.  If this method is run in an MS-DOS
        // or Windows Environment, there will be no screen colors available to view.
        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }

    /**
     * Prints the contents of the Data File.  Invoking this command allows a programmer to see
     * which "sub-sections" are ascribed to each of the different news-paper definitions in this
     * class.  Each "sub-section" is nothing more than a {@code URL}-branch of the primary web
     * site {@code URL}.
     *
     * <DIV CLASS="HTML">{@code
     * <!-- If the following were the primary news-site -->
     * http://news.baidu.com
     * 
     * <!-- This would be a "sub-section" of the primary site -->
     * http://news.baidu.com/sports
     * }</DIV>
     *
     * <BR /><BR />Can be called from the command line.
     * <BR /><BR />If a single command-line argument is passed to {@code "argv[0]"}, the contents
     * of the "Sections URL Data File" will be output to a text-file that is named using the
     * {@code String} passed to {@code "argv[0]"}.
     *
     * @param argv These are the command line arguments passed by the JRE to this method.
     * @throws IOException If there are any problems while attempting to save the output to
     * the output file (if one was named / requested).
     */
    public static void main(String[] argv) throws IOException
    {
        // Uncomment this line to run the example code (instead of the section-data print)
        // runExample(); System.exit(0);

        // The data-file is loaded into private field "newsPaperSections"
        // This private field is a Hashtable<String, Vector<URL>>.  Convert each of
        // these sections so that they may be printed to terminal and, optionally, to a
        // text file.
        StringBuilder sb = new StringBuilder();
        for (String newspaper : newsPaperSections.keySet())
        {
            sb.append(newspaper + '\n');
            for (URL section : newsPaperSections.get(newspaper))
                sb.append(section.toString() + '\n');
            sb.append("\n\n***************************************************\n\n");
        }
        
        String s = sb.toString();
        System.out.println(s);
        
        // If there is a command-line parameter, it shall be interpreted as a file-name.
        // The contents of the "sections data-file" (as text) will be written to a file on the
        // file-system using the String-value of "argv[0]" as the name of the output-file.
        if (argv.length == 1) FileRW.writeFile(s, argv[0]);
    }

    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$")));
    // ArticleGet.usual(TextComparitor.CN_CI, "article-content"));

    /**
     * The News Site at address: <CODE><A HREF="https://www.abc.es/" TARGET=_blank>
     * "https://www.abc.es/"</A></CODE> is slightly more complicated when retrieving News-Article
     * Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<ARTICLE>...</ARTICLE>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code article a}</B>.  Specifically it says to find all {@code 'Anchor'} elements
     * that are descendants of {@code 'Article'} Elements.
     * 
     * @see TagNodeFindL1Inclusive#all(Vector, String)
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;

        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first
                (page, article.start, article.end, TC.OpeningTags, "a")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.abc.es/" TARGET=_blank>https://www.abc.es/</A></CODE>.
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>ABC España</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>Spain</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://abc.es}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code 'HREF'} must end with {@code '.html'}
     *          <BR />See: {@link StrFilter#comparitor(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#EW_CI}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #ABC_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <MAIN>...</MAIN>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     * 
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite} instance.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ABC.ES-ScrapeURLs.html'>
     *      ABC.ES ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> {@code ScrapeArticles}
     *      <BR /><B>IMPORTANT NOTE:</B> Though the {@code ScrapeURLs} code <I>will check for
     *      duplicate {@code URL's}</I> that may be returned <I>within any given-section</I>,
     *      {@code Article URL's} may be repeated among the different sections of the newspaper.
     *      Since the {@code URL}-scrape returned nearly 3,000 articles, the log of
     *      an {@code Article} scrape is not included here.  Proper duplicate {@code URL} checking
     *      code has obviously been written, but would be too complicated to show in this example.
     *     </LI>
     * </UL>
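     * 
     * <BR /><BR />Below is a minimal sketch of using this definition to retrieve today's
     * Article-Link {@code URL's}.  This is the same call made inside {@link #runExample()},
     * which uses the {@link #GovCNCarousel} definition instead:
     * 
     * <BR /><PRE>{@code
     * StorageWriter log = new StorageWriter();
     * 
     * // Retrieve every available Article-Link from the ABC.ES sections in the data-file
     * Vector<Vector<String>> articleURLs = ScrapeURLs.get(NewsSites.ABCES, log);
     * }</PRE>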
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ABCES = new NewsSite
    (
        "ABC España", Country.Spain, "https://www.abc.es/", LC.ES,
        "ABC is a Spanish national daily newspaper.  It is the third largest general-interest " +
        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
        newsPaperSections.get("ABCES"),
        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
        NewsSites::ABC_LINKS_GETTER,
        ArticleGet.usual("main"),
        null /* bannerAndAdFinder */
    );

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elpulso.mx/" TARGET=_blank>https://www.elpulso.mx/</A></CODE>.
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>El Pulso, México</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>México</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://elpulso.mx}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match: {@code http://some.domain/YYYY/MM/DD/<article-name>/}</TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
     *          in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="entry-content">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     */
    public static final NewsSite Pulso = new NewsSite
    (
        "El Pulso, México", Country.Mexico, "https://elpulso.mx", LC.ES,
        "El Pulso is a Spanish-language newspaper in Mexico.  It covers breaking news, " +
        "headlines, kids news, tourism news, entertainment news, study news, industrial news, " +
        "economic news, health & beauty news, crime news, career news, travel news, " +
        "diet & fitness news, top stories, special news and celebrity news.",
        newsPaperSections.get("PULSO"),
        StrFilter.regExKEEP(Pattern.compile(
            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "entry-content"),
        null /* bannerAndAdFinder */
    );
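
    // For illustration: the Regular-Expression filter above would keep a (hypothetical) link
    // such as "https://elpulso.mx/2023/09/14/nombre-del-articulo/" - it contains the required
    // /YYYY/MM/DD/ date-path, a hyphenated article-name, and a trailing forward-slash.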

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElNacional.com/" TARGET=_blank>
     * "https://www.ElNacional.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="td-module-thumb">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div.td-module-thumb a}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'DIV'} Elements where said
     * Divider's CSS {@code CLASS} contains {@code 'td-module-thumb'}.
     * 
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;

        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "td-module-thumb"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first
                (page, article.start, article.end, TC.OpeningTags, "a")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elnacional.com/" TARGET=_blank>
     * https://www.elnacional.com/</A></CODE>.
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>El Nacional</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>Venezuela</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://elnacional.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link URLFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD><B>{@code null}</B>.  The {@code LinksGet} provided here will only return valid
     *          {@code Article URL's}, so there is no need for a {@code URLFilter}.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_NACIONAL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     * 
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeURLs.html'>
     *      El Nacional ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeArticles.html'>
     *      El Nacional ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElNacional = new NewsSite
    (
        "El Nacional", Country.Venezuela, "https://elnacional.com", LC.ES,
        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial El Nacional, " +
        "most widely known for its El Nacional newspaper and website. It, along with Últimas " +
        "Noticias and El Universal, are the most widely read and circulated daily national " +
        "newspapers in the country, and it has an average of more than 80,000 papers distributed " +
        "daily and 170,000 copies on weekends.",
        newsPaperSections.get("ElNacional"),
        (URLFilter) null, /* The LinksGet will only return valid Anchors */
        NewsSites::EL_NACIONAL_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElEspectador.com/" TARGET=_blank>
     * "https://www.ElEspectador.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="Card ...">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div.Card a.card-link}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements whose CSS {@code Class} contains {@code 'card-link'} and which
     * are descendants of {@code 'DIV'} Elements where said Divider's
     * CSS {@code CLASS} contains {@code 'Card'}.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;

        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "Card"))

            // Now find the <A CLASS="card-link" HREF=...> ... </A>
            if ((tn = InnerTagGet.first
                (page, article.start, article.end, "a", "class",
                 TextComparitor.C, "card-link")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elespectador.com/" TARGET=_blank>
     * https://www.elespectador.com/</A></CODE>.
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>El Espectador</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>Colombia</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://elespectador.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must end with a forward-slash {@code '/'} character.
     *          <BR />See: {@link TextComparitor#ENDS_WITH}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_ESPECTADOR_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}
     *          <BR />See: {@link ArticleGet#usual(String)}
     *      </TD>
     * </TR>
     * </TABLE>
     * 
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeURLs.html'>
     *      El Espectador ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeArticles.html'>
     *      El Espectador ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElEspectador = new NewsSite
    (
        "El Espectador, Colombia", Country.Colombia, "https://elespectador.com", LC.ES,
        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation within " +
        "Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and published " +
        "since 1915 in Bogotá. It changed from a daily to a weekly edition in 2001, following a " +
        "financial crisis, and became a daily again on 11 May 2008, a comeback which had been " +
        "long rumoured, in tabloid format (28 x 39.5 cm). From 1997 to 2011 its main shareholder " +
        "was Julio Mario Santo Domingo.",
        newsPaperSections.get("ElEspectador"),
        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );


    /**
     * The News Site at address: <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * "https://www.gov.cn/"</A></CODE> has a Java-Script "Links Carousel".  Essentially, there
     * is a section with "Showcased News Articles" that is intended to emphasize anywhere
     * between four and eight primary articles.
     *
     * <BR /><BR />This Links-Carousel is wrapped in an HTML Divider Element as below:
     * {@code <DIV CLASS="slider-carousel">}.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div[class=slider-carousel] a}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code '<DIV CLASS="slider-carousel">'}
     * Elements.
     * 
     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#all(Vector, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String>  ret     = new Vector<>();
        String          urlStr;

        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");

        // If the page layout has changed and no carousel was found, return the empty list
        // rather than allowing a NullPointerException below.
        if (carouselDIV == null) return ret;

        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
        // Divider.
        for (TagNode tn: TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
            if ((urlStr = tn.AV("href")) != null)
                ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     * 
     * <BR /><BR />The "Carousels" are just the emphasized or "Highlighted" links that are
     * on three separate pages.  There is a complete-link {@code NewsSite} definition that
     * will retrieve all links - <I>not just the links highlighted by the carousel.</I>
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #GOVCN_CAROUSEL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     * 
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeURLs.html'>
     *      Gov.CN Carousel ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeArticles.html'>
     *      Gov.CN Carousel ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCNCarousel = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCNCarousel"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );
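
    // For illustration: the Regular-Expression filter above would keep a (hypothetical) link
    // such as "http://www.gov.cn/zhengce/2021-01/15/content_5580123.htm" - it contains the
    // required YYYY-MM/DD date-path and a "content_<digits>.htm" file-name.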

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />This version of the "Gov.CN" website will scour a larger set of section
     * {@code URL's}, and will not limit the returned Article-Links to just those found on the
     * java-script carousel.  The Java-Script Carousel will almost always have a total of five
     * news-article links available.  This definition of {@code 'NewsSite'} may return as many
     * as thirty to forty different articles per news-section.
     * 
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD>                  <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD>               <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD>                     <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD>     <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     * 
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
     *          in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     * 
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeURLs.html'>
     *      Gov.CN ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeArticles.html'>
     *      Gov.CN ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCN = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCN"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );

}