001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.HTML.NodeSearch.*;
005import Torello.HTML.Tools.NewsSite.*;
006import Torello.Java.*;
007
008import Torello.Languages.LC;
009import Torello.Java.Shell.C;
010
011import java.util.*;
012import java.util.regex.*;
013
014import java.net.URL;
015import java.io.*;
016
017/**
018 * This class is nothing more than an 'Example Class' that contains some foreign-language
019 * based news web-pages, from both overseas and from Latin America.
020 * 
021 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="NEWS_SITES">
022 */
023public class NewsSites
024{
    // Static-member-only utility / example class: private constructor prevents instantiation.
    private NewsSites() { }
026
    // Maps a Newspaper-Name key (e.g. "ABCES", "PULSO", "GovCNCarousel") to the list of
    // Section-Page URL's scraped for that paper.  The table is deserialized from data-file
    // "data17.htdat", which is loaded out of the JAR via DataFileLoader.
    // The cast is unchecked because Java Serialization erases the generic type parameters.
    @SuppressWarnings("unchecked")
    private static final Hashtable<String, Vector<URL>> newsPaperSections = 
        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
        (Torello.Data.DataFileLoader.class, "data17.htdat", true, Hashtable.class);
031
032    /**
033     * This example will run the news-site scrape on the Chinese Government News Article
034     * Carousel.
035     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This method will
     * create a directory called <B>"cnb"</B> on your file-system where it will write the contents
     * of (most likely) 15 news-paper articles to disk as HTML files.
039     *
040     * The output log generated by this method may be viewed here:
     * <BR /><BR /><B><CODE><A HREF='doc-files/Logs/Gov.CN.log.html'>
     * Gov.CN.log.html</A></CODE></B>
043     *
044     * @throws IOException This throws for IO errors that may occur when reading the web-server,
045     * or when saving the web-pages or images to the file-system.
046     * 
047     * @see FileRW#delTree(String, boolean, Appendable)
048     * @see NewsSite
049     * @see FileRW#writeFile(CharSequence, String)
050     * @see C#toHTML(String, boolean, boolean, boolean)
051     */
    public static void runExample() throws IOException
    {
        // Accumulates everything printed during the scrape, so the complete run can be
        // saved to disk as an HTML log at the end of this method.
        StorageWriter   log             = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.
        final String    dataFilesDir    = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String    htmlFilesDir    = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all its contents)
        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData
        FileRW.delTree(dataFilesDir, true, log);

        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML
        FileRW.delTree(htmlFilesDir, true, log);

        // *****************************************
        // Previous Download Data Erased (if any)
        // Start today's News-Site Scrape
        // *****************************************
    
        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "Scrape URLs" class to retrieve all of the available newspaper articles
        // on the Java-Script "Article Carousel".  Again, the "Article Carousel" is just the
        // little widget at the top of the page that rotates (usually) five hilited / emphasized
        // news-article links for today
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // This is usually not very important if only a small number of articles are being
        // scraped.  When downloading hundreds of articles - being able to pause if there is a
        // web-site IOError (And restart) is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the file-system
        // for saving the "Download State" while downloading...
    
        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();

        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data.  Writing a customized "ScrapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base up to and including e-mailing article data
        // is possible using a self-written "ScrapedArticleReceiver"
        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // This will download each of the articles from their web-page URL.  The web-page
        // article URL's were retrieved by "ScrapeURLs" (above).  The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step).  These news articles
        // are saved as ".dat" since they are serialized java-objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that do not
        //       include at least one photo.  Photos usually help when reading foreign news articles.
        // null: [bannerAndAdFinder] Some sites include images for Facebook links or advertising.
        //       Gov.CN usually doesn't have these, but occasionally there are extraneous links.
        //       For the purposes of this example, this parameter is ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - content before the Article Body is
        //        extracted from the Article Web-Page is not saved.  This can occasionally be useful
        //        if the HTML <HEAD>...</HEAD> has JSON or React-JS data to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);
        
        // Now this will convert each of the ".dat" files to an ".html" file - and also it
        // will download the pictures / images included in the article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML remove operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class "ToHTML"
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body like advertising or links to other articles is usually necessary.
        //       Anywhere between 1 and 10 lines of NodeSearch Removal Operations will get rid of
        //       unnecessary HTML.  For the purposes of this example, such a cleaning operation is
        //       not done here - although the final articles do include some "links to other
        //       articles" that is not "CLEANED" like it should be.

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed in the
        // JavaDoc Comments in the top of this method.  If this method is run in an MS-DOS
        // or Windows Environment, there will be no screen colors available to view.
        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
149
150    /**
151     * Prints the contents of the Data File.  Invoking this command allows a programmer to see
152     * which "sub-sections" are ascribed to each of the different news-paper definitions in this
153     * class.  Each "sub-section" is nothing more than a {@code URL}-branch of the primary web
154     * site {@code URL}.
155     *
156     * <DIV CLASS="HTML">{@code
157     * <!-- If the following were the primary news-site -->
158     * http://news.baidu.com
159     * 
160     * <!-- This would be a "sub-section" of the primary site -->
161     * http://news.baidu.com/sports
162     * }</DIV>
163     *
164     * <BR /><BR />Can be called from the command line.
165     * <BR /><BR />If a single command-line argument is passed to {@code "argv[0]"}, the contents
166     * of the "Sections URL Data File" will be output to a text-file that is named using the
167     * {@code String} passed to {@code "argv[0]"}.
168     *
169     * @param argv These are the command line arguments passed by the JRE to this method.
170     * @throws IOException If there are any problems while attempting to save the output to the
171     * the output file (if one was named / requested).
172     */
173    public static void main(String[] argv) throws IOException
174    {
175        // Uncomment this line to run the example code (instead of section-data print)
176        // runExample(); System.exit(0);
177
178        // The data-file is loaded into private field "newsPaperSections"
179        // This private field is a Hashtable<String, Vector<URL>>.  Convert each of
180        // these sections so that they may be printed to terminal and maybe to a text
181        // file.
182        StringBuilder sb = new StringBuilder();
183        for (String newspaper : newsPaperSections.keySet())
184        {
185            sb.append(newspaper + '\n');
186            for (URL section : newsPaperSections.get(newspaper))
187                sb.append(section.toString() + '\n');
188            sb.append("\n\n***************************************************\n\n");
189        }
190        
191        String s = sb.toString();
192        System.out.println(s);
193        
194        // If there is a command-line parameter, it shall be interpreted a file-name.
195        // The contents of the "sections data-file" (as text) will be written a file on the
196        // file-system using the String-value of "argv[0]" as the name of the output-filename.
197        if (argv.length == 1) FileRW.writeFile(s, argv[0]);
198    }
199
200    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$")));
201    // ArticleGet.usual(TextComparitor.CN_CI, "article-content"));
202
203    /**
204     * The News Site at address: <CODE><A HREF="https://www.abc.es/" TARGET=_blank>
205     * "https://www.abc.es/"</A></CODE> is slightly more complicated when retrieving News-Article
206     * Links.
207     *
208     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
209     * {@code '<ARTICLE>...</ARTICLE>'} Element.
210     *
211     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
212     * read: <B>{@code article a}</B>.  Specifically it says to find all {@code 'Anchor'} elements
213     * that are descendants of {@code 'Article'} Elements.
214     * 
215     * @see TagNodeFindL1Inclusive#all(Vector, String)
216     * @see TagNodeGet#first(Vector, int, int, TC, String[])
217     * @see TagNode#AV(String)
218     */
219    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
220    {
221        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;
222
223        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
224        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))
225
226            // Now find the <A HREF=...> ... </A>
227            if ((tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a")) != null)
228
229                if ((urlStr = tn.AV("href")) != null)
230                    ret.add(urlStr);
231
232        return ret;
233    }
234
235    /**
236     * This is the {@code NewsSite} definition for the Newspaper located at:
237     * <CODE><A HREF="https://www.abc.es/" TARGET=_blank>https://www.abc.es/</A></CODE>.
238     * 
239     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
240     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
241     * <TR><TD>Newspaper Name</TD>                  <TD>ABC España</TD></TR>
242     * <TR><TD>Country of Origin</TD>               <TD>Spain</TD></TR>
243     * <TR><TD>Website URL</TD>                     <TD>{@code https://abc.es}</TD></TR>
244     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
245     * </TABLE>
246     * 
247     * <BR /><TABLE CLASS="NEWSSITE">
248     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
249     * <TR> <TD>Newspaper Article Groups / Sections</TD>
250     *      <TD>Scrape Sections</TD>
251     *      <TD><I>Retrieved from Data File</I></TD>
252     * </TR>
253     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
255     *      <TD>{@code 'HREF'} must end with {@code '.html'}
256     *          <BR />See: {@link StrFilter#comparitor(TextComparitor, String[])}
257     *          <BR />See: {@link TextComparitor#EW_CI}
258     *      </TD>
259     * </TR>
260     * <TR> <TD><B>{@link LinksGet}</B></TD>
261     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
262     *      <TD>Invokes method {@link #ABC_LINKS_GETTER(URL, Vector)}</TD>
263     * </TR>
264     * <TR> <TD><B>{@link ArticleGet}</B></TD>
265     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
266     *      <TD>{@code <MAIN>...</MAIN>}<BR />See: {@link ArticleGet#usual(String)}</TD>
267     * </TR>
268     * </TABLE>
269     * 
270     * <BR />View a copy of the logs that are generated from using this {@code NewsSite} instance.
271     * <BR /><BR /><UL CLASS="JDUL">
272     * <LI> <CODE><B><A HREF='doc-files/Logs/ABC.ES-ScrapeURLs.html'>
273     *      ABC.ES ScrapeURLs LOG</A></B></CODE>
274     *      </LI>
275     * <LI> {@code ScrapeArticles}
276     *      <BR /><B>IMPORTANT NOTE:</B> Though {@code ScrapeURL's} code <I>will check for
277     *      duplicate {@code URL's}</I> that may be returned <I>within any given-section</I>,
278     *      {@code Article URL's} may be repeated among the different sections of the newspaper.
279     *      Since the {@code URL}-scrape returned nearly 3,000 articles, the log of
280     *      an {@code Article} scrape is not included here.  Proper duplicate {@code URL} checking
281     *      code has obviously been written, but would be too complicated to show in this example.
282     *     </LI>
283     * </UL>
284     * 
285     * <EMBED CLASS="external-html" DATA-FILE-ID=NEWS_STE_CHANGE>
286     */
    public static final NewsSite ABCES = new NewsSite
    (
        // Newspaper Name, Country of Origin, Web-Site Root URL, Printing-Language Code
        "ABC España", Country.Spain, "https://www.abc.es/", LC.ES,
        // Brief description of the newspaper
        "ABC is a Spanish national daily newspaper.  It is the third largest general-interest " +
        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
        // Section-Page URL's, loaded from the serialized data-file (key "ABCES")
        newsPaperSections.get("ABCES"),
        // Article-Link filter: keep only HREF's that end with ".html"
        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
        // Manual link-retrieval: anchors wrapped inside <ARTICLE> elements
        NewsSites::ABC_LINKS_GETTER,
        // Article-Body: the <MAIN> ... </MAIN> element
        ArticleGet.usual("main"),
        null /* bannerAndAdFinder */
    );
298
299    /**
300     * This is the {@code NewsSite} definition for the Newspaper located at:
301     * <CODE><A HREF="https://www.elpulso.mx/" TARGET=_blank>https://www.elpulso.mx/</A></CODE>.
302     * 
303     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
304     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
305     * <TR><TD>Newspaper Name</TD>                  <TD>El Pulso, México</TD></TR>
306     * <TR><TD>Country of Origin</TD>               <TD>México</TD></TR>
307     * <TR><TD>Website URL</TD>                     <TD>{@code https://elpulso.mx}</TD></TR>
308     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
309     * </TABLE>
310     * 
311     * <BR /><TABLE CLASS="NEWSSITE">
312     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
313     * <TR> <TD>Newspaper Article Groups / Sections</TD>
314     *      <TD>Scrape Sections</TD>
315     *      <TD><I>Retrieved from Data File</I></TD>
316     * </TR>
317     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
319     *      <TD>{@code HREF} must match: {@code http://some.domain/YYYY/MM/DD/<article-name>/}</TD>
320     * </TR>
321     * <TR> <TD><B>{@link LinksGet}</B></TD>
322     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
323     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
324     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
325     *          in order to be parsed as {@link Article}'s.
326     *      </TD>
327     * </TR>
328     * <TR> <TD><B>{@link ArticleGet}</B></TD>
329     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
330     *      <TD>{@code <DIV CLASS="entry-content">...</DIV>}
331     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
332     *          <BR />See: {@link TextComparitor#C}
333     *      </TD>
334     * </TR>
335     * </TABLE>
336     */
    public static final NewsSite Pulso = new NewsSite
    (
        // Newspaper Name, Country of Origin, Web-Site Root URL, Printing-Language Code
        "El Pulso, México", Country.Mexico, "https://elpulso.mx", LC.ES,
        // Brief description of the newspaper
        "El Pulso newspaper is Spanish language newspaper in Mexico. It is showing breaking news, " +
        "headlines, kids news, tourism news, entertainment news, study news, industrial news, " +
        "economical news, health & beauty news, crime news, career news, Travel news, " +
        "diet & fitness news, Top stories, special news, celebrity news.",
        // Section-Page URL's, loaded from the serialized data-file (key "PULSO")
        newsPaperSections.get("PULSO"),
        // Article-Link filter: HREF must match http://some.domain/YYYY/MM/DD/<article-name>/
        StrFilter.regExKEEP(Pattern.compile(
            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
        ), false),
        null /* LinksGet - null means all Anchor-Links on a Section-Page are considered */,
        // Article-Body: the <DIV CLASS="entry-content"> ... </DIV> element
        ArticleGet.usual(TextComparitor.C, "entry-content"),
        null /* bannerAndAddFinder */
    );
352
353    /**
354     * The News Site at address: <CODE><A HREF="https://www.ElNacional.com/" TARGET=_blank>
355     * "https://www.ElNacional.com/"</A></CODE> is slightly more complicated when retrieving
356     * News-Article Links.
357     *
358     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
359     * {@code '<DIV CLASS="td-module-thumb">...</DIV>'} Element.
360     *
361     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
362     * read: <B>{@code div.td-module-thumb a}</B>.  Specifically it says to find all
363     * {@code 'Anchor'} elements that are descendants of {@code 'DIV'} Elements where said
364     * Divider's CSS {@code CLASS} contains {@code 'td-module-thumb'}.
365     * 
366     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
367     * @see TagNodeGet#first(Vector, int, int, TC, String[])
368     * @see TagNode#AV(String)
369     */
370    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
371    {
372        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;
373
374        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main / section page.
375        for (DotPair article : InnerTagFindInclusive.all
376            (page, "div", "class", TextComparitor.C, "td-module-thumb"))
377
378            // Now find the <A HREF=...> ... </A>
379            if ((tn = TagNodeGet.first
380                (page, article.start, article.end, TC.OpeningTags, "a")) != null)
381
382                if ((urlStr = tn.AV("href")) != null)
383                    ret.add(urlStr);
384
385        return ret;
386    }
387
388    /**
389     * This is the {@code NewsSite} definition for the Newspaper located at:
390     * <CODE><A HREF="https://www.elnacional.com/" TARGET=_blank>
391     * https://www.elnacional.com/</A></CODE>.
392     * 
393     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
394     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
395     * <TR><TD>Newspaper Name</TD>                  <TD>El Nacional</TD></TR>
396     * <TR><TD>Country of Origin</TD>               <TD>Venezuela</TD></TR>
397     * <TR><TD>Website URL</TD>                     <TD>{@code https://elnacional.com}</TD></TR>
398     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
399     * </TABLE>
400     * 
401     * <BR /><TABLE CLASS="NEWSSITE">
402     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
403     * <TR> <TD>Newspaper Article Groups / Sections</TD>
404     *      <TD>Scrape Sections</TD>
405     *      <TD><I>Retrieved from Data File</I></TD>
406     * </TR>
407     * <TR> <TD><B>{@link URLFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
409     *      <TD><B>{@code null}</B>.  The {@code LinksGet} provided here will only return valid
410     *          {@code Article URL's}, so there is no need for a {@code URLFilter}.
411     *      </TD>
412     * </TR>
413     * <TR> <TD><B>{@link LinksGet}</B></TD>
414     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
415     *      <TD>Invokes method {@link #EL_NACIONAL_LINKS_GETTER(URL, Vector)}</TD>
416     * </TR>
417     * <TR> <TD><B>{@link ArticleGet}</B></TD>
418     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
419     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
420     * </TR>
421     * </TABLE>
422     * 
423     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
424     * <BR /><BR /><UL CLASS="JDUL">
425     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeURLs.html'>
426     *      El Nacional ScrapeURLs LOG</A></B></CODE>
427     *      </LI>
428     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeArticles.html'>
429     *      El Nacional ScrapeArticles LOG</A></B></CODE>
430     *      </LI>
431     * </UL>
432     * 
433     * <EMBED CLASS="external-html" DATA-FILE-ID=NEWS_STE_CHANGE>
434     */
    public static final NewsSite ElNacional = new NewsSite
    (
        // Newspaper Name, Country of Origin, Web-Site Root URL, Printing-Language Code
        "El Nacional", Country.Venezuela, "https://elnacional.com", LC.ES,
        // Brief description of the newspaper
        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial El Nacional, " +
        "most widely known for its El Nacional newspaper and website. It, along with Últimas " +
        "Noticias and El Universal, are the most widely read and circulated daily national " +
        "newspapers in the country, and it has an average of more than 80,000 papers distributed " +
        "daily and 170,000 copies on weekends.",
        // Section-Page URL's, loaded from the serialized data-file (key "ElNacional")
        newsPaperSections.get("ElNacional"),
        (URLFilter) null, /* The LinksGetter will only return valid Anchor's */
        // Manual link-retrieval: anchors inside <DIV CLASS="td-module-thumb"> dividers
        NewsSites::EL_NACIONAL_LINKS_GETTER,
        // Article-Body: the <ARTICLE> ... </ARTICLE> element
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );
449
450    /**
451     * The News Site at address: <CODE><A HREF="https://www.ElEspectador.com/" TARGET=_blank>
452     * "https://www.ElEspectador.com/"</A></CODE> is slightly more complicated when retrieving
453     * News-Article Links.
454     *
455     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
456     * {@code '<DIV CLASS="Card ...">...</DIV>'} Element.
457     *
458     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
459     * read: <B>{@code div.Card a.card-link}</B>.  Specifically it says to find all
460     * {@code 'Anchor'} elements whose CSS {@code Class} contains {@code 'card-link'} and which
461     * are descendants of {@code 'DIV'} Elements where said Divider's
462     * CSS {@code CLASS} contains {@code 'Card'}.
463     *
464     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
465     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
466     * @see TagNode#AV(String)
467     */
468    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
469    {
470        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;
471
472        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
473        for (DotPair article : InnerTagFindInclusive.all
474            (page, "div", "class", TextComparitor.C, "Card"))
475
476            // Now find the <A CLASS="card-link" HREF=...> ... </A>
477            if ((tn = InnerTagGet.first
478                (page, article.start, article.end, "a", "class", TextComparitor.C, "card-link")) != null)
479
480                if ((urlStr = tn.AV("href")) != null)
481                    ret.add(urlStr);
482
483        return ret;
484    }
485
486    /**
487     * This is the {@code NewsSite} definition for the Newspaper located at:
488     * <CODE><A HREF="https://www.elespectador.com/" TARGET=_blank>
489     * https://www.elespectador.com/</A></CODE>.
490     * 
491     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
492     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
493     * <TR><TD>Newspaper Name</TD>                  <TD>El Espectador</TD></TR>
494     * <TR><TD>Country of Origin</TD>               <TD>Columbia</TD></TR>
495     * <TR><TD>Website URL</TD>                     <TD>{@code https://elespectador.com}</TD></TR>
496     * <TR><TD>Newspaper Printing Language</TD>     <TD>Spanish</TD></TR>
497     * </TABLE>
498     * 
499     * <BR /><TABLE CLASS="NEWSSITE">
500     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
501     * <TR> <TD>Newspaper Article Groups / Sections</TD>
502     *      <TD>Scrape Sections</TD>
503     *      <TD><I>Retrieved from Data File</I></TD>
504     * </TR>
505     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
507     *      <TD>{@code HREF} must end with a forward-slash {@code '/'} character.
508     *          <BR />See: {@link TextComparitor#ENDS_WITH}
509     *      </TD>
510     * </TR>
511     * <TR> <TD><B>{@link LinksGet}</B></TD>
512     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_ESPECTADOR_LINKS_GETTER(URL, Vector)}</TD>
514     * </TR>
515     * <TR> <TD><B>{@link ArticleGet}</B></TD>
516     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
521     * </TR>
522     * </TABLE>
523     * 
524     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
525     * <BR /><BR /><UL CLASS="JDUL">
526     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeURLs.html'>
527     *      El Espectador ScrapeURLs LOG</A></B></CODE>
528     *      </LI>
529     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeArticles.html'>
530     *      El Espectador ScrapeArticles LOG</A></B></CODE>
531     *      </LI>
532     * </UL>
533     * 
534     * <EMBED CLASS="external-html" DATA-FILE-ID=NEWS_STE_CHANGE>
535     */
    public static final NewsSite ElEspectador = new NewsSite
    (
        // Newspaper Name, Country of Origin, Web-Site Root URL, Printing-Language Code
        "El Espectador, Columbia", Country.Colombia, "https://elespectador.com", LC.ES,
        // Brief description of the newspaper
        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation within " +
        "Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and published " +
        "since 1915 in Bogotá. It changed from a daily to a weekly edition in 2001, following a " +
        "financial crisis, and became a daily again on 11 May 2008, a comeback which had been " +
        "long rumoured, in tabloid format (28 x 39.5 cm). From 1997 to 2011 its main shareholder " +
        "was Julio Mario Santo Domingo.",
        // Section-Page URL's, loaded from the serialized data-file (key "ElEspectador")
        newsPaperSections.get("ElEspectador"),
        // Article-Link filter: HREF must end with a forward-slash '/' character
        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
        // Manual link-retrieval: <A CLASS="card-link"> anchors inside <DIV CLASS="Card"> dividers
        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
        // Article-Body: the <ARTICLE> ... </ARTICLE> element
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );
551
552
553    /**
554     * The News Site at address: <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
555     * "https://www.gov.cn/"</A></CODE> has a Java-Script "Links Carousel".  Essentially, there
556     * is a section with "Showcased News Articles" that are intended to be emphasize anywhere
557     * between four and eight primary articles.
558     *
559     * <BR /><BR />This Links-Carousel is wrapped in an HTML Divider Element as below:
560     * {@code <DIV CLASS="slider-carousel">}.
561     *
562     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
563     * read: <B>{@code div[class=slider-carousel] a}</B>.  Specifically it says to  find all
564     * {@code 'Anchor'} elements that are descendants of {@code '<DIV CLASS="slider-carousel">'}
565     * Elements.
566     * 
567     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
568     * @see TagNodeGet#all(Vector, TC, String[])
569     * @see TagNode#AV(String)
570     */
571    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
572    {
573        Vector<String>  ret     = new Vector<>();
574        String          urlStr;
575
576        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
577        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
578            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");
579
580        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
581        // Divider.
582        for (TagNode tn: TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
583            if ((urlStr = tn.AV("href")) != null)
584                ret.add(urlStr);
585
586        return ret;
587    };
588
589    /**
590     * This is the {@code NewsSite} definition for the Newspaper located at:
591     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
592     * https://www.gov.cn/</A></CODE>.
593     * 
594     * <BR /><BR />The "Carousels" are just the emphasized or "HiLighted" links that are
595     * on three separate pages.  There is a complete-link {@code NewsSite} definition that
596     * will retrieve all links - <I>not just the links hilited by the carousel.</I>
597     * 
598     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
599     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
600     * <TR><TD>Newspaper Name</TD>                  <TD>Chinese Government Web Portal</TD></TR>
601     * <TR><TD>Country of Origin</TD>               <TD>People's Republic of China</TD></TR>
602     * <TR><TD>Website URL</TD>                     <TD>{@code https://gov.cn}</TD></TR>
603     * <TR><TD>Newspaper Printing Language</TD>     <TD>Mandarin Chinese</TD></TR>
604     * </TABLE>
605     * 
606     * <BR /><TABLE CLASS="NEWSSITE">
607     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
608     * <TR> <TD>Newspaper Article Groups / Sections</TD>
609     *      <TD>Scrape Sections</TD>
610     *      <TD><I>Retrieved from Data File</I></TD>
611     * </TR>
612     * <TR> <TD><B>{@link StrFilter}</B></TD>
613     *      <TD>News Web-Site Section-Page Aritlce-Link ({@code <A HREF=...>}) Filter</TD>
614     *      <TD>{@code HREF} must match:
615     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
616     *      </TD>
617     * </TR>
618     * <TR> <TD><B>{@link LinksGet}</B></TD>
619     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
620     *      <TD>Invokes method {@link #GOVCN_CAROUSEL_LINKS_GETTER(URL, Vector)}</TD>
621     * </TR>
622     * <TR> <TD><B>{@link ArticleGet}</B></TD>
623     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
624     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
625     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
626     *          <BR />See: {@link TextComparitor#C}
627     *      </TD>
628     * </TR>
629     * </TABLE>
630     * 
631     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
632     * <BR /><BR /><UL CLASS="JDUL">
633     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeURLs.html'>
634     *      Gov.CN Carousel ScrapeURLs LOG</A></B></CODE>
635     *      </LI>
636     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeArticles.html'>
637     *      Gov.CN Carousel ScrapeArticles LOG</A></B></CODE>
638     *      </LI>
639     * </UL>
640     * 
641     * <EMBED CLASS="external-html" DATA-FILE-ID=NEWS_STE_CHANGE>
642     */
643    public static final NewsSite GovCNCarousel = new NewsSite
644    (
645        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
646        "The Chinese Government Sponsored Web-Site",
647        newsPaperSections.get("GovCNCarousel"),
648        StrFilter.regExKEEP(Pattern.compile(
649            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"
650        ), false),
651        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
652        ArticleGet.usual(TextComparitor.C, "article"),
653        null /* bannerAndAddFinder */
654    );
655
656    /**
657     * This is the {@code NewsSite} definition for the Newspaper located at:
658     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
659     * https://www.gov.cn/</A></CODE>.
660     *
661     * <BR /><BR />This version of the "Gov.CN" website will scour a larger set of section
662     * {@code URL's}, and will not limit the returned Article-Links to just those found on the
663     * java-script carousel.  The Java-Script Carousel will almost always have a total of five
664     * news-article links available.  This definition of {@code 'NewsSite'} may return up to 
665     * thirty to forty different articles per news-section.
666     * 
667     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
668     * <TR><TH>Parameter</TH>                       <TH>Significance</TH></TR>
669     * <TR><TD>Newspaper Name</TD>                  <TD>Chinese Government Web Portal</TD></TR>
670     * <TR><TD>Country of Origin</TD>               <TD>People's Republic of China</TD></TR>
671     * <TR><TD>Website URL</TD>                     <TD>{@code https://gov.cn}</TD></TR>
672     * <TR><TD>Newspaper Printing Language</TD>     <TD>Mandarin Chinese</TD></TR>
673     * </TABLE>
674     * 
675     * <BR /><TABLE CLASS="NEWSSITE">
676     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
677     * <TR> <TD>Newspaper Article Groups / Sections</TD>
678     *      <TD>Scrape Sections</TD>
679     *      <TD><I>Retrieved from Data File</I></TD>
680     * </TR>
681     * <TR> <TD><B>{@link StrFilter}</B></TD>
682     *      <TD>News Web-Site Section-Page Aritlce-Link ({@code <A HREF=...>}) Filter</TD>
683     *      <TD>{@code HREF} must match:
684     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
685     *      </TD>
686     * </TR>
687     * <TR> <TD><B>{@link LinksGet}</B></TD>
688     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
689     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
690     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
691     *          in order to be parsed as {@link Article}'s.
692     *      </TD>
693     * </TR> 
694     * <TR> <TD><B>{@link ArticleGet}</B></TD>
695     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
696     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
697     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
698     *          <BR />See: {@link TextComparitor#C}
699     *      </TD>
700     * </TR>
701     * </TABLE>
702     * 
703     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
704     * <BR /><BR /><UL CLASS="JDUL">
705     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeURLs.html'>
706     *      Gov.CN ScrapeURLs LOG</A></B></CODE>
707     *      </LI>
708     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeArticles.html'>
709     *      Gov.CN ScrapeArticles LOG</A></B></CODE>
710     *      </LI>
711     * </UL>
712     * 
713     * <EMBED CLASS="external-html" DATA-FILE-ID=NEWS_STE_CHANGE>
714     */
715    public static final NewsSite GovCN = new NewsSite
716    (
717        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
718        "The Chinese Government Sponsored Web-Site",
719        newsPaperSections.get("GovCN"),
720        StrFilter.regExKEEP(Pattern.compile(
721            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"
722        ), false),
723        null,
724        ArticleGet.usual(TextComparitor.C, "article"),
725        null /* bannerAndAddFinder */
726    );
727
728}