package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.NewsSite.*;
import Torello.Java.*;

import Torello.Languages.LC;

import java.util.*;
import java.util.regex.*;

import java.net.URL;
import java.io.*;

/**
 * This class is nothing more than an 'Example Class' that contains some foreign-language
 * news web-pages, from both overseas and from Latin America.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES>
 */
public class NewsSites
{
    private NewsSites() { }

    @SuppressWarnings("unchecked")
    private static final Hashtable<String, Vector<URL>> newsPaperSections =
        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
            (NewsSite.class, "data-files/SectionURLs.htdat", true, Hashtable.class);


    /**
     * This example will run the news-site scrape on the Chinese Government News Article
     * Carousel.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This method will
     * create a directory called <B>"cnb"</B> on your file-system where it will write the
     * contents of (most likely) 15 news-paper articles to disk as HTML files.
     *
     * The output log generated by this method may be viewed here:
     * <BR /><BR /><B><CODE><A HREF='doc-files/Logs/Gov.CN.log.html'>
     * Gov.CN.log.html</A></CODE></B>
     *
     * @throws IOException This throws for IO errors that may occur when reading the
     * web-server, or when saving the web-pages or images to the file-system.
     *
     * @see FileRW#delTree(String, boolean, Appendable)
     * @see NewsSite
     * @see FileRW#writeFile(CharSequence, String)
     * @see C#toHTML(String, boolean, boolean, boolean)
     */
    public static void runExample() throws IOException
    {
        StorageWriter log = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.

        final String dataFilesDir = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String htmlFilesDir = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all of its
        // contents). The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData

        FileRW.delTree(dataFilesDir, true, log);

        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML

        FileRW.delTree(htmlFilesDir, true, log);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Previous Download Data Erased (if any), Start today's News-Site Scrape
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles
        // on the Java-Script "Article Carousel". Again, the "Article Carousel" is just the
        // little widget at the top of the page that rotates (usually) five highlighted /
        // emphasized news-article links for today.

        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // The "Pause" mechanism is usually not very important if only a small number of
        // articles are being scraped. When downloading hundreds of articles, being able to
        // pause (and restart) if there is a web-site IOError is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the
        // file-system for saving the "Download State" while downloading...

        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();

        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir".
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data. Writing a customized "ScrapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base, up to and including e-mailing article
        // data, is also possible.

        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);
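        // As a point of illustration only: a custom receiver could forward each article
        // anywhere (a Data-Base, an e-mail, etc.) instead of to the file-system. The
        // commented sketch below is HYPOTHETICAL - the actual method name and signature of
        // "ScrapedArticleReceiver" must be checked against the library's JavaDoc before use.
        //
        // ScrapedArticleReceiver custom = (int sectionNum, int articleNum, Article article) ->
        //     System.out.println("Section #" + sectionNum + ", Article #" + articleNum);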
        // This will download each of the articles from its web-page URL. The web-page
        // article URL's were retrieved by "ScrapeURLs". The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step). These news
        // articles are saved as ".dat" files, since they are serialized java-objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true:  [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that
        //        do not include at least one photo. Photos usually help when reading foreign
        //        news articles.
        // null:  [bannerAndAdFinder] Some sites include images for Facebook links or
        //        advertising. Gov.CN usually doesn't have these, but occasionally there are
        //        extraneous links. For the purposes of this example, this parameter is
        //        ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - the content present before the
        //        Article Body is extracted from the Article Web-Page - is not saved. Keeping
        //        it can occasionally be useful if the HTML <HEAD>...</HEAD> has JSON or
        //        React-JS data to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and it will
        // also download the pictures / images included in each article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML removal operations. The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class
        //       "ToHTML".
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - like advertising or links to other articles - is usually
        //       necessary. Anywhere between 1 and 10 lines of NodeSearch Removal Operations
        //       will get rid of the unnecessary HTML. For the purposes of this example, such
        //       a cleaning operation is not done here - although the final articles do
        //       include some "links to other articles" content that is not "CLEANED" like it
        //       should be. (A minimal removal sketch appears in the helper method directly
        //       after this one.)

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed via the
        // link in the JavaDoc Comments at the top of this method. If this method is run in
        // an MS-DOS or Windows Environment, there will be no screen colors available to
        // view.

        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
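    /**
     * A minimal sketch of the kind of "NodeSearch Removal Operations" mentioned in
     * {@link #runExample()} above. This helper is <I>not</I> part of the original pipeline;
     * it only illustrates clearing unwanted page-ranges (here, advertising assumed to be
     * wrapped in {@code <DIV CLASS="ad">...</DIV>}) out of a page-{@code Vector}. The CSS
     * class name {@code "ad"} is a hypothetical placeholder; a real cleaner would name the
     * classes actually used by the target site.
     */
    private static void exampleRemovalSketch(Vector<HTMLNode> page)
    {
        // Collect every <DIV CLASS="ad"> ... </DIV> range on the page.
        ArrayList<DotPair> ads = new ArrayList<>();

        for (DotPair dp : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "ad"))

            ads.add(dp);

        // Clear the ranges in reverse order, so that earlier DotPair indices remain valid
        // while later ranges are removed. This assumes DotPair.end is an inclusive index,
        // matching its use with TagNodeGet.first(page, article.start, article.end, ...)
        // elsewhere in this file.
        for (int i = ads.size() - 1; i >= 0; i--)
        {
            DotPair dp = ads.get(i);
            page.subList(dp.start, dp.end + 1).clear();
        }
    }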
    /**
     * Prints the contents of the Data File. Invoking this command allows a programmer to see
     * which "sub-sections" are ascribed to each of the different news-paper definitions in
     * this class. Each "sub-section" is nothing more than a {@code URL}-branch of the primary
     * web site {@code URL}.
     *
     * <DIV CLASS="HTML">{@code
     * <!-- If the following were the primary news-site -->
     * http://news.baidu.com
     *
     * <!-- This would be a "sub-section" of the primary site -->
     * http://news.baidu.com/sports
     * }</DIV>
     *
     * <BR /><BR />This method can be called from the command line.
     * <BR /><BR />If a single command-line argument is passed to {@code "argv[0]"}, the
     * contents of the "Sections URL Data File" will be output to a text-file that is named
     * using the {@code String} passed to {@code "argv[0]"}.
     *
     * @param argv These are the command line arguments passed by the JRE to this method.
     * @throws IOException If there are any problems while attempting to save the output to
     * the output file (if one was named / requested).
     */
    public static void main(String[] argv) throws IOException
    {
        // Uncomment this line to run the example code (instead of the section-data print)
        // runExample(); System.exit(0);

        // The data-file is loaded into private field "newsPaperSections".
        // This private field is a Hashtable<String, Vector<URL>>. Convert each of these
        // sections so that they may be printed to terminal, and possibly to a text file.

        StringBuilder sb = new StringBuilder();

        for (String newspaper : newsPaperSections.keySet())
        {
            sb.append(newspaper + '\n');
            for (URL section : newsPaperSections.get(newspaper))
                sb.append(section.toString() + '\n');
            sb.append("\n\n***************************************************\n\n");
        }

        String s = sb.toString();
        System.out.println(s);

        // If there is a command-line parameter, it shall be interpreted as a file-name.
        // The contents of the "sections data-file" (as text) will be written to a file on
        // the file-system using the String-value of "argv[0]" as the output-filename.

        if (argv.length == 1) FileRW.writeFile(s, argv[0]);
    }

    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$"))
    // ArticleGet.usual(TextComparitor.CN_CI, "article-content")

    /**
     * The News Site at address: <CODE><A HREF="https://www.abc.es/" TARGET=_blank>
     * "https://www.abc.es/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<ARTICLE>...</ARTICLE>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code article a}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'Article'} Elements.
     *
     * @see TagNodeFindL1Inclusive#all(Vector, String)
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        TagNode tn;
        String urlStr;

        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a"))
                != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.abc.es/" TARGET=_blank>https://www.abc.es/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>ABC España</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Spain</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://abc.es}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code 'HREF'} must end with {@code '.html'}
     *          <BR />See: {@link StrFilter#comparitor(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#EW_CI}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #ABC_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <MAIN>...</MAIN>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}
     * instance.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ABC.ES-ScrapeURLs.html'>
     *      ABC.ES ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> {@code ScrapeArticles}
     *      <BR /><B>IMPORTANT NOTE:</B> Though the {@code ScrapeURLs} code <I>will check for
     *      duplicate {@code URL's}</I> that may be returned <I>within any given section</I>,
     *      {@code Article URL's} may be repeated among the different sections of the
     *      newspaper. Since the {@code URL}-scrape returned nearly 3,000 articles, the log
     *      of an {@code Article} scrape is not included here. Proper duplicate-{@code URL}
     *      checking code has obviously been written, but would be too complicated to show in
     *      this example. (A minimal cross-section de-duplication sketch appears after this
     *      {@code NewsSite} definition.)
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ABCES = new NewsSite
    (
        "ABC España", Country.Spain, "https://www.abc.es/", LC.ES,
        "ABC is a Spanish national daily newspaper. It is the third largest general-interest " +
        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
        newsPaperSections.get("ABCES"),
        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
        NewsSites::ABC_LINKS_GETTER,
        ArticleGet.usual("main"),
        null /* bannerAndAdFinder */
    );
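    /**
     * A minimal sketch (not part of the original example) of the cross-section duplicate-URL
     * filtering referenced in the {@link #ABCES} notes above. {@code ScrapeURLs.get(...)}
     * returns one {@code Vector<String>} of Article-Links per section; the helper below drops
     * any link already seen in an earlier section, while preserving the section structure.
     */
    private static Vector<Vector<String>> removeCrossSectionDuplicates
        (Vector<Vector<String>> articleURLs)
    {
        Set<String>            seen = new HashSet<>();
        Vector<Vector<String>> ret  = new Vector<>();

        for (Vector<String> section : articleURLs)
        {
            Vector<String> kept = new Vector<>();

            for (String url : section)
                if (seen.add(url))  // add(..) returns false if the URL was already present
                    kept.add(url);

            ret.add(kept);
        }

        return ret;
    }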
    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elpulso.mx/" TARGET=_blank>
     * https://www.elpulso.mx/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Pulso, México</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>México</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elpulso.mx}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code http://some.domain/YYYY/MM/DD/<article-name>/}</TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>. Retrieves <B><I>all</I></B> Anchor-Links on a
     *          Section-Page. Note that {@code URL's} must still pass the previous
     *          {@code StrFilter} (above) in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="entry-content">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     */
    public static final NewsSite Pulso = new NewsSite
    (
        "El Pulso, México", Country.Mexico, "https://elpulso.mx", LC.ES,
        "El Pulso is a Spanish-language newspaper in Mexico. It covers breaking news, " +
        "headlines, kids' news, tourism, entertainment, education, industry, the economy, " +
        "health & beauty, crime, careers, travel, diet & fitness, top stories, special " +
        "reports, and celebrity news.",
        newsPaperSections.get("PULSO"),
        StrFilter.regExKEEP(Pattern.compile(
            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "entry-content"),
        null /* bannerAndAdFinder */
    );
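    /**
     * A quick, standalone illustration (not part of the original example) of the
     * {@link #Pulso} Article-Link pattern above. The {@code URL's} mentioned in the comment
     * below are hypothetical; they simply do (or do not) have the
     * {@code /YYYY/MM/DD/<article-name>/} shape that the filter keeps.
     */
    private static boolean matchesPulsoPattern(String urlStr)
    {
        // Same Regular-Expression as in the "Pulso" definition above. A URL such as
        // "https://elpulso.mx/2021/05/14/some-article-name/" matches, while a section page
        // such as "https://elpulso.mx/deportes/" does not (no date, no article-name token).
        return Pattern.matches
            ("^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$", urlStr);
    }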
    /**
     * The News Site at address: <CODE><A HREF="https://www.ElNacional.com/" TARGET=_blank>
     * "https://www.ElNacional.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="td-module-thumb">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div.td-module-thumb a}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'DIV'} Elements where said
     * Divider's CSS {@code CLASS} contains {@code 'td-module-thumb'}.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        TagNode tn;
        String urlStr;

        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main /
        // section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "td-module-thumb"))

            // Now find the <A HREF=...> ... </A>
            if ((tn = TagNodeGet.first
                (page, article.start, article.end, TC.OpeningTags, "a")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elnacional.com/" TARGET=_blank>
     * https://www.elnacional.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Nacional</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Venezuela</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elnacional.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link URLFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD><B>{@code null}</B>. The {@code LinksGet} provided here will only return
     *          valid {@code Article URL's}, so there is no need for a {@code URLFilter}.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_NACIONAL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeURLs.html'>
     *      El Nacional ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeArticles.html'>
     *      El Nacional ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElNacional = new NewsSite
    (
        "El Nacional", Country.Venezuela, "https://elnacional.com", LC.ES,
        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial " +
        "El Nacional, most widely known for its El Nacional newspaper and website. It, " +
        "along with Últimas Noticias and El Universal, are the most widely read and " +
        "circulated daily national newspapers in the country, and it has an average of more " +
        "than 80,000 papers distributed daily and 170,000 copies on weekends.",
        newsPaperSections.get("ElNacional"),
        (URLFilter) null, /* The LinksGetter will only return valid Anchor's */
        NewsSites::EL_NACIONAL_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElEspectador.com/" TARGET=_blank>
     * "https://www.ElEspectador.com/"</A></CODE> is slightly more complicated when
     * retrieving News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="Card ...">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div.Card a.card-link}</B>. Specifically, it says to find all
     * {@code 'Anchor'} elements whose CSS {@code CLASS} contains {@code 'card-link'} and
     * which are descendants of {@code 'DIV'} Elements where said Divider's CSS {@code CLASS}
     * contains {@code 'Card'}.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        TagNode tn;
        String urlStr;

        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "Card"))

            // Now find the <A CLASS="card-link" HREF=...> ... </A>
            if ((tn = InnerTagGet.first
                (page, article.start, article.end, "a", "class", TextComparitor.C,
                 "card-link")) != null)

                if ((urlStr = tn.AV("href")) != null)
                    ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elespectador.com/" TARGET=_blank>
     * https://www.elespectador.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Espectador</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Colombia</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elespectador.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must end with a forward-slash {@code '/'} character.
     *          <BR />See: {@link TextComparitor#ENDS_WITH}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_ESPECTADOR_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeURLs.html'>
     *      El Espectador ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeArticles.html'>
     *      El Espectador ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElEspectador = new NewsSite
    (
        "El Espectador, Colombia", Country.Colombia, "https://elespectador.com", LC.ES,
        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation " +
        "within Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and " +
        "published since 1915 in Bogotá. It changed from a daily to a weekly edition in " +
        "2001, following a financial crisis, and became a daily again on 11 May 2008, a " +
        "comeback which had been long rumoured, in tabloid format (28 x 39.5 cm). From 1997 " +
        "to 2011 its main shareholder was Julio Mario Santo Domingo.",
        newsPaperSections.get("ElEspectador"),
        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );
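    /**
     * A condensed sketch (not part of the original example) showing how <I>any</I> of the
     * {@code NewsSite} definitions in this class could be plugged into the same pipeline
     * used by {@link #runExample()}. It re-uses the exact calls from that method,
     * parameterized by news-site and output directory. The directory layout and log
     * file-name used here are illustrative choices, not library requirements.
     *
     * <BR /><BR />For instance: {@code scrapeNewsSite(NewsSites.ElEspectador, "ees");}
     */
    private static void scrapeNewsSite(NewsSite ns, String baseDir) throws IOException
    {
        StorageWriter log = new StorageWriter();

        final String dataFilesDir = baseDir + File.separator + "articleData" + File.separator;
        final String htmlFilesDir = baseDir + File.separator + "articleHTML" + File.separator;

        // Clear any previously downloaded data, exactly as runExample() does.
        FileRW.delTree(dataFilesDir, true, log);
        FileRW.delTree(htmlFilesDir, true, log);

        // Retrieve the Article-Links for every section of the news-site.
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // File-System based "Download State", for pausing & restarting long scrapes.
        Pause pause = Pause.getFSInstance(baseDir + File.separator + "state.dat");
        pause.initialize();

        // Save each scraped article to the file-system, as in runExample().
        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // Same unnamed parameters as runExample(): skipArticlesWithoutPhotos=true,
        // bannerAndAdFinder=null, keepOriginalPageHTML=false.
        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        FileRW.writeFile
            (C.toHTML(log.getString(), true, true, true),
             baseDir + File.separator + "log.html");
    }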
    /**
     * The News Site at address: <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * "https://www.gov.cn/"</A></CODE> has a Java-Script "Links Carousel". Essentially,
     * there is a section with "Showcased News Articles" that is intended to emphasize
     * anywhere between four and eight primary articles.
     *
     * <BR /><BR />This Links-Carousel is wrapped in an HTML Divider Element as below:
     * {@code <DIV CLASS="slider-carousel">}.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it
     * would read: <B>{@code div[class=slider-carousel] a}</B>. Specifically, it says to find
     * all {@code 'Anchor'} elements that are descendants of
     * {@code '<DIV CLASS="slider-carousel">'} Elements.
     *
     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#all(Vector, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();
        String urlStr;

        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");

        // If the carousel is not present on the page, there are no links to return.
        if (carouselDIV == null) return ret;

        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
        // Divider.

        for (TagNode tn : TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
            if ((urlStr = tn.AV("href")) != null)
                ret.add(urlStr);

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />The "Carousels" are just the emphasized or "Highlighted" links that appear
     * on three separate pages. There is a complete-link {@code NewsSite} definition that
     * will retrieve all links - <I>not just the links highlighted by the carousel.</I>
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #GOVCN_CAROUSEL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeURLs.html'>
     *      Gov.CN Carousel ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeArticles.html'>
     *      Gov.CN Carousel ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCNCarousel = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCNCarousel"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
            "content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />This version of the "Gov.CN" definition will scour a larger set of section
     * {@code URL's}, and will not limit the returned Article-Links to just those found on
     * the Java-Script carousel. The Java-Script Carousel will almost always have a total of
     * five news-article links available; this definition of {@code 'NewsSite'} may return
     * thirty to forty different articles per news-section.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS=NEWSSITE>
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>. Retrieves <B><I>all</I></B> Anchor-Links on a
     *          Section-Page. Note that {@code URL's} must still pass the previous
     *          {@code StrFilter} (above) in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeURLs.html'>
     *      Gov.CN ScrapeURLs LOG</A></B></CODE>
     * </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeArticles.html'>
     *      Gov.CN ScrapeArticles LOG</A></B></CODE>
     * </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCN = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCN"),
        StrFilter.regExKEEP(Pattern.compile(
            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
            "content_\\d+.htm(?:l)?(#\\d+)?"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );
}