package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.NewsSite.*;
import Torello.Java.*;

import Torello.Languages.LC;

import java.util.*;
import java.util.regex.*;

import java.net.URL;
import java.io.*;

/**
 * This class is nothing more than an 'Example Class' that contains some foreign-language
 * based news web-pages, from both overseas and from Latin America.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES>
 */
public class NewsSites
{
    // Static-members only: this class is never instantiated.
    private NewsSites() { }

    // Maps a news-site key (for instance "ABCES" or "GovCN") to the list of section URL's that
    // are scraped for that site.  The table is read from a data-file at class-load time.
    //
    // The cast is unchecked because the loader is only handed the raw 'Hashtable.class' token.
    @SuppressWarnings("unchecked")
    private static final Hashtable<String, Vector<URL>> newsPaperSections =
        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
            (Torello.Data.DataFileLoader.class, "data17.htdat", true, Hashtable.class);

    // The Article-Link filter pattern shared by both 'GovCNCarousel' and 'GovCN'.  It is
    // compiled once, here, rather than once per NewsSite definition.
    //
    // This field MUST be declared above the two NewsSite constants that use it, because static
    // initializers are executed in textual order.
    //
    // NOTE(review): The '.' characters in "www.gov.cn" and ".htm" are not escaped, and therefore
    // match any character.  The pattern text is kept exactly as it was to preserve the existing
    // filter behavior.
    private static final Pattern GOVCN_ARTICLE_URL = Pattern.compile(
        "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"
    );

    /**
     * This example will run the news-site scrape on the Chinese Government News Article
     * Carousel.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This method will
     * create a directory called <B>"cnb"</B> on your file-system where it will write the contents
     * of (most likely) 15 news-paper articles to disk as HTML files.
     *
     * The output log generated by this method may be viewed here:
     * <BR /><BR /><B><CODE><A HREF='doc-files/Logs/Gov.CN.log.html'>
     * Gov.CN.log.html</A></CODE></B>
     *
     * @throws IOException This throws for IO errors that may occur when reading the web-server,
     * or when saving the web-pages or images to the file-system.
     *
     * @see FileRW#delTree(String, boolean, Appendable)
     * @see NewsSite
     * @see FileRW#writeFile(CharSequence, String)
     * @see C#toHTML(String, boolean, boolean, boolean)
     */
    public static void runExample() throws IOException
    {
        StorageWriter log = new StorageWriter();

        // Everything this example writes is placed underneath this one root directory.
        final String rootDir = "cnb" + File.separator;

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.
        final String dataFilesDir = rootDir + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String htmlFilesDir = rootDir + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all its contents)
        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData
        FileRW.delTree(dataFilesDir, true, log);

        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML
        FileRW.delTree(htmlFilesDir, true, log);

        // *****************************************
        // Previous Download Data Erased (if any)
        // Start today's News-Site Scrape
        // *****************************************

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "Scrape URLs" class to retrieve all of the available newspaper articles
        // on the Java-Script "Article Carousel".  Again, the "Article Carousel" is just this
        // little widget at the top of the page that rotates (usually) five hilited / emphasized
        // news-article links for today
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);

        // This is usually not very important if only a small number of articles are being
        // scraped.  When downloading hundreds of articles - being able to pause if there is a
        // web-site IOError (And restart) is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the file-system
        // for saving the "Download State" while downloading...

        Pause pause = Pause.getFSInstance(rootDir + "state.dat");
        pause.initialize();

        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data.  Writing a customized "ScrapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base up to and including e-mailing article data
        // is possible using a self-written "ScrapedArticleReceiver"
        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // This will download each of the articles from their web-page URL.  The web-page
        // article URL's were retrieved by "Scraped URLs".  The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step).  These news articles
        // are saved as ".dat" since they are serialized java-objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that do not
        //       include at least one photo.  Photos usually help when reading foreign news
        //       articles.
        // null: [bannerAndAdFinder] Some sites include images for Facebook links or advertising.
        //       Gov.CN usually doesn't have these, but occasionally there are extraneous links.
        //       For the purposes of this example, this parameter is ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - content before the Article Body is
        //       extracted from the Article Web-Page is not saved.  This can occasionally be
        //       useful if the HTML <HEAD>...</HEAD> has JSON or React-JS data to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and also it
        // will download the pictures / image included in the article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML remove operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class "ToHTML"
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body like advertising or links to other articles is usually necessary.
        //       Anywhere between 1 and 10 lines of NodeSearch Removal Operations will get rid of
        //       unnecessary HTML.  For the purposes of this example, such a cleaning operation is
        //       not done here - although the final articles do include some "links to other
        //       articles" that is not "CLEANED" like it should be.

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed in the
        // JavaDoc Comments in the top of this method.  If this method is run in an MS-DOS
        // or Windows Environment, there will be no screen colors available to view.
        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            rootDir + "Gov.CN.log.html"
        );
    }

    /**
     * Prints the contents of the Data File.  Invoking this command allows a programmer to see
     * which "sub-sections" are ascribed to each of the different news-paper definitions in this
     * class.  Each "sub-section" is nothing more than a {@code URL}-branch of the primary web
     * site {@code URL}.
     *
     * <DIV CLASS="HTML">{@code
     * <!-- If the following were the primary news-site -->
     * http://news.baidu.com
     *
     * <!-- This would be a "sub-section" of the primary site -->
     * http://news.baidu.com/sports
     * }</DIV>
     *
     * <BR /><BR />Can be called from the command line.
     * <BR /><BR />If a single command-line argument is passed to {@code "argv[0]"}, the contents
     * of the "Sections URL Data File" will be output to a text-file that is named using the
     * {@code String} passed to {@code "argv[0]"}.
     *
     * @param argv These are the command line arguments passed by the JRE to this method.
     * @throws IOException If there are any problems while attempting to save the output to
     * the output file (if one was named / requested).
     */
    public static void main(String[] argv) throws IOException
    {
        // Uncomment this line to run the example code (instead of section-data print)
        // runExample(); System.exit(0);

        // The data-file is loaded into private field "newsPaperSections"
        // This private field is a Hashtable<String, Vector<URL>>.  Convert each of
        // these sections so that they may be printed to terminal and maybe to a text
        // file.
        StringBuilder sb = new StringBuilder();

        for (Map.Entry<String, Vector<URL>> entry : newsPaperSections.entrySet())
        {
            // The news-site key, on a line of its own
            sb.append(entry.getKey()).append('\n');

            // One section-URL per line
            for (URL section : entry.getValue())
            {
                sb.append(section.toString()).append('\n');
            }

            sb.append("\n\n***************************************************\n\n");
        }

        String s = sb.toString();
        System.out.println(s);

        // If there is a command-line parameter, it shall be interpreted as a file-name.
        // The contents of the "sections data-file" (as text) will be written to a file on the
        // file-system using the String-value of "argv[0]" as the name of the output-filename.
        if (argv.length == 1)
        {
            FileRW.writeFile(s, argv[0]);
        }
    }

    // NOTE(review): The two lines below are left-over, commented-out fragments (they look like
    // the beginnings of a Baidu news-site definition).  Confirm whether they may be deleted.
    //
    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$")));
    // ArticleGet.usual(TextComparitor.CN_CI, "article-content"));

    /**
     * The News Site at address: <CODE><A HREF="https://www.abc.es/" TARGET=_blank>
     * "https://www.abc.es/"</A></CODE> is slightly more complicated when retrieving News-Article
     * Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<ARTICLE>...</ARTICLE>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code article a}</B>.  Specifically it says to find all {@code 'Anchor'} elements
     * that are descendants of {@code 'Article'} Elements.
     *
     * <BR /><BR />Only the <I>first</I> Anchor found inside each {@code <ARTICLE>} element is
     * used, and an Anchor without an {@code HREF} attribute is skipped.
     *
     * @param url The {@code URL} of the section-page.  It is not used by this method.
     * @param page The vectorized-HTML of the section-page that is searched for links.
     * @return The {@code HREF} attribute-values that were found.  This list may be empty.
     *
     * @see TagNodeFindL1Inclusive#all(Vector, String)
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))
        {
            // Now find the <A HREF=...> ... </A>
            TagNode tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a");

            if (tn == null) continue;

            String urlStr = tn.AV("href");

            if (urlStr != null) ret.add(urlStr);
        }

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.abc.es/" TARGET=_blank>https://www.abc.es/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>ABC España</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Spain</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://abc.es}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code 'HREF'} must end with {@code '.html'}
     *          <BR />See: {@link StrFilter#comparitor(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#EW_CI}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #ABC_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <MAIN>...</MAIN>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite} instance.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ABC.ES-ScrapeURLs.html'>
     *      ABC.ES ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> {@code ScrapeArticles}
     *      <BR /><B>IMPORTANT NOTE:</B> Though {@code ScrapeURL's} code <I>will check for
     *      duplicate {@code URL's}</I> that may be returned <I>within any given-section</I>,
     *      {@code Article URL's} may be repeated among the different sections of the newspaper.
     *      Since the {@code URL}-scrape returned nearly 3,000 articles, the log of
     *      an {@code Article} scrape is not included here.  Proper duplicate {@code URL} checking
     *      code has obviously been written, but would be too complicated to show in this example.
     *      </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ABCES = new NewsSite
    (
        "ABC España", Country.Spain, "https://www.abc.es/", LC.ES,
        "ABC is a Spanish national daily newspaper. It is the third largest general-interest " +
        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
        newsPaperSections.get("ABCES"),
        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
        NewsSites::ABC_LINKS_GETTER,
        ArticleGet.usual("main"),
        null /* bannerAndAdFinder */
    );

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elpulso.mx/" TARGET=_blank>https://www.elpulso.mx/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Pulso, México</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>México</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elpulso.mx}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match: {@code http://some.domain/YYYY/MM/DD/<article-name>/}</TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
     *          in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="entry-content">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     */
    public static final NewsSite Pulso = new NewsSite
    (
        "El Pulso, México", Country.Mexico, "https://elpulso.mx", LC.ES,
        "El Pulso newspaper is Spanish language newspaper in Mexico. It is showing breaking news, " +
        "headlines, kids news, tourism news, entertainment news, study news, industrial news, " +
        "economical news, health & beauty news, crime news, career news, Travel news, " +
        "diet & fitness news, Top stories, special news, celebrity news.",
        newsPaperSections.get("PULSO"),
        StrFilter.regExKEEP(Pattern.compile(
            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
        ), false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "entry-content"),
        null /* bannerAndAdFinder */
    );

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElNacional.com/" TARGET=_blank>
     * "https://www.ElNacional.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="td-module-thumb">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div.td-module-thumb a}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code 'DIV'} Elements where said
     * Divider's CSS {@code CLASS} contains {@code 'td-module-thumb'}.
     *
     * <BR /><BR />Only the <I>first</I> Anchor found inside each such Divider is used, and an
     * Anchor without an {@code HREF} attribute is skipped.
     *
     * @param url The {@code URL} of the section-page.  It is not used by this method.
     * @param page The vectorized-HTML of the section-page that is searched for links.
     * @return The {@code HREF} attribute-values that were found.  This list may be empty.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#first(Vector, int, int, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "td-module-thumb"))
        {
            // Now find the <A HREF=...> ... </A>
            TagNode tn = TagNodeGet.first
                (page, article.start, article.end, TC.OpeningTags, "a");

            if (tn == null) continue;

            String urlStr = tn.AV("href");

            if (urlStr != null) ret.add(urlStr);
        }

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elnacional.com/" TARGET=_blank>
     * https://www.elnacional.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Nacional</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Venezuela</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elnacional.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link URLFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD><B>{@code null}</B>.  The {@code LinksGet} provided here will only return valid
     *          {@code Article URL's}, so there is no need for a {@code URLFilter}.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_NACIONAL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeURLs.html'>
     *      El Nacional ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElNacional-ScrapeArticles.html'>
     *      El Nacional ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElNacional = new NewsSite
    (
        "El Nacional", Country.Venezuela, "https://elnacional.com", LC.ES,
        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial El Nacional, " +
        "most widely known for its El Nacional newspaper and website. It, along with Últimas " +
        "Noticias and El Universal, are the most widely read and circulated daily national " +
        "newspapers in the country, and it has an average of more than 80,000 papers distributed " +
        "daily and 170,000 copies on weekends.",
        newsPaperSections.get("ElNacional"),
        (URLFilter) null, /* The LinksGetter will only return valid Anchor's */
        NewsSites::EL_NACIONAL_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );

    /**
     * The News Site at address: <CODE><A HREF="https://www.ElEspectador.com/" TARGET=_blank>
     * "https://www.ElEspectador.com/"</A></CODE> is slightly more complicated when retrieving
     * News-Article Links.
     *
     * <BR /><BR />Notice that each newspaper article {@code URL}-link is "wrapped" in an HTML
     * {@code '<DIV CLASS="Card ...">...</DIV>'} Element.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div.Card a.card-link}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements whose CSS {@code Class} contains {@code 'card-link'} and which
     * are descendants of {@code 'DIV'} Elements where said Divider's
     * CSS {@code CLASS} contains {@code 'Card'}.
     *
     * <BR /><BR />Only the <I>first</I> matching Anchor found inside each such Divider is used,
     * and an Anchor without an {@code HREF} attribute is skipped.
     *
     * @param url The {@code URL} of the section-page.  It is not used by this method.
     * @param page The vectorized-HTML of the section-page that is searched for links.
     * @return The {@code HREF} attribute-values that were found.  This list may be empty.
     *
     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
        for (DotPair article : InnerTagFindInclusive.all
            (page, "div", "class", TextComparitor.C, "Card"))
        {
            // Now find the <A CLASS="card-link" HREF=...> ... </A>
            TagNode tn = InnerTagGet.first
                (page, article.start, article.end, "a", "class", TextComparitor.C, "card-link");

            if (tn == null) continue;

            String urlStr = tn.AV("href");

            if (urlStr != null) ret.add(urlStr);
        }

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.elespectador.com/" TARGET=_blank>
     * https://www.elespectador.com/</A></CODE>.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>El Espectador</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>Colombia</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://elespectador.com}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Spanish</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must end with a forward-slash {@code '/'} character.
     *          <BR />See: {@link TextComparitor#ENDS_WITH}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #EL_ESPECTADOR_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <ARTICLE>...</ARTICLE>}<BR />See: {@link ArticleGet#usual(String)}</TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeURLs.html'>
     *      El Espectador ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/ElEspectador-ScrapeArticles.html'>
     *      El Espectador ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite ElEspectador = new NewsSite
    (
        // NOTE(review): The country is spelled "Columbia" in this name, although the country
        // is Colombia (see 'Country.Colombia').  The string is left as-is, because it cannot
        // be seen from this file whether the site-name is relied upon elsewhere.
        "El Espectador, Columbia", Country.Colombia, "https://elespectador.com", LC.ES,
        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation within " +
        "Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and published " +
        "since 1915 in Bogotá. It changed from a daily to a weekly edition in 2001, following a " +
        "financial crisis, and became a daily again on 11 May 2008, a comeback which had been " +
        "long rumoured, in tabloid format (28 x 39.5 cm). From 1997 to 2011 its main shareholder " +
        "was Julio Mario Santo Domingo.",
        newsPaperSections.get("ElEspectador"),
        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
        ArticleGet.usual("article"),
        null /* bannerAndAdFinder */
    );


    /**
     * The News Site at address: <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * "https://www.gov.cn/"</A></CODE> has a Java-Script "Links Carousel".  Essentially, there
     * is a section with "Showcased News Articles" that is intended to emphasize anywhere
     * between four and eight primary articles.
     *
     * <BR /><BR />This Links-Carousel is wrapped in an HTML Divider Element as below:
     * {@code <DIV CLASS="slider-carousel">}.
     *
     * <BR /><BR />If this code were translated into an "XPath Query" or "CSS Selector", it would
     * read: <B>{@code div[class=slider-carousel] a}</B>.  Specifically it says to find all
     * {@code 'Anchor'} elements that are descendants of {@code '<DIV CLASS="slider-carousel">'}
     * Elements.
     *
     * <BR /><BR />Only the <I>first</I> carousel Divider on the page is searched.  If no such
     * Divider is located, an empty list is returned.
     *
     * @param url The {@code URL} of the section-page.  It is not used by this method.
     * @param page The vectorized-HTML of the section-page that is searched for links.
     * @return The {@code HREF} attribute-values that were found.  This list may be empty.
     *
     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
     * @see TagNodeGet#all(Vector, TC, String[])
     * @see TagNode#AV(String)
     */
    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
    {
        Vector<String> ret = new Vector<>();

        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");

        // Defensive guard: presumably a null is what comes back when the page has no carousel
        // (TODO confirm against the NodeSearch documentation).  Without this check, a page that
        // is missing the carousel would be passed on as a null Vector to the search below.
        if (carouselDIV == null) return ret;

        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
        // Divider.
        for (TagNode tn : TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
        {
            String urlStr = tn.AV("href");

            if (urlStr != null) ret.add(urlStr);
        }

        return ret;
    }

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />The "Carousels" are just the emphasized or "HiLighted" links that are
     * on three separate pages.  There is a complete-link {@code NewsSite} definition that
     * will retrieve all links - <I>not just the links hilited by the carousel.</I>
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD>Invokes method {@link #GOVCN_CAROUSEL_LINKS_GETTER(URL, Vector)}</TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeURLs.html'>
     *      Gov.CN Carousel ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCNCarousel-ScrapeArticles.html'>
     *      Gov.CN Carousel ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCNCarousel = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCNCarousel"),
        StrFilter.regExKEEP(GOVCN_ARTICLE_URL, false),
        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );

    /**
     * This is the {@code NewsSite} definition for the Newspaper located at:
     * <CODE><A HREF="https://www.gov.cn/" TARGET=_blank>
     * https://www.gov.cn/</A></CODE>.
     *
     * <BR /><BR />This version of the "Gov.CN" website will scour a larger set of section
     * {@code URL's}, and will not limit the returned Article-Links to just those found on the
     * java-script carousel.  The Java-Script Carousel will almost always have a total of five
     * news-article links available.  This definition of {@code 'NewsSite'} may return up to
     * thirty to forty different articles per news-section.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Parameter</TH> <TH>Significance</TH></TR>
     * <TR><TD>Newspaper Name</TD> <TD>Chinese Government Web Portal</TD></TR>
     * <TR><TD>Country of Origin</TD> <TD>People's Republic of China</TD></TR>
     * <TR><TD>Website URL</TD> <TD>{@code https://gov.cn}</TD></TR>
     * <TR><TD>Newspaper Printing Language</TD> <TD>Mandarin Chinese</TD></TR>
     * </TABLE>
     *
     * <BR /><TABLE CLASS="NEWSSITE">
     * <TR><TH>Parameter</TH><TH>Purpose</TH><TH>Value</TH></TR>
     * <TR> <TD>Newspaper Article Groups / Sections</TD>
     *      <TD>Scrape Sections</TD>
     *      <TD><I>Retrieved from Data File</I></TD>
     * </TR>
     * <TR> <TD><B>{@link StrFilter}</B></TD>
     *      <TD>News Web-Site Section-Page Article-Link ({@code <A HREF=...>}) Filter</TD>
     *      <TD>{@code HREF} must match:
     *          {@code "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?content_\\d+.htm(?:l)?(#\\d+)?"}
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link LinksGet}</B></TD>
     *      <TD>Used to <B><I>manually</I></B> retrieve Article-Link {@code URL's}</TD>
     *      <TD><B>{@code null}</B>.  Retrieves <B><I>all</I></B> Anchor-Links on a Section-Page.
     *          Note that {@code URL's} must still pass the previous {@code StrFilter} (above)
     *          in order to be parsed as {@link Article}'s.
     *      </TD>
     * </TR>
     * <TR> <TD><B>{@link ArticleGet}</B></TD>
     *      <TD>Retrieves Article-Body Content from an Article-Link Web-Page</TD>
     *      <TD>{@code <DIV CLASS="article ...">...</DIV>}
     *          <BR />See: {@link ArticleGet#usual(TextComparitor, String[])}
     *          <BR />See: {@link TextComparitor#C}
     *      </TD>
     * </TR>
     * </TABLE>
     *
     * <BR />View a copy of the logs that are generated from using this {@code NewsSite}.
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeURLs.html'>
     *      Gov.CN ScrapeURLs LOG</A></B></CODE>
     *      </LI>
     * <LI> <CODE><B><A HREF='doc-files/Logs/GovCN-ScrapeArticles.html'>
     *      Gov.CN ScrapeArticles LOG</A></B></CODE>
     *      </LI>
     * </UL>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
     */
    public static final NewsSite GovCN = new NewsSite
    (
        "Chinese Government Web Portal", Country.China, "https://gov.cn/", LC.ZH_CN,
        "The Chinese Government Sponsored Web-Site",
        newsPaperSections.get("GovCN"),
        StrFilter.regExKEEP(GOVCN_ARTICLE_URL, false),
        null /* LinksGet */,
        ArticleGet.usual(TextComparitor.C, "article"),
        null /* bannerAndAdFinder */
    );

}