package Torello.HTML.Tools.NewsSite;

import java.util.function.*;
import java.util.*;
import java.util.regex.*;

import java.net.URL;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import Torello.Java.ParallelArrayException;

/**
 * A function-pointer / lambda target for extracting an article's content from the web-page
 * from whence it was downloaded; including several {@code static}-builder methods for the
 * most common means of finding the HTML-Tags that wrap article-HTML on news-media websites.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
 */
@FunctionalInterface
public interface ArticleGet extends java.io.Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI> */
    public static final long serialVersionUID = 1;

    // ******************************************************************************************
    // Standard Functional Interface Method
    // ******************************************************************************************

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
     *
     * <BR /><BR />This method's purpose is to take a "Scraped HTML Page" (stored as a
     * Vectorized-HTML Web-Page), and return an HTML {@code Vector} that contains only the
     * "Article Content" - <I>which is usually just called the "Article Body."</I>  Perhaps it
     * seems daunting, but <I>the usual way</I> to get the actual article-body of an HTML
     * News-Website Page is to simply identify an {@code HTML <DIV ID="..." CLASS="...">}
     * surrounding element.
     *
     * <BR /><BR />This interface has <I>several different static-methods called "usual"</I>
     * which automatically create a page-getter.  The example at the top of this interface's
     * documentation should highlight how this works.  Extracting news-content from a page that
     * has already been downloaded is usually trivial.
     * The point really becomes identifying the {@code <DIV>}'s {@code class=...} or
     * {@code id=...} attributes and page-structure to find the article-body.  Generally, in
     * your browser just click {@code View Source} and look at it manually to find the
     * attributes used.  Using the myriad Get methods from {@code Torello.HTML.NodeSearch}
     * usually boils down to code that looks suspiciously like Java-Script:
     *
     * <BR /><DIV CLASS="JAVASCRIPT">{@code
     * var articleHTML = document.getElementById("article-body").innerHTML;
     *
     * // or...
     * var articleHTML = document.getElementsByClassName("article-body")[0].innerHTML;
     * }</DIV>
     *
     * <BR />Using the {@code NodeSearch} package, the above DOM-Tree Java-Script is easily written
     * in Java as below:
     *
     * <DIV CLASS="SNIP">{@code
     * // For articles with HTML divider elements having an "ID" attribute to specify the article
     * // body, get the article using the code below.  In this example, the particular newspaper
     * // web-site has articles whose content ("Article Body") is simply wrapped in an
     * // HTML Divider Element: <DIV ID="article-body"> ... </DIV>
     *
     * // For extracting that content use the NodeSearch Package Class: InnerTagGetInclusive
     *
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first
     *     (page, "div", "id", TextComparitor.EQ_CI, "article-body");
     *
     * // To use this NodeSearch Package Class with the NewsSite Package, simply use one of the
     * // 'usual' methods in class ArticleGet, and the lambda Functional Interface "ArticleGet"
     * // will be built automatically as such:
     *
     * ArticleGet getter = ArticleGet.usual("div", "id", TextComparitor.EQ_CI, "article-body");
     *
     * // For articles with HTML divider elements having a "CLASS" attribute to specify
     * // the article body, get the article with the following code.
     * // Note that in this example the article body is wrapped in an HTML Divider Element that
     * // has the characteristics <DIV CLASS="article-body"> ... </DIV>.  The content of a
     * // Newspaper Article can be easily extracted with just one line of code using the methods
     * // in the NodeSearch Package as follows:
     *
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first
     *     (page, "div", "class", TextComparitor.C, "article-body");
     *
     * // which should be written for use with the ScrapeArticles class using the 'usual'
     * // methods in ArticleGet as such:
     *
     * ArticleGet getter = ArticleGet.usual(TextComparitor.C, "article-body");
     * }</DIV>
     *
     * <BR /><BR /><B>NOTE:</B> For all examples above, the text-string "article-body" will be
     * a tag-value that (was) decided/chosen by the HTML news-website, or content-website you want
     * to scrape.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red">ALSO:</SPAN></B> One might have to be careful about
     * modifying the input to this function.  Each and every one of the NodeSearch classes
     * retrieves a copy (read: <B><I>a clone</I></B>) of the input {@code Vector} (other than the
     * classes that actually use the term "remove.")  However, if you were to write an Article
     * Get lambda of your own (rather than using the "usual" methods), make sure you know whether
     * you are going to <I>intentionally</I> modify the input-page, and if so, remember you have.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red">FURTHERMORE:</SPAN></B> There are many content-based
     * web-sites that have some (even "a lot") of spurious HTML information inside the primary
     * article body, even after the header and footer information has been eliminated.  It may be
     * necessary to do some vector-cleaning later on.  For example: getting rid of "Post to
     * Facebook", "Post to Twitter" or "E-Mail Link" buttons.
     *
     * @param url The {@code URL} from which the article web-page was retrieved.  The getters
     * built by the {@code 'usual'} factory-methods of this interface only validate it (together
     * with {@code 'page'}) before searching.
     *
     * @param page The vectorized-HTML of the downloaded article web-page.
     *
     * @return An HTML {@code Vector} holding only the article-body portion of {@code 'page'}.
     *
     * @throws ArticleGetException If the article-body cannot be extracted.  The getters built by
     * the {@code 'usual'} factory-methods throw this when the input fails validation, when the
     * underlying search throws, or when the search produces a null or empty {@code Vector}.
     */
    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;

    // ******************************************************************************************
    // ArticleGet Factory / Generator static-methods
    // ******************************************************************************************

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This builds an "Article Getter" based on a parameter-specified HTML Tag.  Two
     * or three common HTML "semantic elements" used for wrapping newspaper article-content
     * include these:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code <ARTICLE ...> article-body </ARTICLE>}</LI>
     * <LI>{@code <MAIN ...> article-body </MAIN>}</LI>
     * <LI>{@code <SECTION ...> article-body </SECTION>}</LI>
     * </UL>
     *
     * <BR />Identifying which tag to use can be accomplished by going to the main-page of an
     * internet news web-site, selecting a news-article, and then using the {@code "View Source"}
     * or the {@code "View Page Source"} depending upon which browser you are using, and then
     * scanning the HTML to find what elements are used to wrap the article-body.
     *
     * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the
     * {@code class NewsSiteScrape}.  As long as the news or content website that you are scraping
     * has its page-body wrapped inside of the HTML element <I><B>whose tag-name is the one you
     * have uncovered by inspecting the page-manually</B></I> then the {@code ArticleGet} produced
     * by this factory-method will retrieve your page content appropriately.
     *
     * @param htmlTag This should be the HTML element that is used to wrap the actual news-content
     * article-body of an HTML news web-site page.
143 * 144 * @return This returns an "Article Getter" that just picks out the part of a news-website 145 * article that lies between the open and closed version of the specified htmlTag. 146 */ 147 public static ArticleGet usual(String htmlTag) 148 { 149 final String htmlTagLC = htmlTag.toLowerCase(); 150 151 // This 'final String' is merely used for proper error reporting in any potential 152 // exception-messages, nothing else. 153 final String functionNameStr = "TagNodeGetInclusive.first(page, \"" + htmlTagLC + "\");"; 154 155 156 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 157 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 158 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 159 160 // Check for valid HTML Token 161 HTMLTokException.check(htmlTagLC); 162 163 // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations. 164 InclusiveException.check(htmlTagLC); 165 166 167 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 168 // Build the instance, using a lambda-expression 169 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 170 171 return (URL url, Vector<HTMLNode> page) -> 172 { 173 // This exception-check is done on every invocation of this Lambda-Function. 174 // It is merely checking that these inputs are not-null, and page is of non-zero size. 175 // ArticleGetException is a compile-time, checked exception. It is important to halt 176 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 177 // NOTE: This would imply an internal-error with class Download has occurred. 
178 179 ArticleGetException.check(url, page); 180 181 Vector<HTMLNode> ret; 182 183 try 184 { ret = TagNodeGetInclusive.first(page, htmlTagLC); } 185 186 catch (Exception e) 187 { 188 throw new ArticleGetException 189 (ArticleGetException.GOT_EXCEPTION, functionNameStr, e); 190 } 191 192 // These error-checks are used to deduce whether the "Article Get" was successful. 193 // When this exception is thrown, it means that the user-specified means of "Retrieving 194 // an Article Body" FAILED. In this case, the "innerHTML" of the specified htmlTag was 195 // not found, and produced a null news-article page, or an empty news-article page. 196 197 if (ret == null) throw new ArticleGetException 198 (ArticleGetException.RET_NULL, functionNameStr); 199 200 if (ret.size() == 0) throw new ArticleGetException 201 (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr); 202 203 return ret; 204 }; 205 } 206 207 /** 208 * <I>This is a static, factory method for building ArticleGet.</I> 209 * 210 * <BR /><BR />This builds an "Article Getter" for you, using the most common way to get 211 * an article - specifically via the {@code HTML <DIV CLASS="...">} element and it's 212 * {@code CSS 'class'} selector. 213 * 214 * <BR /><BR />Call this method, and use the ArticleGet that it generates/returns with the 215 * {@code class NewsSiteScrape}. As long as the news or content website that you are scraping 216 * has it's page-body wrapped inside of an {@code HTML <DIV>} element <I><B>whose 217 * {@code CSS 'class'} specifier is one you have uncovered by inspecting the 218 * page-manually</B></I> then {@code ArticleGet} produced by this factory-method will retrieve 219 * your page content appropriately. 220 * 221 * @param tc This should be any of the pre-instantiated {@code TextComparitor's}. Again, a 222 * TextComparitor is just a {@code String} compare function like: {@code equals, contains, 223 * StrCmpr.containsIgnoreCase(...)}, etc... 
224 * 225 * @param cssClassCompareStrings These are the values to be used by the 226 * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"} 227 * from the list of {@code DIV} elements on the page. 228 * 229 * @return This returns an "Article Getter" that just picks out the part of a news-website 230 * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by 231 * the "CSS (Cascading Style Sheets) {@code 'class'} identifier, and the 232 * {@code TextComparitor} parameter that you have chosen. 233 */ 234 public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings) 235 { 236 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 237 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 238 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 239 240 // Check for valid compareStrings 241 TCCompareStrException.check(cssClassCompareStrings); 242 243 if (tc == null) throw new NullPointerException 244 ("Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here."); 245 246 // This 'final' String is merely used for proper error reporting in any potential 247 // exception-messages, nothing else. 248 249 final String functionNameStr = 250 "InnerTagGetInclusive.first(page, \"div\", \"class\", " + 251 STR_FORMAT_TC_PARAMS(tc, cssClassCompareStrings) + ")"; 252 253 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 254 // Build the instance, using a lambda-expression 255 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 256 257 return (URL url, Vector<HTMLNode> page) -> 258 { 259 // This exception-check is done on every invocation of this Lambda-Function. 260 // It is merely checking that these inputs are not-null, and page is of non-zero size. 
261 // ArticleGetException is a compile-time, checked exception. It is important to halt 262 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 263 // NOTE: This would imply an internal-error with class Download has occurred. 264 265 ArticleGetException.check(url, page); 266 267 Vector<HTMLNode> ret; 268 269 try 270 { 271 ret = InnerTagGetInclusive.first 272 (page, "div", "class", tc, cssClassCompareStrings); 273 } 274 catch (Exception e) 275 { 276 throw new ArticleGetException 277 (ArticleGetException.GOT_EXCEPTION, functionNameStr, e); 278 } 279 280 // These error-checks are used to deduce whether the "Article Get" was successful. 281 // When this exception is thrown, it means that the user-specified means of "Retrieving 282 // an Article Body" FAILED. In this case, the "innerHTML" of the specified htmltag and 283 // class of the <DIV CLASS=...> produced a null news-article page, or an empty 284 // news-article page. 285 286 if (ret == null) throw new ArticleGetException 287 (ArticleGetException.RET_NULL, functionNameStr); 288 289 if (ret.size() == 0) throw new ArticleGetException 290 (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr); 291 292 return ret; 293 }; 294 } 295 296 /** 297 * <I>This is a static, factory method for building ArticleGet.</I> 298 * 299 * <BR /><BR />This gives more options for building your article getter. In almost 95% of the 300 * news-websites, the article or page-body is between and open and close HTML DIV element, 301 * and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute. 302 * <I><B>However,</B></I> This factory method allows a programmer to select article content 303 * that handles other cases than the {@code 95%}, where you specify the HTML-token, 304 * attribute-<B STYLE='color: red;'>name</B> and use the usual {@code TextComparitor} to find 305 * the article. 
306 * 307 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 308 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 309 * or {@code <FRAME>}, then you may. 310 * 311 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 312 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 313 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 314 * 315 * @param tc This should be any of the pre-instantiated {@code TextComparitor's}. Again, a 316 * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 317 * contains, StrCmpr.containsIgnoreCase(...)}. 318 * 319 * @param attributeValueCompareStrings These are the {@code String's} compared with using 320 * the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}. 321 * 322 * @return This returns an "Article Getter" that picks out the part of a news-website article 323 * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id, 324 * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified 325 * {@code inner-tag} can be matched by the {@code TextComparitor} and the 326 * compare-{@code String's}. 327 */ 328 public static ArticleGet usual 329 (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings) 330 { 331 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 332 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 
333 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 334 335 TCCompareStrException.check(attributeValueCompareStrings); 336 337 if (tc == null) throw new NullPointerException 338 ("Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here."); 339 340 final String htmlTagLC = htmlTag.toLowerCase(); 341 final String innerTagLC = innerTag.toLowerCase(); 342 343 // This 'final String' is merely used for proper error reporting in any potential 344 // exception-messages, nothing else. 345 346 final String functionNameStr = 347 "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " + 348 STR_FORMAT_TC_PARAMS(tc, attributeValueCompareStrings) + ")"; 349 350 // Check for valid HTML Tag. 351 HTMLTokException.check(htmlTagLC); 352 353 // Self-Closing / Singleton Tags CANNOT be used with INCLUSIVE Retrieval Operations. 354 InclusiveException.check(htmlTagLC); 355 356 357 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 358 // Build the instance, using a lambda-expression 359 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 360 361 return (URL url, Vector<HTMLNode> page) -> 362 { 363 // This exception-check is done on every invocation of this Lambda-Function. 364 // It is merely checking that these inputs are not-null, and page is of non-zero size. 365 // ArticleGetException is a compile-time, checked exception. It is important to halt 366 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 367 // NOTE: This would imply an internal-error with class Download has occurred. 
368 369 ArticleGetException.check(url, page); 370 371 Vector<HTMLNode> ret; 372 373 try 374 { 375 ret = InnerTagGetInclusive.first 376 (page, htmlTagLC, innerTagLC, tc, attributeValueCompareStrings); 377 } 378 catch (Exception e) // unlikely 379 { 380 throw new ArticleGetException 381 (ArticleGetException.GOT_EXCEPTION, functionNameStr, e); 382 } 383 384 // These error-checks are used to deduce whether the "Article Get" was successful. 385 // When this exception is thrown, it means that the user-specified means of "Retrieving 386 // an Article Body" FAILED. In this case, the "innerHTML" of the specified htmlTag and 387 // attribute produced a null news-article page, or an empty news-article page. 388 389 if (ret == null) throw new ArticleGetException 390 (ArticleGetException.RET_NULL, functionNameStr); 391 392 if (ret.size() == 0) throw new ArticleGetException 393 (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr); 394 395 return ret; 396 }; 397 } 398 399 /** 400 * <I>This is a static, factory method for building ArticleGet.</I> 401 * 402 * <BR /><BR />This gives more options for building your article getter. In almost 95% of the 403 * news-websites, the article or page-body is between and open and close HTML DIV element, and 404 * the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute. 405 * <I><B>However,</B></I> This factory method allows a programmer to select article content 406 * that handles other cases than the {@code 95%}. Here, you may specify the HTML-token, 407 * attribute-<B STYLE='color: red;'>name</B> and use a Java Regular-Expression handler to 408 * test the <B STYLE='color: red;'>value</B> of the attribute - no matter how complicated or 409 * bizarre. 410 * 411 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 412 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 413 * or {@code <FRAME>}, then you may. 
414 * 415 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 416 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 417 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 418 * 419 * @param innerTagValuePattern Any regular-expression. It will be used to <B>PASS</B> or 420 * <B>FAIL</B> the attribute-<B STYLE='color: red;'>value</B> <I>(a name that is used 421 * interchangeably in this scrape/search package for 422 * "inner-tag-<B STYLE='color: red;'>value</B>")</I> when compared against this 423 * regular-expression parameter. 424 * 425 * <BR /><BR /><B>HELP:</B> This would be like saying: 426 * <DIV CLASS="SNIP">{@code 427 * // Pick some random HTML TagNode 428 * TagNode aTagNode = (TagNode) page.elementAt(index_to_test); 429 * 430 * // Gets the attribute value of "innerTag" 431 * String attributeValue = aTagNode.AV(innerTag); 432 * 433 * // Make sure the HTML-token is as specified 434 * // calls to: java.util.regex.*; 435 * boolean passFail = aTagNode.tok.equals(htmlTag) && 436 * innerTagValuePattern.matcher(attributeValue).find(); 437 * }</DIV> 438 * 439 * @return This returns an "Article Getter" that picks out the part of a news-website article 440 * that lays between the HTML element which matches the htmlTag, innerTag and value-testing 441 * regex {@code Pattern "innerTagValuePattern"}. 442 */ 443 public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern) 444 { 445 final String htmlTagLC = htmlTag.toLowerCase(); 446 final String innerTagLC = innerTag.toLowerCase(); 447 448 // This 'final String' is merely used for proper error reporting in any potential 449 // exception-messages, nothing else. 
450 451 final String functionNameStr = 452 "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " + 453 innerTagValuePattern.pattern() + ")"; 454 455 456 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 457 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 458 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 459 460 HTMLTokException.check(htmlTagLC); 461 InclusiveException.check(htmlTagLC); 462 463 464 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 465 // Build the instance, using a lambda-expression 466 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 467 468 return (URL url, Vector<HTMLNode> page) -> 469 { 470 // This exception-check is done on every invocation of this Lambda-Function. 471 // It is merely checking that these inputs are not-null, and page is of non-zero size. 472 // ArticleGetException is a compile-time, checked exception. It is important to halt 473 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 474 // NOTE: This would imply an internal-error with class Download has occurred. 475 476 ArticleGetException.check(url, page); 477 478 Vector<HTMLNode> ret; 479 480 try 481 { 482 ret = InnerTagGetInclusive.first 483 (page, htmlTagLC, innerTagLC, innerTagValuePattern); 484 } 485 catch (Exception e) // unlikely 486 { 487 throw new ArticleGetException 488 (ArticleGetException.GOT_EXCEPTION, functionNameStr, e); 489 } 490 491 // These error-checks are used to deduce whether the "Article Get" was successful. 492 // When this exception is thrown, it means that the user-specified means of "Retrieving 493 // an Article Body" FAILED. In this case, the "innerHTML" of the specified htmlTag and 494 // attribute produced a null news-article page, or an empty news-article page. 
495 496 if (ret == null) throw new ArticleGetException 497 (ArticleGetException.RET_NULL, functionNameStr); 498 499 if (ret.size() == 0) throw new ArticleGetException 500 (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr); 501 502 return ret; 503 }; 504 } 505 506 /** 507 * <I>This is a static, factory method for building ArticleGet.</I> 508 * 509 * <BR /><BR />This gives more options for building your article getter. In almost 95% of the 510 * news-websites, the article or page-body is between and open and close HTML {@code 'DIV'} 511 * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute. 512 * <I><B>However,</B></I> This factory method allows a programmer to select article content 513 * that handles other cases than the {@code 95%}, where you specify the HTML-token, 514 * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the 515 * page-body. 516 * 517 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 518 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 519 * or {@code <FRAME>}, then you may. 520 * 521 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 522 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 523 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 524 * 525 * @param p This java "lambda {@code Predicate}" will just receive the 526 * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer. 527 * 528 * @return This returns an "Article Getter" that matches an HTML element specified by 529 * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter 530 * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag. 
531 */ 532 public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p) 533 { 534 final String htmlTagLC = htmlTag.toLowerCase(); 535 final String innerTagLC = innerTag.toLowerCase(); 536 537 // This 'final' String is merely used for proper error reporting in any potential 538 // exception-messages, nothing else. 539 540 final String functionNameStr = 541 "InnerTagGetInclusive.first(page, \"" + htmlTag + "\", \"" + innerTag + "\", " + 542 "Predicate<String>)"; 543 544 545 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 546 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 547 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 548 549 HTMLTokException.check(htmlTagLC); 550 InclusiveException.check(htmlTagLC); 551 552 if (p == null) throw new NullPointerException 553 ("Null has been passed to Predicate parameter 'p'. This is not allowed here."); 554 555 556 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 557 // Build the instance, using a lambda-expression 558 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 559 560 return (URL url, Vector<HTMLNode> page) -> 561 { 562 // This exception-check is done on every invocation of this Lambda-Function. 563 // It is merely checking that these inputs are not-null, and page is of non-zero size. 564 // ArticleGetException is a compile-time, checked exception. It is important to halt 565 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 566 // NOTE: This would imply an internal-error with class Download has occurred. 
567 568 ArticleGetException.check(url, page); 569 570 Vector<HTMLNode> ret; 571 572 try 573 { ret = InnerTagGetInclusive.first(page, htmlTagLC, innerTagLC, p); } 574 575 catch (Exception e) 576 { 577 throw new ArticleGetException 578 (ArticleGetException.GOT_EXCEPTION, functionNameStr, e); 579 } 580 581 // These error-checks are used to deduce whether the "Article Get" was successful. 582 // When this exception is thrown, it means that the user-specified means of "Retrieving 583 // an Article Body" FAILED. In this case, the "innerHTML" of the specified htmlTag and 584 // attribute produced a null news-article page, or an empty news-article page. 585 586 if (ret == null) throw new ArticleGetException 587 (ArticleGetException.RET_NULL, functionNameStr, null); 588 589 if (ret.size() == 0) throw new ArticleGetException 590 (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr, null); 591 592 return ret; 593 }; 594 } 595 596 /** 597 * <I>This is a static, factory method for building ArticleGet.</I> 598 * 599 * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article 600 * body-content based on a "start-tag" and an "end-tag." It is <B><I>very</I></B> to note, 601 * that the text can only match a single text-node, and not span multiple text-nodes, or be 602 * within {@code TagNode's} at all! This should be easy to find, print up the HTML page as a 603 * {@code Vector}, and inspect it! 604 * 605 * @param startTextTag This must be text from an HTML {@code TextNode} that is 606 * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page. 607 * 608 * @param endTextTag This must be text from an HTML {@code TextNode} that is also 609 * <B><I>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page. 610 * 611 * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in 612 * the article, specified by the text-tag parameters, and gets it. 
613 */ 614 public static ArticleGet usual(String startTextTag, String endTextTag) 615 { 616 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 617 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 618 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 619 620 if (startTextTag == null) throw new NullPointerException 621 ("Null has been passed to parameter 'startTextTag', but this is not allowed here."); 622 623 if (endTextTag == null) throw new NullPointerException 624 ("Null has been passed to parameter 'endTextTag', but this is not allowed here."); 625 626 627 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 628 // Build the instance, using a lambda-expression 629 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 630 631 return (URL url, Vector<HTMLNode> page) -> 632 { 633 // This exception-check is done on every invokation of this Lambda-Function. 634 // It is merely checking that these inputs are not-null, and page is of non-zero size. 635 // ArticleGetException is a compile-time, checked exception. It is important to halt 636 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 637 // NOTE: This would imply an internal-error with class Download has occured. 638 639 ArticleGetException.check(url, page); 640 641 int start = -1; 642 int end = -1; 643 HTMLNode n = null; 644 645 while (start++ < page.size()) 646 if ((n = page.elementAt(start)) instanceof TextNode) 647 if (n.str.contains(startTextTag)) 648 break; 649 650 while (end++ < page.size()) 651 if ((n = page.elementAt(end)) instanceof TextNode) 652 if (n.str.contains(endTextTag)) 653 break; 654 655 // These error-checks are used to deduce whether the "Article Get" was successful. 
656 // When this exception is thrown, it means that the user-specified means of "Retrieving 657 // an Article Body" FAILED. In this case it is because the start/end tags were not found 658 // in the text of the vectorized-html news-article web-page. 659 660 if (start == page.size()) throw new ArticleGetException( 661 "Start Text Tag [" + startTextTag + "], was not found on the News Article HTML " + 662 "page." 663 ); 664 665 if (end == page.size()) throw new ArticleGetException( 666 "End Text Tag [" + endTextTag + "], was not found on the News Article HTML " + 667 "page." 668 ); 669 670 return Util.cloneRange(page, start, end + 1); 671 }; 672 } 673 674 /** 675 * <I>This is a static, factory method for building ArticleGet.</I> 676 * 677 * This factory method generates an "ArticleGet" that will retrieve news-article body-content 678 * based on starting and ending regular-expressions. The matches performed by the Regular 679 * Expression checker will be performed on {@code TextNode's}, not on the {@code TagNode's}, or 680 * the page itself. It is <B><I>very</I></B> to note, that the text can only match a single 681 * {@code TextNode}, and not span multiple {@code TextNode's}, or be within {@code TagNode's} 682 * at all! This should be easy to find, print up the HTML page as a {@code Vector}, and 683 * inspect it! 684 * 685 * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML 686 * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of 687 * the vectorized-HTML page. 688 * 689 * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML 690 * {@code TextNode} that is also <B><I>contained</B> in a single {@code TextNode}</I> of the 691 * vectorized-HTML page. 692 * 693 * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> 694 * in the article, specified by the regular-expression pattern-matching parameters, and gets it. 
695 */ 696 public static ArticleGet usual(Pattern startPattern, Pattern endPattern) 697 { 698 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 699 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 700 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 701 702 if (startPattern == null) throw new NullPointerException 703 ("Null has been passed to parameter 'startPattern', but this is not allowed here."); 704 705 if (endPattern == null) throw new NullPointerException 706 ("Null has been passed to parameter 'endPattern', but this is not allowed here."); 707 708 709 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 710 // Build the instance, using a lambda-expression 711 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 712 713 return (URL url, Vector<HTMLNode> page) -> 714 { 715 // This exception-check is done on every invokation of this Lambda-Function. 716 // It is merely checking that these inputs are not-null, and page is of non-zero size. 717 // ArticleGetException is a compile-time, checked exception. It is important to halt 718 // News-Site Scrape Progress when "Empty News-Page Data" is being passed here. 719 // NOTE: This would imply an internal-error with class Download has occured. 720 721 ArticleGetException.check(url, page); 722 int start = -1; 723 int end = -1; 724 HTMLNode n = null; 725 726 while (start++ < page.size()) 727 if ((n = page.elementAt(start)) instanceof TextNode) 728 if (startPattern.matcher(n.str).find()) 729 break; 730 731 while (end++ < page.size()) 732 if ((n = page.elementAt(end)) instanceof TextNode) 733 if (endPattern.matcher(n.str).find()) 734 break; 735 736 // These error-checks are used to deduce whether the "Article Get" was successful. 
737 // When this exception is thrown, it means that the user-specified means of "Retrieving 738 // an Article Body" FAILED. In this case it is because the start or end regex failed to 739 // match. 740 741 if (start == page.size()) throw new ArticleGetException( 742 "Start Pattern [" + startPattern.toString() + "], was not found on the HTML " + 743 "page." 744 ); 745 746 if (end == page.size()) throw new ArticleGetException 747 ("End Pattern [" + endPattern.toString() + "], was not found on the HTML page."); 748 749 return Util.cloneRange(page, start, end + 1); 750 }; 751 } 752 753 /** 754 * <I>This is a static, factory method for building ArticleGet.</I> 755 * 756 * This is just a way to put a list of article-parse objects into a single "branching" 757 * article-parse {@code Object}. The two parameters must be equal-length arrays, with non-null 758 * elements. Each {@code 'urlSelector'} will be tested, and when a selector passes, the 759 * {@code ArticleGet} that is created will use the "parallel getter" from the parallel array 760 * "getters." 761 * 762 * <BR /><BR /><B>LAY-SPEAK:</B> The best way to summarize this is if a programmer is going to 763 * use the {@code NewsSiteScrape} class, and planning to scrape a site that has different types 764 * of news-articles, he will need differing {@code "ArticleGet"} methods. This class will take 765 * two {@code array's} that match the {@code URL} from which the article was retrieved with the 766 * particular "getter" method you have provided. 
When I scrape the address:
     * {@code http://www.baidu.com/} - a Chinese News Web-Site, it links to at least three primary
     * domains:
     *
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI>{@code http://...chinesenews.com/director.../article...}</LI>
     * <LI>{@code http://...xinhuanet.com/director.../article...}</LI>
     * <LI>{@code http://...cctv.com/director.../article...}</LI>
     * </OL>
     *
     * <BR />Results from each of these sites need to be "handled" just ever-so-slightly
     * differently.
     *
     * @param urlSelectors This is a list of {@code Predicate<URL>} elements.  When one of these
     * returns {@code TRUE} for a particular {@code URL}, then the index of that
     * {@code URL}-selector in its {@code array} will be used to call the appropriate getter from
     * the parallel-{@code array} input-parameter {@code 'getters'}.
     *
     * @param getters This is a list of getter elements.  These should be tailored to the
     * particular news-website source that are chosen/selected by the {@code 'urlSelectors'}
     * parallel {@code array}.
     *
     * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."
     * All it does is simply traverse the first {@code array} looking for a
     * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the
     * parallel {@code array}.
     *
     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
     * <B><I>"dispatch"</I></B> or rather <B><I>"branch"</I></B> is called by {@code class
     * NewsSiteScrape}, the function/getter that is returned will throw an
     * {@code ArticleGetException}.  It is important that the programmer only allow article
     * {@code URL's} that he can capably handle to pass to {@code class NewsSiteScrape}.
797 * 798 * @throws IllegalArgumentException Will throw this exception if: 799 * 800 * <BR /><BR /><UL CLASS=JDUL> 801 * <LI>Either of these parameters are null</LI> 802 * <LI>If they are not parallel, with differing lengths.</LI> 803 * <LI>If either contain a null value.</LI> 804 * </UL> 805 */ 806 public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters) 807 { 808 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 809 // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function. 810 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 811 812 if (urlSelectors.length == 0) throw new IllegalArgumentException 813 ("parameter 'urlSelectors' had zero-elements."); 814 815 if (getters.length == 0) throw new IllegalArgumentException 816 ("parameter 'getters' had zero-elements."); 817 818 ParallelArrayException.check(urlSelectors, "urlSelectors", true, getters, "getters", true); 819 820 821 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 822 // Build the instance, using a lambda-expression 823 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 824 825 return (URL url, Vector<HTMLNode> page) -> 826 { 827 for (int i=0; i < urlSelectors.length; i++) 828 if (urlSelectors[i].test(url)) 829 return getters[i].apply(url, page); 830 831 throw new ArticleGetException( 832 "None of the urlSelecctors you have provided matched the URL sent to this " + 833 "instance of ArticleGet." 834 ); 835 }; 836 } 837 838 839 // ****************************************************************************************** 840 // Other Methods 841 // ****************************************************************************************** 842 843 /** 844 * This is the standard-java {@code Function 'andThen'} method. 
845 * 846 * @param after This is the {@code ArticleGet} that will be (automatically) applied after 847 * {@code 'this'} function. 848 * 849 * @return A new, composite {@code ArticleGet} that performs both operations. It will: 850 * 851 * <BR /><BR /><OL CLASS=JDOL> 852 * <LI> Run {@code 'this'} function's {@code 'apply'} method to a 853 * {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}. 854 * <BR /><BR /> 855 * </LI> 856 * <LI> Then it will run the {@code 'after'} function's {@code 'apply'} method to the 857 * results of {@code 'this.apply(...)'} and return the result. 858 * <BR /> 859 * </LI> 860 * </OL> 861 */ 862 default ArticleGet andThen(ArticleGet after) 863 { return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page)); } 864 865 /** 866 * This is the standard-java {@code Function 'compose'} method. 867 * 868 * @param before This is the {@code ArticleGet} that is performed first, whose results are 869 * sent to {@code 'this'} function. 870 * 871 * @return A new composite {@code ArticleGet} that performs both operations. 872 * It will: 873 * 874 * <BR /><BR /><OL CLASS=JDOL> 875 * <LI> Run the {@code 'before'} function's {@code 'apply'} method to a 876 * {@code URL, Vector<HTMLNode>}, and return a {@code Vector<HTMLNode>}. 877 * </LI> 878 * <LI> Then it will run {@code 'this'} function's {@code 'apply'} method to the 879 * results of the {@code before.apply(...)} and return the result. 880 * </LI> 881 * </OL> 882 */ 883 default ArticleGet compose(ArticleGet before) 884 { return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page)); } 885 886 /** 887 * The identity function will always return the same {@code Vector<HTMLNode>} as output that 888 * it receives as input. This is one of the {@code default} Java's lambda-methods. 
889 * 890 * @return a new {@code ArticleGet} which (it should be obvious) is of type: 891 * {@code java.util.function.Function<Vector<HTMLNode>, Vector<HTMLNode>>} 892 * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to 893 * the input {@code Vector}.</I> 894 */ 895 static ArticleGet identity() 896 { 897 return (URL url, Vector<HTMLNode> page) -> 898 { 899 ArticleGetException.check(url, page); 900 return page; 901 }; 902 } 903 904 // Internally used "Helper Method" 905 /** Internally Used. */ 906 static String STR_FORMAT_TC_PARAMS(TextComparitor tc, String... compareStrings) 907 { 908 String tcName = TextComparitor.getName(tc); 909 910 String ret = (tcName != null) 911 ? "TextComparitor." + tcName + ", " 912 : "TextComparitor.(Anonymous-TC), "; 913 914 for (int i=0; i < compareStrings.length; i++) 915 { 916 String str = compareStrings[i]; 917 918 if ((ret.length() + str.length()) > 120) 919 { 920 ret += "\"" + (str.substring(0, str.length() - (120 - ret.length())) + "..."); 921 break; 922 } 923 else ret += "\"" + str; 924 925 ret += (i < (compareStrings.length - 1)) ? "\", " : "\")"; 926 } 927 928 return ret; 929 } 930}