package Torello.HTML;

import Torello.Java.Additional.Ret2;
import Torello.Java.StringParse;

import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.util.zip.*;
import java.net.URL;
import java.net.HttpURLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import Torello.JavaDoc.StaticFunctional;
import Torello.JavaDoc.Excuse;

/**
 * Some standard utilities for transferring & downloading HTML from web-sites and then storing
 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE>
 */
@StaticFunctional(
    Excused={"USER_AGENT", "USE_USER_AGENT"},
    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
)
public class Scrape
{
    private Scrape() { }

    /**
     * When opening an {@code HTTP URL} connection, it is usually a good idea to use a
     * {@code "User Agent"}.  The default behavior in this Scrape & Search Package is to connect
     * using the {@code public static String USER_AGENT = "Chrome/61.0.3163.100";}
     *
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     *
     * <BR /><BR /><B>ALSO:</B> If the boolean {@link #USE_USER_AGENT} is set to {@code FALSE},
     * then no User-Agent will be used at all.
     */
    public static String USER_AGENT = "Chrome/61.0.3163.100";

    /**
     * When opening an {@code HTTP URL} connection, it is usually a good idea to use a
     * {@code "User Agent"}.  The default behavior in this Scrape & Search Package is to connect
     * using the {@code public static String USER_AGENT = "Chrome/61.0.3163.100";}
     *
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     *
     * <BR /><BR /><B>ALSO:</B> If this boolean is set to {@code FALSE}, then no User-Agent will be
     * used at all.
     */
    public static boolean USE_USER_AGENT = true;


    // ********************************************************************************************
    // ********************************************************************************************
    // HTTP Headers stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using the {@code GZIP Compression} encoding.  It expects the {@code java.util.Map}
     * that is returned from an invocation of {@code HttpURLConnection.getHeaderFields()}.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Case-Insensitive:</B>
     *
     * <BR />Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
     * comparisons done in this method shall ignore case.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to {@code "gzip"}, then this
     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
     */
    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
    { return headerHasEncoding(httpHeaders, "gzip"); }

    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using the {@code ZIP Compression (PKZIP, Deflate)} encoding.  It expects the
     * {@code java.util.Map} that is returned from an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to {@code "deflate"}, then this
     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
     *
     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
     * all {@code String} comparisons done in this method shall ignore case.
     */
    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
    { return headerHasEncoding(httpHeaders, "deflate"); }

    // Case-insensitive search of the "Content-Encoding" header for a given encoding token.
    //
    // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check whether certain
    //       values are present - rather than the (more simple) Map.containsKey(...).  Also, the
    //       Maps returned by HttpURLConnection have been known to contain null keys (the HTTP
    //       status line is stored under a null key), so that must be checked here as well.

    private static boolean headerHasEncoding
        (Map<String, List<String>> httpHeaders, String encoding)
    {
        for (String prop : httpHeaders.keySet())

            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))

                // Check (Case Insensitive) whether any of the property-values assigned to
                // "Content-Encoding" matches the requested encoding token.

                for (String val : httpHeaders.get(prop))
                    if (val.equalsIgnoreCase(encoding)) return true;

        // The requested encoding token wasn't found, so return FALSE.
        return false;
    }

    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using compression.  It expects the
     * {@code java.util.Map} that is returned from an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @param is This should be the {@code InputStream} that is returned from the
     * {@code HttpURLConnection} when requesting the content from the web-server that is hosting
     * the {@code URL}.  The {@code HTTP Headers} will be searched, and if a compression algorithm
     * has been specified (<I>and the algorithm is one of the algorithms automatically handled
     * by Java</I>) - then this {@code InputStream} shall be <B>wrapped</B> by the appropriate
     * decompression algorithm.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to either {@code "deflate"}
     * or {@code "gzip"}, then this shall return a <I>wrapped {@code InputStream}</I> that is
     * capable of handling the <I>decompression algorithm</I>.
     *
     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
     * all {@code String} comparisons done in this method shall ignore case.
     */
    public static InputStream checkHTTPCompression
        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
    {
        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
        //       certain values are present - rather than the (more simple) Map.containsKey(...)

        for (String prop : httpHeaders.keySet())

            // Check (Case Insensitive) if the HTTP Headers Map has the property
            // "Content-Encoding".  NOTE: The returned Maps have been known to contain null keys,
            // so check for that here.

            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))

                // Check (Case Insensitive), if any properties assigned to "Content-Encoding"
                // are "DEFLATE" or "GZIP" - then return the wrapped stream immediately.

                for (String vals : httpHeaders.get(prop))

                    if      (vals.equalsIgnoreCase("gzip"))     return new GZIPInputStream(is);
                    else if (vals.equalsIgnoreCase("deflate"))  return new ZipInputStream(is);

        // Neither of the property-values "gzip" or "deflate" were found.
        // Return the original input stream.

        return is;
    }

    /**
     * This method shall simply take as input a {@code java.util.Map} which contains the
     * {@code HTTP Header} properties that <I>must have been generated</I> by a call to the method
     * {@code HttpURLConnection.getHeaderFields()}.  It will produce a Java {@code String} that
     * lists these headers in text / readable format.
     *
     * @param httpHeaders This parameter must be an instance of
     * {@code java.util.Map<String, List<String>>} and it should have been generated by a call to
     * {@code HttpURLConnection.getHeaderFields()}.  The property names and values contained by
     * this {@code Map} will be iterated and printed to a returned {@code java.lang.String}.
     *
     * @return This shall return a printed version of the {@code Map}.
     */
    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
    {
        StringBuilder   sb  = new StringBuilder();
        int             max = 0;

        // To ensure that the output string is "aligned", check the length of each of the
        // keys in the HTTP Header.
        //
        // NOTE: The Map returned by HttpURLConnection.getHeaderFields() stores the HTTP status
        //       line under a *null* key, so a null-check is mandatory before calling length().

        for (String key : httpHeaders.keySet())
            if ((key != null) && (key.length() > max)) max = key.length();

        max += 5;

        // Iterate all of the Properties that are included in the 'httpHeaders' parameter.
        // It is important to note that the java "toString()" method for the List<String> that
        // is used to store the Property-Values list works great, without any changes.

        for (String key : httpHeaders.keySet()) sb.append(
            StringParse.rightSpacePad(((key == null) ? "" : key) + ':', max) +
            httpHeaders.get(key).toString() + '\n'
        );

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some various ways to open a connection to a website.
    // ********************************************************************************************
    // ********************************************************************************************


    // Internal helper: opens an HTTP "GET" connection to 'url', attaching the User-Agent when
    // the USE_USER_AGENT flag is set.  All of the public openConn* methods funnel through here
    // so that the connection setup logic exists in exactly one place.

    private static HttpURLConnection openGET(URL url) throws IOException
    {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();

        con.setRequestMethod("GET");

        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);

        return con;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn(URL)}
     */
    public static BufferedReader openConn(String url) throws IOException
    { return openConn(new URL(url)); }

    /**
     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading from it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet-{@code URL.}
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is));
    }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     *
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        return new Ret2<BufferedReader, Map<String, List<String>>>
            (new BufferedReader(new InputStreamReader(is)), httpHeaders);
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
     */
    public static BufferedReader openConn_iso_8859_1(String url) throws IOException
    { return openConn_iso_8859_1(new URL(url)); }

    /**
     * Will open an {@code ISO-8859} connection to a {@code URL}, and returns a
     * {@code BufferedReader} for reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        // NOTE(review): "Content-Type" describes a request *body*, which a GET does not have;
        // "Accept-Charset" is likely what was intended - kept as-is to preserve the bytes that
        // are sent over the wire.

        con.setRequestProperty("Content-Type", "text/html; charset=iso-8859-1");

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is, StandardCharsets.ISO_8859_1));
    }


    /**
     * Opens a {@code ISO-8859-1} connection to a {@code URL}, and returns a {@code BufferedReader}
     * for reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     *
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>>
        openConnGetHeader_iso_8859_1(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=iso-8859-1");

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        // BUG FIX: this previously called Charset.forName("charset=iso-8859-1"), which is an
        // illegal charset name and threw IllegalCharsetNameException on every invocation.

        return new Ret2<BufferedReader, Map<String, List<String>>>(
            new BufferedReader(new InputStreamReader(is, StandardCharsets.ISO_8859_1)),
            httpHeaders
        );
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
     */
    public static BufferedReader openConn_UTF8(String url) throws IOException
    { return openConn_UTF8(new URL(url)); }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_UTF8(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=UTF-8");

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=UTF-8");

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        return new Ret2<BufferedReader, Map<String, List<String>>>(
            new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)),
            httpHeaders
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a String.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     */
    public static String scrapePage(String url) throws IOException
    { return scrapePage(openConn(url)); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     */
    public static String scrapePage(URL url) throws IOException
    { return scrapePage(openConn(url)); }

    /**
     * This scrapes a website and dumps the entire contents into a {@code java.lang.String}.
     *
     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
     * output text/html data.
     *
     * @return The text/html data - returned inside a {@code String}
     */
    public static String scrapePage(BufferedReader br) throws IOException
    {
        // StringBuilder is used here (rather than the original StringBuffer) because this
        // buffer never escapes the method, so no synchronization is needed.

        StringBuilder   sb = new StringBuilder();
        String          s;

        while ((s = br.readLine()) != null) sb.append(s).append('\n');

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     */
    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
        throws IOException
    { return scrapePageToVector(openConn(url), includeNewLine); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     */
    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
        throws IOException
    { return scrapePageToVector(openConn(url), includeNewLine); }

    /**
     * This will scrape the entire contents of an HTML page to a {@code Vector<String>}.  Each
     * line of the text/HTML page is demarcated by the reception of a {@code '\n'} character
     * from the web-server.
     *
     * @param br This is the input source of the HTML page.  It will query for String data.
     *
     * @param includeNewLine This will append the {@code '\n'} character to the end of each
     * {@code String} in the {@code Vector}.
     *
     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
     * web-page.
     *
     * @see #scrapePageToVector(String, boolean)
     */
    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
        throws IOException
    {
        Vector<String>  ret = new Vector<>();
        String          s   = null;

        if (includeNewLine)

            while ((s = br.readLine()) != null)
                ret.add(s + '\n');

        else

            while ((s = br.readLine()) != null)
                ret.add(s);

        return ret;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Main HTML scrape functions - used by main class of "HTMLPage.getPageTokens()"
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This receives an input stream that contains a pipe to a website that will produce HTML.
     * The HTML is read from the website, and returned as a {@code String.}
     * This is called "scraping HTML."
     *
     * @param startTag If this is null, the scrape will begin with the first character received.
     * If this contains a {@code String}, the scrape will not include any text/HTML data that
     * occurs prior to the first occurrence of {@code 'startTag'}
     *
     * @param endTag If this is null, the scrape will read the entire contents of text/HTML data
     * from the {@code BufferedReader br} parameter.  If this contains a {@code String}, then data
     * will be read and included in the result until {@code 'endTag'} is received.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String.}
     *
     * @throws ScrapeException If, after download completes, either the {@code 'startTag'} or the
     * parameter {@code 'endTag'} do not represent {@code String's} that were found within the
     * downloaded page, this exception is thrown.
     */
    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag)
        throws IOException
    {
        StringBuffer    html = new StringBuffer();
        String          s;

        // Nice Long Name... Guess what it means
        boolean alreadyFoundEndTagInStartTagLine = false;

        // If the startTag parameter is not null, skip all content, until the startTag is found!
        if (startTag != null)
        {
            boolean foundStartTag = false;

            while ((s = br.readLine()) != null)

                if (s.contains(startTag))
                {
                    int startTagPos = s.indexOf(startTag);

                    foundStartTag = true;

                    // NOTE: Sometimes the 'startTag' and 'endTag' are on the same line!
                    //       This happens, for instance, on Yahoo Photos, when giant lines
                    //       (no line-breaks) are transmitted.
                    //       Hence... *really* long variable name, this is confusing!

                    s = s.substring(startTagPos);

                    if ((endTag != null) && s.contains(endTag))
                    {
                        s = s.substring(0, s.indexOf(endTag) + endTag.length());

                        alreadyFoundEndTagInStartTagLine = true;
                    }

                    html.append(s + "\n");  break;
                }

            if (! foundStartTag) throw new ScrapeException
                ("Start Tag: '" + startTag + "' was Not Found on Page.");
        }

        // if the endTag parameter is not null, stop reading as soon as the end-tag is found
        if (endTag != null)
        {
            // NOTE: This 'if' is inside curly-braces, because there is an 'else' that "goes with"
            //       the 'if' above... BUT NOT the following 'if'

            if (! alreadyFoundEndTagInStartTagLine)
            {
                boolean foundEndTag = false;

                while ((s = br.readLine()) != null)

                    if (s.contains(endTag))
                    {
                        foundEndTag = true;
                        int endTagPos = s.indexOf(endTag);
                        html.append(s.substring(0, endTagPos + endTag.length()) + "\n");
                        break;
                    }

                    else html.append(s + "\n");

                if (! foundEndTag) throw new ScrapeException
                    ("End Tag: '" + endTag + "' was Not Found on Page.");
            }
        }

        // ELSE: (endTag *was* null) ... read all content until EOF ... or ... "EOWP"
        // (end of web-page)
        else

            while ((s = br.readLine()) != null)
                html.append(s + "\n");

        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
        return html;
    }


    /**
     * This receives an input stream that contains a pipe to a website that will produce HTML.
     * The HTML is read from the website, and returned as a {@code String.}
     * This is called "scraping HTML."
     *
     * @param startLineNum If this is {@code '0'} or {@code '1'}, the scrape will begin with the
     * first character received.  If this contains a positive integer, the scrape will not include
     * any text/HTML data that occurs prior to {@code int startLineNum} lines of text/html having
     * been received.
     *
     * @param endLineNum If this is negative, the scrape will read the entire contents of
     * text/HTML data from the {@code BufferedReader br} parameter (until {@code EOF} is
     * encountered).  If this contains a positive integer, then data will be read and included in
     * the result until {@code int endLineNum} lines of text/html have been received.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String}
     *
     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative or greater
     * than {@code 'endLineNum'}.  If {@code 'endLineNum'} was negative, this test is skipped.
     *
     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
     * parameter to be consistent with the values in {@code 'startLineNum'} and
     * {@code 'endLineNum'}
     */
    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
        throws IOException
    {
        StringBuffer    html = new StringBuffer();
        String          s    = "";

        // NOTE: Arrays start at 0, **BUT** HTML page line counts start at 1!
        int curLineNum = 1;

        if (startLineNum < 0) throw new IllegalArgumentException(
            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
            "allowed."
        );

        if (endLineNum == 0) throw new IllegalArgumentException
            ("The parameter endLineNum is zero, but this is not allowed.");

        // Normalize the sentinels: a negative endLineNum becomes 1 ("read until EOF"), and a
        // startLineNum of 0 becomes 1 (both mean "begin at the first line").

        endLineNum      = (endLineNum < 0)      ? 1 : endLineNum;
        startLineNum    = (startLineNum == 0)   ? 1 : startLineNum;

        if ((endLineNum < startLineNum) && (endLineNum != 1)) throw new IllegalArgumentException(
            "The parameter startLineNum is: " + startLineNum + "\n" +
            "The parameter endLineNum is: " + endLineNum + "\n" +
            "It is required that the latter is larger than the former, " +
            "or it must be 0 or negative to signify read until EOF."
        );

        if (startLineNum > 1)
        {
            while (curLineNum++ < startLineNum)

                if (br.readLine() == null) throw new ScrapeException(
                    "The HTML Page that was given didn't even have enough lines to read " +
                    "quantity in variable startLineNum.\nstartLineNum = " + startLineNum +
                    " and read " + (curLineNum-1) + " line(s) before EOF."
                );

            // Off-By-One computer science error correction - remember post-decrement, means the
            // last loop iteration didn't read line, but did increment the loop counter!

            curLineNum--;
        }

        // endLineNum==1 means/implies that we don't have to heed the
        // endLineNum variable ==> read to EOF/null!

        if (endLineNum == 1)

            while ((s = br.readLine()) != null)
                html.append(s + "\n");

        // endLineNum > 1 ==> Heed endLineNum variable!
        else
        {
            for ( ;curLineNum <= endLineNum; curLineNum++)

                if ((s = br.readLine()) != null)    html.append(s + "\n");
                else                                break;

            // NOTE: curLineNum-1 and endLineNum+1 are used because:
            //
            // ** The loop counter (curLineNum) breaks when the next line to read is the one
            //    passed the endLineNum
            // ** endLineNum+1 is the appropriate state if enough lines were read from the
            //    HTML Page
            // ** curLineNum-1 is the number of the last line read from the HTML

            if (curLineNum != (endLineNum+1)) throw new ScrapeException(
                "The HTML Page that was read didn't have enough lines to read to quantity in " +
                "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
                (curLineNum-1) + " line(s) before EOF."
            );
        }

        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
        return html;
    }
}