package Torello.HTML;

import Torello.Java.Additional.Ret2;
import Torello.Java.StringParse;

import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.util.zip.*;
import java.net.URL;
import java.net.HttpURLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import Torello.JavaDoc.StaticFunctional;
import Torello.JavaDoc.Excuse;

/**
 * Some standard utilities for transferring & downloading HTML from web-sites and then storing
 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE>
 */
@StaticFunctional(
    Excused={"USER_AGENT", "USE_USER_AGENT"},
    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
)
public class Scrape
{
    private Scrape() { }

    /**
     * When opening an {@code HTTP URL} connection, it is usually a good idea to use a
     * {@code "User Agent"}.  The default behavior in this Scrape & Search Package is to connect
     * using the {@code public static String USER_AGENT = "Chrome/61.0.3163.100";}
     *
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     *
     * <BR /><BR /><B>ALSO:</B> If the boolean {@link #USE_USER_AGENT} is set to {@code FALSE},
     * then no User-Agent will be used at all.
     */
    public static String USER_AGENT = "Chrome/61.0.3163.100";

    /**
     * When opening an {@code HTTP URL} connection, it is usually a good idea to use a
     * {@code "User Agent"}.  The default behavior in this Scrape & Search Package is to connect
     * using the {@code public static String USER_AGENT = "Chrome/61.0.3163.100";}
     *
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     *
     * <BR /><BR /><B>ALSO:</B> If this boolean is set to {@code FALSE}, then no User-Agent will be
     * used at all.
     */
    public static boolean USE_USER_AGENT = true;


    // ********************************************************************************************
    // ********************************************************************************************
    // HTTP Headers stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using the {@code GZIP Compression} encoding.  It expects the {@code java.util.Map}
     * that is returned from an invocation of {@code HttpURLConnection.getHeaderFields()}.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Case-Insensitive:</B>
     *
     * <BR />Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
     * comparisons done in this method shall ignore case.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to {@code "gzip"}, then this
     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
     */
    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
    { return headerHasEncoding(httpHeaders, "gzip"); }

    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using the {@code ZIP Compression (PKZIP, Deflate)} encoding.  It expects the
     * {@code java.util.Map} that is returned from an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to {@code "deflate"}, then this
     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
     *
     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
     * all {@code String} comparisons done in this method shall ignore case.
     */
    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
    { return headerHasEncoding(httpHeaders, "deflate"); }

    // Case-insensitive search of the "Content-Encoding" header for a given encoding token.
    //
    // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check whether certain
    //       values are present - rather than the (more simple) Map.containsKey(...).  Also, the
    //       Maps returned by HttpURLConnection have been known to contain null keys (the HTTP
    //       status line is stored under a null key), so that must be checked here as well.

    private static boolean headerHasEncoding
        (Map<String, List<String>> httpHeaders, String encoding)
    {
        for (String prop : httpHeaders.keySet())

            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))

                // Check (Case Insensitive) whether any of the property-values assigned to
                // "Content-Encoding" matches the requested encoding token.

                for (String val : httpHeaders.get(prop))
                    if (val.equalsIgnoreCase(encoding)) return true;

        // The requested encoding token wasn't found, so return FALSE.
        return false;
    }

    /**
     * This method will check whether the {@code HTTP Header} returned by a website has been
     * encoded using compression.  It expects the
     * {@code java.util.Map} that is returned from an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
     *
     * @param is This should be the {@code InputStream} that is returned from the
     * {@code HttpURLConnection} when requesting the content from the web-server that is hosting
     * the {@code URL}.  The {@code HTTP Headers} will be searched, and if a compression algorithm
     * has been specified (<I>and the algorithm is one of the algorithms automatically handled
     * by Java</I>) - then this {@code InputStream} shall be <B>wrapped</B> by the appropriate
     * decompression algorithm.
     *
     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
     * this property has a <I>property-value</I> in its list equal to either {@code "deflate"}
     * or {@code "gzip"}, then this shall return a <I>wrapped {@code InputStream}</I> that is
     * capable of handling the <I>decompression algorithm</I>.
     *
     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
     * all {@code String} comparisons done in this method shall ignore case.
     */
    public static InputStream checkHTTPCompression
        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
    {
        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
        //       certain values are present - rather than the (more simple) Map.containsKey(...)

        for (String prop : httpHeaders.keySet())

            // Check (Case Insensitive) if the HTTP Headers Map has the property
            // "Content-Encoding".  NOTE: The returned Maps have been known to contain null keys,
            // so check for that here.

            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))

                // Check (Case Insensitive), if any properties assigned to "Content-Encoding"
                // are "DEFLATE" or "GZIP" - then return the wrapped stream immediately.

                for (String vals : httpHeaders.get(prop))

                    if      (vals.equalsIgnoreCase("gzip"))     return new GZIPInputStream(is);
                    else if (vals.equalsIgnoreCase("deflate"))  return new ZipInputStream(is);

        // Neither of the property-values "gzip" or "deflate" were found.
        // Return the original input stream.

        return is;
    }

    /**
     * This method shall simply take as input a {@code java.util.Map} which contains the
     * {@code HTTP Header} properties that <I>must have been generated</I> by a call to the method
     * {@code HttpURLConnection.getHeaderFields()}.  It will produce a Java {@code String} that
     * lists these headers in text / readable format.
     *
     * @param httpHeaders This parameter must be an instance of
     * {@code java.util.Map<String, List<String>>} and it should have been generated by a call to
     * {@code HttpURLConnection.getHeaderFields()}.  The property names and values contained by
     * this {@code Map} will be iterated and printed to a returned {@code java.lang.String}.
     *
     * @return This shall return a printed version of the {@code Map}.
     */
    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
    {
        StringBuilder   sb  = new StringBuilder();
        int             max = 0;

        // To ensure that the output string is "aligned", check the length of each of the
        // keys in the HTTP Header.
        //
        // NOTE: The Map returned by HttpURLConnection.getHeaderFields() stores the HTTP status
        //       line under a *null* key, so a null-check is mandatory before calling length().

        for (String key : httpHeaders.keySet())
            if ((key != null) && (key.length() > max)) max = key.length();

        max += 5;

        // Iterate all of the Properties that are included in the 'httpHeaders' parameter.
        // It is important to note that the java "toString()" method for the List<String> that
        // is used to store the Property-Values list works great, without any changes.

        for (String key : httpHeaders.keySet()) sb.append(
            StringParse.rightSpacePad(((key == null) ? "" : key) + ':', max) +
            httpHeaders.get(key).toString() + '\n'
        );

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some various ways to open a connection to a website.
    // ********************************************************************************************
    // ********************************************************************************************


    // Internal helper: opens an HTTP "GET" connection to 'url', attaching the User-Agent when
    // the USE_USER_AGENT flag is set.  All of the public openConn* methods funnel through here
    // so that the connection setup logic exists in exactly one place.

    private static HttpURLConnection openGET(URL url) throws IOException
    {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();

        con.setRequestMethod("GET");

        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);

        return con;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn(URL)}
     */
    public static BufferedReader openConn(String url) throws IOException
    { return openConn(new URL(url)); }

    /**
     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading from it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet-{@code URL.}
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is));
    }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     *
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        return new Ret2<BufferedReader, Map<String, List<String>>>
            (new BufferedReader(new InputStreamReader(is)), httpHeaders);
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
     */
    public static BufferedReader openConn_iso_8859_1(String url) throws IOException
    { return openConn_iso_8859_1(new URL(url)); }

    /**
     * Will open an {@code ISO-8859} connection to a {@code URL}, and returns a
     * {@code BufferedReader} for reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        // NOTE(review): "Content-Type" describes a request *body*, which a GET does not have;
        // "Accept-Charset" is likely what was intended - kept as-is to preserve the bytes that
        // are sent over the wire.

        con.setRequestProperty("Content-Type", "text/html; charset=iso-8859-1");

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is, StandardCharsets.ISO_8859_1));
    }


    /**
     * Opens a {@code ISO-8859-1} connection to a {@code URL}, and returns a {@code BufferedReader}
     * for reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     *
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>>
        openConnGetHeader_iso_8859_1(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=iso-8859-1");

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        // BUG FIX: this previously called Charset.forName("charset=iso-8859-1"), which is an
        // illegal charset name and threw IllegalCharsetNameException on every invocation.

        return new Ret2<BufferedReader, Map<String, List<String>>>(
            new BufferedReader(new InputStreamReader(is, StandardCharsets.ISO_8859_1)),
            httpHeaders
        );
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
     */
    public static BufferedReader openConn_UTF8(String url) throws IOException
    { return openConn_UTF8(new URL(url)); }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     *
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_UTF8(URL url) throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=UTF-8");

        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());

        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *
     * @throws IOException
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
        throws IOException
    {
        HttpURLConnection con = openGET(url);

        con.setRequestProperty("Content-Type", "charset=UTF-8");

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        return new Ret2<BufferedReader, Map<String, List<String>>>(
            new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)),
            httpHeaders
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a String.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     */
    public static String scrapePage(String url) throws IOException
    { return scrapePage(openConn(url)); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     */
    public static String scrapePage(URL url) throws IOException
    { return scrapePage(openConn(url)); }

    /**
     * This scrapes a website and dumps the entire contents into a {@code java.lang.String}.
     *
     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
     * output text/html data.
     *
     * @return The text/html data - returned inside a {@code String}
     */
    public static String scrapePage(BufferedReader br) throws IOException
    {
        // StringBuilder is used here (rather than the original StringBuffer) because this
        // buffer never escapes the method, so no synchronization is needed.

        StringBuilder   sb = new StringBuilder();
        String          s;

        while ((s = br.readLine()) != null) sb.append(s).append('\n');

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     */
    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
        throws IOException
    { return scrapePageToVector(openConn(url), includeNewLine); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     */
    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
        throws IOException
    { return scrapePageToVector(openConn(url), includeNewLine); }

    /**
     * This will scrape the entire contents of an HTML page to a {@code Vector<String>}.  Each
     * line of the text/HTML page is demarcated by the reception of a {@code '\n'} character
     * from the web-server.
     *
     * @param br This is the input source of the HTML page.  It will query for String data.
     *
     * @param includeNewLine This will append the {@code '\n'} character to the end of each
     * {@code String} in the {@code Vector}.
     *
     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
     * web-page.
     *
     * @see #scrapePageToVector(String, boolean)
     */
    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
        throws IOException
    {
        Vector<String>  ret = new Vector<>();
        String          s   = null;

        if (includeNewLine)

            while ((s = br.readLine()) != null)
                ret.add(s + '\n');

        else

            while ((s = br.readLine()) != null)
                ret.add(s);

        return ret;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Main HTML scrape functions - used by main class of "HTMLPage.getPageTokens()"
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This receives an input stream that contains a pipe to a website that will produce HTML.
     * The HTML is read from the website, and returned as a {@code String.}
     * This is called "scraping HTML."
     *
     * @param startTag If this is null, the scrape will begin with the first character received.
     * If this contains a {@code String}, the scrape will not include any text/HTML data that
     * occurs prior to the first occurrence of {@code 'startTag'}
     *
     * @param endTag If this is null, the scrape will read the entire contents of text/HTML data
     * from the {@code BufferedReader br} parameter.  If this contains a {@code String}, then data
     * will be read and included in the result until {@code 'endTag'} is received.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String.}
     *
     * @throws ScrapeException If, after download completes, either the {@code 'startTag'} or the
     * parameter {@code 'endTag'} do not represent {@code String's} that were found within the
     * downloaded page, this exception is thrown.
     */
    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag)
        throws IOException
    {
        StringBuffer    html = new StringBuffer();
        String          s;

        // Nice Long Name... Guess what it means
        boolean alreadyFoundEndTagInStartTagLine = false;

        // If the startTag parameter is not null, skip all content, until the startTag is found!
        if (startTag != null)
        {
            boolean foundStartTag = false;

            while ((s = br.readLine()) != null)

                if (s.contains(startTag))
                {
                    int startTagPos = s.indexOf(startTag);

                    foundStartTag = true;

                    // NOTE: Sometimes the 'startTag' and 'endTag' are on the same line!
                    //       This happens, for instance, on Yahoo Photos, when giant lines
                    //       (no line-breaks) are transmitted.
                    //       Hence... *really* long variable name, this is confusing!

                    s = s.substring(startTagPos);

                    if ((endTag != null) && s.contains(endTag))
                    {
                        s = s.substring(0, s.indexOf(endTag) + endTag.length());

                        alreadyFoundEndTagInStartTagLine = true;
                    }

                    html.append(s + "\n");  break;
                }

            if (! foundStartTag) throw new ScrapeException
                ("Start Tag: '" + startTag + "' was Not Found on Page.");
        }

        // if the endTag parameter is not null, stop reading as soon as the end-tag is found
        if (endTag != null)
        {
            // NOTE: This 'if' is inside curly-braces, because there is an 'else' that "goes with"
            //       the 'if' above... BUT NOT the following 'if'

            if (! alreadyFoundEndTagInStartTagLine)
            {
                boolean foundEndTag = false;

                while ((s = br.readLine()) != null)

                    if (s.contains(endTag))
                    {
                        foundEndTag = true;
                        int endTagPos = s.indexOf(endTag);
                        html.append(s.substring(0, endTagPos + endTag.length()) + "\n");
                        break;
                    }

                    else html.append(s + "\n");

                if (! foundEndTag) throw new ScrapeException
                    ("End Tag: '" + endTag + "' was Not Found on Page.");
            }
        }

        // ELSE: (endTag *was* null) ... read all content until EOF ... or ... "EOWP"
        // (end of web-page)
        else

            while ((s = br.readLine()) != null)
                html.append(s + "\n");

        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
        return html;
    }


    /**
     * This receives an input stream that contains a pipe to a website that will produce HTML.
     * The HTML is read from the website, and returned as a {@code String.}
     * This is called "scraping HTML."
     *
     * @param startLineNum If this is {@code '0'} or {@code '1'}, the scrape will begin with the
     * first character received.  If this contains a positive integer, the scrape will not include
     * any text/HTML data that occurs prior to {@code int startLineNum} lines of text/html having
     * been received.
     *
     * @param endLineNum If this is negative, the scrape will read the entire contents of
     * text/HTML data from the {@code BufferedReader br} parameter (until {@code EOF} is
     * encountered).  If this contains a positive integer, then data will be read and included in
     * the result until {@code int endLineNum} lines of text/html have been received.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String}
     *
     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative or greater
     * than {@code 'endLineNum'}.  If {@code 'endLineNum'} was negative, this test is skipped.
     *
     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
     * parameter to be consistent with the values in {@code 'startLineNum'} and
     * {@code 'endLineNum'}
     */
    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
        throws IOException
    {
        StringBuffer    html = new StringBuffer();
        String          s    = "";

        // NOTE: Arrays start at 0, **BUT** HTML page line counts start at 1!
        int curLineNum = 1;

        if (startLineNum < 0) throw new IllegalArgumentException(
            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
            "allowed."
        );

        if (endLineNum == 0) throw new IllegalArgumentException
            ("The parameter endLineNum is zero, but this is not allowed.");

        // Normalize the sentinels: a negative endLineNum becomes 1 ("read until EOF"), and a
        // startLineNum of 0 becomes 1 (both mean "begin at the first line").

        endLineNum      = (endLineNum < 0)      ? 1 : endLineNum;
        startLineNum    = (startLineNum == 0)   ? 1 : startLineNum;

        if ((endLineNum < startLineNum) && (endLineNum != 1)) throw new IllegalArgumentException(
            "The parameter startLineNum is: " + startLineNum + "\n" +
            "The parameter endLineNum is: " + endLineNum + "\n" +
            "It is required that the latter is larger than the former, " +
            "or it must be 0 or negative to signify read until EOF."
        );

        if (startLineNum > 1)
        {
            while (curLineNum++ < startLineNum)

                if (br.readLine() == null) throw new ScrapeException(
                    "The HTML Page that was given didn't even have enough lines to read " +
                    "quantity in variable startLineNum.\nstartLineNum = " + startLineNum +
                    " and read " + (curLineNum-1) + " line(s) before EOF."
                );

            // Off-By-One computer science error correction - remember post-decrement, means the
            // last loop iteration didn't read line, but did increment the loop counter!

            curLineNum--;
        }

        // endLineNum==1 means/implies that we don't have to heed the
        // endLineNum variable ==> read to EOF/null!

        if (endLineNum == 1)

            while ((s = br.readLine()) != null)
                html.append(s + "\n");

        // endLineNum > 1 ==> Heed endLineNum variable!
        else
        {
            for ( ;curLineNum <= endLineNum; curLineNum++)

                if ((s = br.readLine()) != null)    html.append(s + "\n");
                else                                break;

            // NOTE: curLineNum-1 and endLineNum+1 are used because:
            //
            // ** The loop counter (curLineNum) breaks when the next line to read is the one
            //    passed the endLineNum
            // ** endLineNum+1 is the appropriate state if enough lines were read from the
            //    HTML Page
            // ** curLineNum-1 is the number of the last line read from the HTML

            if (curLineNum != (endLineNum+1)) throw new ScrapeException(
                "The HTML Page that was read didn't have enough lines to read to quantity in " +
                "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
                (curLineNum-1) + " line(s) before EOF."
            );
        }

        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
        return html;
    }
}