001package Torello.HTML;
002
003import Torello.Java.Additional.Ret2;
004import Torello.Java.StringParse;
005
006import java.util.*;
007import java.util.regex.*;
008import java.io.*;
009import java.util.zip.*;
010import java.net.URL;
011import java.net.HttpURLConnection;
012import java.nio.charset.Charset;
013
014import Torello.HTML.Tools.JavaDoc.StaticFunctional;
015import Torello.HTML.Tools.JavaDoc.Excuse;
016
017/**
 * Some standard utilities for transferring & downloading HTML from web-sites and then storing
019 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
020 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
021 * 
022 * <BR /><BR />
023 * <EMBED CLASS="external-html" DATA-FILE-ID="S">
024 */
025@StaticFunctional(
026    Excused={"USER_AGENT", "USE_USER_AGENT"},
027    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
028)
029public class Scrape
030{
031    private Scrape() { }
032
    /**
     * When opening an {@code HTTP URL} connection, it is usually a good idea to use a
     * {@code "User Agent"}.  The default behavior in this Scrape &amp; Search Package is to
     * connect using the {@code public static String USER_AGENT = "Chrome/61.0.3163.100";}
     * 
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     * 
     * <BR /><BR /><B>ALSO:</B> If the {@code boolean USE_USER_AGENT} is set to <B>FALSE</B>,
     * then no User-Agent will be used at all.
     */
045    public static String USER_AGENT = "Chrome/61.0.3163.100";
046
    /**
     * Controls whether the connection-opening methods of this class send a {@code "User-Agent"}
     * request property.  When this is <B>TRUE</B> (the default), those methods set the
     * {@code "User-Agent"} request property to the current value of {@link #USER_AGENT}.
     * 
     * <BR /><BR /><B>NOTE:</B> This behavior may be changed by modifying these
     * {@code public static} variables.
     * 
     * <BR /><BR /><B>ALSO:</B> If this {@code boolean} is set to <B>FALSE</B>, then no
     * User-Agent will be set by this class at all.
     */
058    public static boolean USE_USER_AGENT = true;
059
060
061    // *****************************************************************************************
062    // HTTP Headers stuff
063    //******************************************************************************************
064
065
066    /**
067     * This method will check whether the {@code HTTP Header} returned by a website has been
068     * encoded using the {@code GZIP Compression} encoding.  It expects the {@code java.util.Map}
069     * that is returned from an invocation of {@code HttpURLConnection.getHeaderFields()}.
070     * 
071     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
072     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
073     *
074     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
075     * this property has a <I>property-value</I> in it's list equal to {@code "gzip"}, then this
076     * method will return <B>TRUE</B>.  Otherwise this method will return <B>FALSE</B>.
077     * 
078     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
079     * all {@code String} comparisons done in this method shall ignore case.
080     */
081    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
082    {
083        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
084        //       certain values are present - rather than the (more simple) Map.containsKey(...)
085
086        for (String prop : httpHeaders.keySet())
087
088            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
089            // NOTE: The Map's returned have been known to contain null keys, so check for that here.
090
091            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
092
093                // Check (Case Insensitive), if any of the properties assigned to "Content-Encoding"
094                // is "GZIP".  If this is found, return TRUE immediately.
095    
096                for (String vals : httpHeaders.get(prop))
097                    if (vals.equalsIgnoreCase("gzip")) return true;
098
099        // The property-value "GZIP" wasn't found, so return FALSE.
100        return false;
101    }
102
103    /**
104     * This method will check whether the {@code HTTP Header} returned by a website has been
105     * encoded using the {@code ZIP Compression (PKZIP, Deflate)} encoding.  It expects the
106     * {@code java.util.Map} that is returned from an invokation of
107     * {@code HttpURLConnection.getHeaderFields()}.
108     * 
109     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
110     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
111     *
112     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
113     * this property has a <I>property-value</I> in it's list equal to {@code "deflate"}, then this
114     * method will return <B>TRUE</B>.  Otherwise this method will return <B>FALSE</B>.
115     * 
116     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
117     * all {@code String} comparisons done in this method shall ignore case.
118     */
119    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
120    {
121        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
122        //       certain values are present - rather than the (more simple) Map.containsKey(...)
123
124        for (String prop : httpHeaders.keySet())
125
126            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
127            // NOTE: The Map's returned have been known to contain null keys, so check for that here.
128
129            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
130
131                // Check (Case Insensitive), if any of the properties assigned to "Content-Encoding"
132                // is "DEFLATE".  If this is found, return TRUE immediately.
133    
134                for (String vals : httpHeaders.get(prop))
135                    if (vals.equalsIgnoreCase("deflate")) return true;
136
137        // The property-value "deflate" wasn't found, so return FALSE.
138        return false;
139    }
140
141    /**
142     * This method will check whether the {@code HTTP Header} returned by a website has been
143     * encoded using compression.  It expects the
144     * {@code java.util.Map} that is returned from an invokation of
145     * {@code HttpURLConnection.getHeaderFields()}.
146     * 
147     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
148     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
149     * 
150     * @param is This should be the {@code InputStream} that is returned from the
151     * {@code HttpURLConnection} when reqesting the content from the web-server that is hosting the
152     * {@code URL}.  The {@code HTTP Headers} will be searched, and if a compression algorithm
153     * has been specified (<I>and the algorithm is one of the algorithm's automatically handled 
154     * by Java</I>) - then this {@code InputStream} shall be <B>wrapped</B> by the appropriate
155     * decompression algorithm.
156     *
157     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
158     * this property has a <I>property-value</I> in it's list equal to either {@code "deflate"}
159     * or {@code "gzip"}, then this shall return a <I>wrapped {@code InputStream}</I> that is
160     * capable of handling the <I>decompression algorithm</I>.
161     * 
162     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
163     * all {@code String} comparisons done in this method shall ignore case.
164     */
165    public static InputStream checkHTTPCompression
166        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
167    {
168        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
169        //       certain values are present - rather than the (more simple) Map.containsKey(...)
170
171        for (String prop : httpHeaders.keySet())
172
173            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
174            // NOTE: The Map's returned have been known to contain null keys, so check for that here.
175
176            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
177
178                // Check (Case Insensitive), if any of the properties assigned to "Content-Encoding"
179                // is "DEFLATE" or "GZIP".  If so, return the compression-algorithm immediately.
180    
181                for (String vals : httpHeaders.get(prop))
182
183                    if (vals.equalsIgnoreCase("gzip"))          return new GZIPInputStream(is);
184                    else if (vals.equalsIgnoreCase("deflate"))  return new ZipInputStream(is);
185
186        // Neither of the property-values "gzip" or "deflate" were found.
187        // Return the original input stream.
188        return is;
189    }
190
191    /**
192     * This method shall simply take as input a {@code java.util.Map} which contains the
193     * {@code HTTP Header} properties that <I>must have been generated</I> by a call to the method
194     * {@code HttpURLConnection.getHeaderFields()}.  It will produce a Java {@code String} that
195     * lists these headers in text / readable format.
196     * 
197     * @param httpHeaders This parameter must be an instance of 
198     * {@code java.util.Map<String, List<String>>} and it should have been generated by a call to
199     * {@code HttpURLConnection.getHeaderFields()}.  The property names and values contained by
200     * this {@code Map} will be iterated and printed to a returned {@code java.lang.String}.
201     * 
202     * @return This shall return a printed version of the {@code Map}.
203     */
204    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
205    {
206        StringBuilder   sb  = new StringBuilder();
207        int             max = 0;
208
209        // To ensure that the output string is "aligned", check the length of each of the
210        // keys in the HTTP Header.
211
212        for (String key : httpHeaders.keySet()) if (key.length() > max) max = key.length();
213
214        max += 5;
215
216        // Iterate all of the Properties that are included in the 'httpHeaders' parameter
217        // It is important to note that the java "toString()" method for the List<String> that
218        // is used to store the Property-Values list works great, without any changes.
219
220        for (String key : httpHeaders.keySet()) sb.append(
221            StringParse.rightSpacePad(key + ':', max) +
222            httpHeaders.get(key).toString() + '\n'
223        );
224
225        return sb.toString();
226    }
227
228    // *****************************************************************************************
229    // Some various ways to open a connection to a website.
230    //******************************************************************************************
231
232    /**
233     * Convenience Method.
234     * <BR />Invokes: {@link #openConn(URL)}
235     */
236    public static BufferedReader openConn(String url) throws IOException
237    { return openConn(new URL(url)); }
238
239    /**
240     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
241     * reading from it.
242     * 
243     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP">
244     * <EMBED CLASS="external-html" DATA-FILE-ID="SCUA"> <!-- User Agent, Browser War Note -->
245     * 
246     * @param url This may be an Internet-{@code URL.}
247     * 
248     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
249     * 
250     * @see #USER_AGENT
251     * @see #USE_USER_AGENT
252     * @see #checkHTTPCompression(Map, InputStream)
253     */
254    public static BufferedReader openConn(URL url) throws IOException
255    {
256        HttpURLConnection con = (HttpURLConnection) url.openConnection();
257
258        con.setRequestMethod("GET");
259
260        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
261
262        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
263
264        return new BufferedReader(new InputStreamReader(is));
265    }
266
267    /**
268     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
269     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
270     * {@code HTTP Server}.
271     * 
272     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP">
273     * 
274     * @param url This may be an Internet {@code URL}.
275     * 
276     * @return <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPERET2">
277     *  
278     * @throws IOException
279     * 
280     * @see #checkHTTPCompression(Map, InputStream)
281     */
282    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
283        throws IOException
284    {
285        HttpURLConnection con = (HttpURLConnection) url.openConnection();
286
287        con.setRequestMethod("GET");
288
289        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
290
291        Map<String, List<String>> httpHeaders = con.getHeaderFields();
292
293        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
294
295        return new Ret2<BufferedReader, Map<String, List<String>>>
296            (new BufferedReader(new InputStreamReader(is)), httpHeaders);
297    }
298
299    /**
300     * Convenience Method.
301     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
302     */
303    public static BufferedReader openConn_iso_8859_1(String url) throws IOException 
304    { return openConn_iso_8859_1(new URL(url)); }
305
306    /**
307     * Will open an {@code ISO-8859} connection to a {@code URL}, and returns a 
308     * {@code BufferedReader} for reading it.
309     * 
310     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP"> 
311     * <EMBED CLASS="external-html" DATA-FILE-ID="SCUA"> <!-- User Agent, Browser War Note -->
312     * 
313     * @param url This may be an Internet {@code URL}. The site and page to which it points should
314     * return data encoded in the {@code ISO-8859} charset.
315     * 
316     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
317     * 
318     * @see #USER_AGENT
319     * @see #USE_USER_AGENT
320     * @see #checkHTTPCompression(Map, InputStream)
321     */
322    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
323    {
324        HttpURLConnection con = (HttpURLConnection) url.openConnection();
325
326        con.setRequestMethod("GET");
327
328        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
329
330        con.setRequestProperty("Content-Type", "text/html; charset=iso-8859-1");
331
332        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
333
334        return new BufferedReader(new InputStreamReader(is, Charset.forName("iso-8859-1")));
335    }
336
337
338    /**
339     * Opens a {@code ISO-8859-1} connection to a {@code URL}, and returns a {@code BufferedReader}
340     * for reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
341     * {@code HTTP Server}.
342     * 
343     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP">
344     * 
345     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
346     * return data encoded in the {@code ISO-8859-1} charset.
347     * 
348     * @return <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPERET2">
349     *  
350     * @throws IOException
351     * 
352     * @see #checkHTTPCompression(Map, InputStream)
353     */
354    public static Ret2<BufferedReader, Map<String, List<String>>>
355        openConnGetHeader_iso_8859_1(URL url)
356        throws IOException
357    {
358        HttpURLConnection con = (HttpURLConnection) url.openConnection();
359
360        con.setRequestMethod("GET");
361
362        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
363
364        con.setRequestProperty("Content-Type", "charset=iso-8859-1");
365
366        Map<String, List<String>> httpHeaders = con.getHeaderFields();
367
368        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
369
370        return new Ret2<BufferedReader, Map<String, List<String>>>(
371            new BufferedReader(new InputStreamReader(is, Charset.forName("charset=iso-8859-1"))),
372            httpHeaders
373        );
374    }
375
376    /**
377     * Convenience Method.
378     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
379     */
380    public static BufferedReader openConn_UTF8(String url) throws IOException
381    { return openConn_UTF8(new URL(url)); }
382
383    /**
384     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
385     * reading it.
386     * 
387     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEUTF8">
388     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP">
389     * <EMBED CLASS="external-html" DATA-FILE-ID="SCUA"> <!-- User Agent, Browser War Note -->
390     * 
391     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
392     * return data encoded in the {@code UTF-8} charset.
393     * 
394     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
395     * 
396     * @see #USER_AGENT
397     * @see #USE_USER_AGENT
398     * @see #checkHTTPCompression(Map, InputStream)
399     */
400    public static BufferedReader openConn_UTF8(URL url) throws IOException
401    {
402        HttpURLConnection con = (HttpURLConnection) url.openConnection();
403
404        con.setRequestMethod("GET");
405
406        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
407
408        con.setRequestProperty("Content-Type", "charset=UTF-8");
409
410        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
411
412        return new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
413    }
414
415    /**
416     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
417     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
418     * {@code HTTP Server}.
419     * 
420     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEUTF8">
421     * <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPEGZIP">
422     * 
423     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
424     * return data encoded in the {@code UTF-8} charet.
425     * 
426     * @return <EMBED CLASS="external-html" DATA-FILE-ID="SCRAPERET2">
427     *  
428     * @throws IOException
429     * @see #checkHTTPCompression(Map, InputStream)
430     */
431    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
432        throws IOException
433    {
434        HttpURLConnection con = (HttpURLConnection) url.openConnection();
435
436        con.setRequestMethod("GET");
437
438        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
439
440        con.setRequestProperty("Content-Type", "charset=UTF-8");
441
442        Map<String, List<String>> httpHeaders = con.getHeaderFields();
443
444        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
445
446        return new Ret2<BufferedReader, Map<String, List<String>>>(
447            new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8"))),
448            httpHeaders
449        );
450    }
451
452    // *****************************************************************************************
453    // Some simple/easy HTML scrape functions, saves to a String.
454    //******************************************************************************************
455
456    /**
457     * Convenience Method.
458     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
459     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
460     */
461    public static String scrapePage(String url) throws IOException
462    { return scrapePage(openConn(url)); }
463
464    /**
465     * Convenience Method.
466     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
467     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
468     */
469    public static String scrapePage(URL url) throws IOException
470    { return scrapePage(openConn(url)); }
471
472    /**
473     * This scrapes a website and dumps the entire contents into a {@code java.lang.String}.
474     * 
475     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
476     * output text/html data.
477     * 
478     * @return The text/html data - returned inside a {@code String}
479     */
480    public static String scrapePage(BufferedReader br) throws IOException
481    {
482        StringBuffer sb = new StringBuffer();
483        String s;
484
485        while ((s = br.readLine()) != null) sb.append(s + "\n");
486
487        return sb.toString();
488    }
489
490    // *****************************************************************************************
491    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
492    //******************************************************************************************
493
494    /**
495     * Convenience Method.
496     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
497     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
498     */
499    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
500        throws IOException
501    { return scrapePageToVector(openConn(url), includeNewLine); }
502
503    /**
504     * Convenience Method.
505     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
506     * <BR />Obtains: {@code Bufferedeader} from {@link #openConn(URL)}
507     */
508    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
509        throws IOException
510    { return scrapePageToVector(openConn(url), includeNewLine); }
511
512    /**
513     * This will scrape the entire contents of an HTML page to a {@code Vector<String>}  Each
514     * line of the text/HTML page is demarcated by the reception of a {@code '\n'} character
515     * from the web-server.
516     * 
517     * @param br  This is the input source of the HTML page.  It will query for String data.
518     * 
519     * @param includeNewLine This will append the {@code '\n'} character to the end of each
520     * {@code String} in the {@code Vector}.
521     * 
522     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
523     * web-page.
524     * 
525     * @see #scrapePageToVector(String, boolean)
526     */
527    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
528        throws IOException
529    {
530        Vector<String>  ret = new Vector<>();
531        String          s   = null;
532
533        if (includeNewLine)
534
535            while ((s = br.readLine()) != null)
536                ret.add(s + '\n');
537
538        else
539
540            while ((s = br.readLine()) != null)
541                ret.add(s);
542
543        return ret;
544    }
545
546    // *****************************************************************************************
547    // Main HTML scrape functions - functions used by main class-methods of "public class HTMLPage.getPageTokens()"
548    //******************************************************************************************
549
550    /**
551     * This receives an input stream that is contains a pipe to a website that will produce HTML.
552     * The HTML is read from the website, and returned as a {@code String.}
553     * This is called "scraping HTML."
554     * 
555     * @param startTag  If this is null, the scrape will begin with the first character received.
556     * If this contains a {@code String}, the scrape will not include any text/HTML data that
557     * occurs prior to the first occurrence of {@code 'startTag'}
558     * 
559     * @param endTag  If this is null, the scrape will read the entire contents of text/HTML data
560     * from the {@code Bufferedreader br} parameter.  If this contains a {@code String}, then data
561     * will be read and included in the result until {@code 'endTag'} is received.
562     * 
563     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
564     * Call {@code toString()} on the return value to retrieve that {@code String.} 
565     * 
566     * @throws ScrapeException If, after download completes, either the {@code 'startTag'} or the
567     * parameter {@code 'endTag'} do not represent {@code String's} that were found within the
568     * downloaded page, this exception is thrown.
569     */
570    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag) throws IOException
571    {
572        StringBuffer    html                                = new StringBuffer();
573        String          s;
574        boolean         alreadyFoundEndTagInStartTagLine    = false;
575
576        // If the startTag parameter is not null, skip all content, until the startTag is found!
577        if (startTag != null)
578        {
579            boolean foundStartTag = false;
580
581            while ((s = br.readLine()) != null)
582
583                if (s.contains(startTag))
584                {
585                    int startTagPos = s.indexOf(startTag);
586
587                    foundStartTag = true;
588
589                    // NOTE:    Sometimes the 'startTag' and 'endTag' are on the same line!
590                    //          This happens, for instance, on Yahoo Photos, when giant lines
591                    //          (no line-breaks) are transmitted
592                    //          Hence... *really* long variable name, this is confusing!
593
594                    s = s.substring(startTagPos);
595
596                    if ((endTag != null) && s.contains(endTag))
597                    {
598                        s = s.substring(0, s.indexOf(endTag) + endTag.length());
599
600                        alreadyFoundEndTagInStartTagLine = true;
601                    }
602
603                    html.append(s + "\n"); break;
604                }
605
606            if (! foundStartTag) throw new ScrapeException
607                ("Start Tag: '" + startTag + "' was Not Found on Page.");
608        }
609
610        // if the endTag parameter is not null, stop reading as soon as the end-tag is found
611        if (endTag != null)
612        {
613            // NOTE: This 'if' is inside curly-braces, because there is an 'else' that "goes with"
614            // the 'if' above... BUT NOT the following 'if'
615
616            if (! alreadyFoundEndTagInStartTagLine)
617            {
618                boolean foundEndTag = false;
619
620                while ((s = br.readLine()) != null)
621
622                    if (s.contains(endTag))
623                    {
624                        foundEndTag = true;
625                        int endTagPos = s.indexOf(endTag);
626                        html.append(s.substring(0, endTagPos + endTag.length()) + "\n");
627                        break;
628                    }
629
630                    else html.append(s + "\n");
631
632                if (! foundEndTag) throw new ScrapeException
633                    ("End Tag: '" + endTag + "' was Not Found on Page.");
634            }
635        }
636
637        // ELSE: (endTag *was* null) ... read all content until EOF ... or ... "EOWP" (end of web-page)
638        else
639
640            while ((s = br.readLine()) != null)
641                html.append(s + "\n");
642
643        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
644        return html;
645    }
646
647
648    /**
649     * This receives an input stream that is contains a pipe to a website that will produce HTML.
650     * The HTML is read from the website, and returned as a {@code String.}
651     * This is called "scraping HTML."
652     * 
653     * @param startLineNum  If this is {@code '0'} or {@code '1'}, the scrape will begin with the
654     * first character received.  If this contains a positive integer, the scrape will not include 
655     * any text/HTML data that occurs prior to {@code int startLineNum} lines of text/html having 
656     * been received.
657     * 
658     * @param endLineNum  If this is negative, the scrape will read the entire contents of
659     * text/HTML data from the {@code Bufferedreader br} parameter (until {@code EOF} is
660     * encountered).  If this contains a positive integer, then data will be read and included in
661     * the result until {@code int endLineNum} lines of text/html have been received.
662     * 
663     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
664     * Call {@code toString()} on the return value to retrieve that {@code String}
665     * 
666     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative or greater
667     * than {@code 'endLineNum'}  If {@code 'endLineNum'} was negative, this test is skipped.
668     * 
669     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
670     * parameter to be consistent with the values in {@code 'startLineNum'} and
671     * {@code 'endLineNum'}
672     */
673    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
674        throws IOException
675    {
676        StringBuffer    html    = new StringBuffer();
677        String          s       = "";
678
679        // NOTE: Arrays start at 0, **BUT** HTML page line counts start at 1!
680        int curLineNum = 1;
681
682        if (startLineNum < 0) throw new IllegalArgumentException(
683            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
684            "allowed."
685        );
686
687        if (endLineNum == 0) throw new IllegalArgumentException
688            ("The parameter endLineNum is zero, but this is not allowed.");
689
690        endLineNum      = (endLineNum < 0) ? 1 : endLineNum;
691        startLineNum    = (startLineNum == 0) ? 1 : startLineNum;
692
693        if ((endLineNum < startLineNum) && (endLineNum != 1)) throw new IllegalArgumentException(
694            "The parameter startLineNum is: " + startLineNum + "\n" +
695            "The parameter endLineNum is: " + endLineNum + "\n" +
696            "It is required that the latter is larger than the former, " +
697            "or it must be 0 or negative to signify read until EOF."
698        );
699
700        if (startLineNum > 1)
701        {
702            while (curLineNum++ < startLineNum)
703
704                if (br.readLine() == null) throw new ScrapeException(
705                    "The HTML Page that was given didn't even have enough lines to read " +
706                    "quantity in variable startLineNum.\nstartLineNum = " + startLineNum + 
707                    " and read " + (curLineNum-1) + " line(s) before EOF."
708                );
709
710            // Off-By-One computer science error correction - remember post-decrement, means the
711            // last loop iteration didn't read line, but did increment the loop counter!
712
713            curLineNum--;
714        }
715
716        // endLineNum==1  means/imples that we don't have to heed the
717        // endLineNum variable ==> read to EOF/null!
718
719        if (endLineNum == 1)
720
721            while ((s = br.readLine()) != null)
722                html.append(s + "\n");
723
724        // endLineNum > 1 ==> Head endLineNum variable!
725        else
726        {
727            // System.out.println("At START of LOOP: curLineNum = " + curLineNum +
728            // " and endLineNum = " + endLineNum);
729
730            for ( ;curLineNum <= endLineNum; curLineNum++)
731
732                if ((s = br.readLine()) != null) html.append(s + "\n");
733                else break;
734
735            // NOTE: curLineNum-1 and endLineNum+1 are used because:
736            //
737            //      ** The loop counter (curLineNum) breaks when the next line to read is the one
738            //          passed the endLineNum
739            //      ** endLineNum+1 is the appropriate state if enough lines were read from the
740            //           HTML Page
741            //      ** curLineNum-1 is the number of the last line read from the HTML
742
743            if (curLineNum != (endLineNum+1)) throw new ScrapeException(
744                "The HTML Page that was read didn't have enough lines to read to quantity in " +
745                "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
746                (curLineNum-1) + " line(s) before EOF."
747            );
748        }
749
750        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
751        return html;
752    }
753}