Scrape.java.html

package Torello.HTML;

import Torello.Java.Additional.Ret2;
import Torello.Java.StringParse;

import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.util.zip.*;
import java.net.URL;
import java.net.HttpURLConnection;
import java.nio.charset.Charset;

import Torello.JavaDoc.StaticFunctional;
import Torello.JavaDoc.Excuse;

/**
 * Some standard utilities for transferring &amp; downloading HTML from web-sites and then storing
 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE>
 */
@StaticFunctional(
    Excused={"USER_AGENT", "USE_USER_AGENT"},
    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
)
public class Scrape
{
    // Private, empty constructor: prevents code outside this class from creating an instance.
    private Scrape() { }

    /**
     * The value sent as the {@code "User-Agent"} request-property by the connection-opening
     * methods of this class.  It is only sent when {@link #USE_USER_AGENT} is {@code TRUE}.
     * 
     * <BR /><BR />The initial value is {@code "Chrome/61.0.3163.100"}.
     * 
     * <BR /><BR /><B>NOTE:</B> This field is {@code public static} and is not {@code final}, so
     * the value that is sent may be changed by assigning a different {@code String} to it.
     */
    public static String USER_AGENT = "Chrome/61.0.3163.100";

    /**
     * Flag that decides whether the connection-opening methods of this class send a
     * {@code "User-Agent"} request-property at all.
     * 
     * <BR /><BR />When {@code TRUE} (the initial value), the current value of
     * {@link #USER_AGENT} is sent.  When {@code FALSE}, those methods do not set a
     * {@code "User-Agent"} request-property.
     * 
     * <BR /><BR /><B>NOTE:</B> This field is {@code public static} and is not {@code final}, so
     * this behavior may be changed by assigning a different value to it.
     */
    public static boolean USE_USER_AGENT = true;


    // ********************************************************************************************
    // ********************************************************************************************
    // HTTP Headers stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Reports whether a set of {@code HTTP Headers} declares the {@code "gzip"}
     * {@code Content-Encoding}.  The expected input is the {@code java.util.Map} returned by
     * {@code HttpURLConnection.getHeaderFields()}.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Case-Insensitive:</B>
     * 
     * <BR />Both the header-name and the header-values are compared while ignoring case.
     * 
     * @param httpHeaders A {@code java.util.Map<String, List<String>>} of header-names to
     * header-values.  A {@code null} key is tolerated and skipped.
     *
     * @return {@code TRUE} if a header whose name equals {@code "Content-Encoding"} (ignoring
     * case) has a value in its list that equals {@code "gzip"} (ignoring case), and
     * {@code FALSE} otherwise.
     */
    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
    {
        // Header names are case-insensitive, so Map.containsKey(...) cannot be used.
        // Every entry has to be inspected instead.

        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())
        {
            String headerName = header.getKey();

            // Skip the null key (such maps have been known to contain one), and skip every
            // header that is not "Content-Encoding".

            if ((headerName == null) || (! headerName.equalsIgnoreCase("Content-Encoding")))
                continue;

            for (String encoding : header.getValue())
            {
                if (encoding.equalsIgnoreCase("gzip")) return true;
            }
        }

        // No "Content-Encoding" header listed the value "gzip"
        return false;
    }

    /**
     * Reports whether a set of {@code HTTP Headers} declares the {@code "deflate"}
     * {@code Content-Encoding}.  The expected input is the {@code java.util.Map} returned by an
     * invocation of {@code HttpURLConnection.getHeaderFields()}.
     * 
     * <BR /><BR /><B>NOTE:</B> Both the header-name and the header-values are compared while
     * ignoring case.
     * 
     * @param httpHeaders A {@code java.util.Map<String, List<String>>} of header-names to
     * header-values.  A {@code null} key is tolerated and skipped.
     *
     * @return {@code TRUE} if a header whose name equals {@code "Content-Encoding"} (ignoring
     * case) has a value in its list that equals {@code "deflate"} (ignoring case), and
     * {@code FALSE} otherwise.
     */
    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
    {
        // A plain Map.containsKey(...) would be case-sensitive, therefore each entry is checked.
        for (Map.Entry<String, List<String>> entry : httpHeaders.entrySet())
        {
            String name = entry.getKey();

            // A null key may be present in the map; it can never be "Content-Encoding".
            boolean isContentEncoding =
                (name != null) && name.equalsIgnoreCase("Content-Encoding");

            if (! isContentEncoding) continue;

            // Any single value equal to "deflate" is sufficient.
            for (String value : entry.getValue())
            {
                if (value.equalsIgnoreCase("deflate")) return true;
            }
        }

        // The value "deflate" was not listed.
        return false;
    }

    /**
     * Inspects a set of {@code HTTP Headers} for a supported {@code Content-Encoding}, and wraps
     * the response {@code InputStream} in the matching decompression stream.  The expected
     * header input is the {@code java.util.Map} returned by an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     * 
     * <BR /><BR /><B>NOTE:</B> Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>,
     * all {@code String} comparisons done in this method ignore case.  Header-values are also
     * trimmed before they are compared, and {@code null} keys / values are skipped.
     * 
     * <BR /><BR /><B>DEFLATE:</B> The {@code "deflate"} content-coding is decoded with a
     * {@code java.util.zip.InflaterInputStream}, which expects the zlib data format.  Servers
     * that send a raw deflate stream without the zlib wrapper are not handled here.
     * 
     * @param httpHeaders A {@code java.util.Map<String, List<String>>} of header-names to
     * header-values.
     * 
     * @param is The {@code InputStream} that was returned by the {@code HttpURLConnection} when
     * requesting the content of the {@code URL}.
     *
     * @return If a header named {@code "Content-Encoding"} lists the value {@code "gzip"}, a
     * {@code GZIPInputStream} wrapping {@code 'is'} is returned.  If it lists {@code "deflate"},
     * an {@code InflaterInputStream} wrapping {@code 'is'} is returned.  The first of these
     * values that is encountered decides.  Otherwise {@code 'is'} itself is returned.
     * 
     * @throws IOException If constructing the {@code GZIPInputStream} fails, for instance
     * because the stream does not begin with a valid GZIP header.
     */
    public static InputStream checkHTTPCompression
        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
    {
        // Header names are case-insensitive, so Map.containsKey(...) cannot be used.
        // Every entry has to be inspected instead.

        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())
        {
            String headerName = header.getKey();

            // The map returned by HttpURLConnection has been known to contain a null key.
            if ((headerName == null) || (! headerName.equalsIgnoreCase("Content-Encoding")))
                continue;

            List<String> encodings = header.getValue();

            if (encodings == null) continue;

            for (String encoding : encodings)
            {
                if (encoding == null) continue;

                String coding = encoding.trim();

                if (coding.equalsIgnoreCase("gzip")) return new GZIPInputStream(is);

                // ZipInputStream reads PKZIP *archives* and reports EOF until an entry is
                // opened, so it cannot decode this content-coding.  InflaterInputStream
                // decodes the zlib stream directly.

                if (coding.equalsIgnoreCase("deflate")) return new InflaterInputStream(is);
            }
        }

        // Neither "gzip" nor "deflate" was declared: hand back the original stream.
        return is;
    }

    /**
     * Renders a {@code java.util.Map} of {@code HTTP Header} properties as readable text, one
     * header per line.  The expected input is the {@code Map} returned by
     * {@code HttpURLConnection.getHeaderFields()}.
     * 
     * <BR /><BR /><B>NULL KEY:</B> Maps returned by {@code HttpURLConnection} have been known to
     * contain a {@code null} key.  Such a key is printed as the text {@code "null"} rather than
     * causing a {@code NullPointerException}.  A {@code null} value-list is printed the same
     * way.
     * 
     * @param httpHeaders A {@code java.util.Map<String, List<String>>} of header-names to
     * header-values.
     * 
     * @return One line per map entry.  Each line holds the header-name followed by a colon,
     * passed through {@code StringParse.rightSpacePad} using a width of the longest header-name
     * plus five, then the {@code toString()} of the value-list, then a {@code '\n'}.
     */
    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
    {
        int max = 0;

        // Find the longest header-name, so that the value column can be aligned.
        // String.valueOf(...) is used because the key may be null.

        for (String key : httpHeaders.keySet())
        {
            int length = String.valueOf(key).length();
            if (length > max) max = length;
        }

        max += 5;

        StringBuilder sb = new StringBuilder();

        // The List<String> 'toString()' output is used unchanged for the values column.
        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())

            sb  .append(StringParse.rightSpacePad(String.valueOf(header.getKey()) + ':', max))
                .append(String.valueOf(header.getValue()))
                .append('\n');

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some various ways to open a connection to a website.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Converts the {@code String} to a {@code java.net.URL}
     * <BR />Invokes: {@link #openConn(URL)}
     */
    public static BufferedReader openConn(String url) throws IOException
    {
        URL target = new URL(url);

        return openConn(target);
    }

    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} for reading the response.  No charset is passed to the underlying
     * {@code InputStreamReader}, so the platform's default charset is used for decoding.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     * 
     * @param url This may be an Internet-{@code URL.}
     * 
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn(URL url) throws IOException
    {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");

        if (USE_USER_AGENT)
        {
            connection.setRequestProperty("User-Agent", USER_AGENT);
        }

        // The response headers decide whether the body needs a decompression wrapper.
        Map<String, List<String>> responseHeaders = connection.getHeaderFields();

        InputStream body = checkHTTPCompression(responseHeaders, connection.getInputStream());

        Reader decoded = new InputStreamReader(body);

        return new BufferedReader(decoded);
    }

    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} for reading it, <B><I>and also</I></B> the {@code HTTP Header} that
     * was returned by the {@code HTTP Server}.  No charset is passed to the underlying
     * {@code InputStreamReader}, so the platform's default charset is used for decoding.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * 
     * @param url This may be an Internet {@code URL}.
     * 
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *  
     * @throws IOException If opening the connection or obtaining its stream fails.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
        throws IOException
    {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");

        if (USE_USER_AGENT)
        {
            connection.setRequestProperty("User-Agent", USER_AGENT);
        }

        // These headers are both returned to the caller, and used to detect compression.
        Map<String, List<String>> httpHeaders = connection.getHeaderFields();

        InputStream     body    = checkHTTPCompression(httpHeaders, connection.getInputStream());
        BufferedReader  reader  = new BufferedReader(new InputStreamReader(body));

        return new Ret2<BufferedReader, Map<String, List<String>>>(reader, httpHeaders);
    }

    /**
     * Convenience Method.
     * <BR />Converts the {@code String} to a {@code java.net.URL}
     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
     */
    public static BufferedReader openConn_iso_8859_1(String url) throws IOException 
    {
        URL target = new URL(url);

        return openConn_iso_8859_1(target);
    }

    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} that decodes the response using the {@code ISO-8859-1} charset.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP> 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     * 
     * @param url This may be an Internet {@code URL}. The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     * 
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
    {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");

        if (USE_USER_AGENT)
        {
            connection.setRequestProperty("User-Agent", USER_AGENT);
        }

        connection.setRequestProperty("Content-Type", "text/html; charset=iso-8859-1");

        // The response headers decide whether the body needs a decompression wrapper.
        Map<String, List<String>> responseHeaders = connection.getHeaderFields();

        InputStream body    = checkHTTPCompression(responseHeaders, connection.getInputStream());
        Charset     latin1  = Charset.forName("iso-8859-1");

        return new BufferedReader(new InputStreamReader(body, latin1));
    }


    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} that decodes the response using the {@code ISO-8859-1} charset,
     * <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * 
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     * 
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *  
     * @throws IOException If opening the connection or obtaining its stream fails.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>>
        openConnGetHeader_iso_8859_1(URL url)
        throws IOException
    {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();

        con.setRequestMethod("GET");

        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);

        // NOTE(review): openConn_iso_8859_1(URL) sends "text/html; charset=iso-8859-1" here.
        // The value below is left unchanged; confirm which of the two is intended.

        con.setRequestProperty("Content-Type", "charset=iso-8859-1");

        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());

        // Charset.forName(...) requires a bare charset name.  Passing "charset=iso-8859-1"
        // throws IllegalCharsetNameException, since '=' is not a legal charset-name character.

        Charset latin1 = Charset.forName("iso-8859-1");

        return new Ret2<BufferedReader, Map<String, List<String>>>(
            new BufferedReader(new InputStreamReader(is, latin1)),
            httpHeaders
        );
    }

    /**
     * Convenience Method.
     * <BR />Converts the {@code String} to a {@code java.net.URL}
     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
     */
    public static BufferedReader openConn_UTF8(String url) throws IOException
    {
        URL target = new URL(url);

        return openConn_UTF8(target);
    }

    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} that decodes the response using the {@code UTF-8} charset.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     * 
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     * 
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_UTF8(URL url) throws IOException
    {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");

        if (USE_USER_AGENT)
        {
            connection.setRequestProperty("User-Agent", USER_AGENT);
        }

        connection.setRequestProperty("Content-Type", "charset=UTF-8");

        // The response headers decide whether the body needs a decompression wrapper.
        Map<String, List<String>> responseHeaders = connection.getHeaderFields();

        InputStream body = checkHTTPCompression(responseHeaders, connection.getInputStream());
        Charset     utf8 = Charset.forName("UTF-8");

        return new BufferedReader(new InputStreamReader(body, utf8));
    }

    /**
     * Opens an {@code HTTP GET} connection to a {@code URL}, and returns a
     * {@code BufferedReader} that decodes the response using the {@code UTF-8} charset,
     * <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * 
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     * 
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     *  
     * @throws IOException If opening the connection or obtaining its stream fails.
     * 
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
        throws IOException
    {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");

        if (USE_USER_AGENT)
        {
            connection.setRequestProperty("User-Agent", USER_AGENT);
        }

        connection.setRequestProperty("Content-Type", "charset=UTF-8");

        // These headers are both returned to the caller, and used to detect compression.
        Map<String, List<String>> httpHeaders = connection.getHeaderFields();

        InputStream     body    = checkHTTPCompression(httpHeaders, connection.getInputStream());
        Charset         utf8    = Charset.forName("UTF-8");
        BufferedReader  reader  = new BufferedReader(new InputStreamReader(body, utf8));

        return new Ret2<BufferedReader, Map<String, List<String>>>(reader, httpHeaders);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a String.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     * <BR />Closes: The {@code BufferedReader} that was opened, whether or not the read succeeds.
     */
    public static String scrapePage(String url) throws IOException
    {
        // The reader is created here, so it is this method's job to release it.
        try (BufferedReader br = openConn(url))
        {
            return scrapePage(br);
        }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} that was opened, whether or not the read succeeds.
     */
    public static String scrapePage(URL url) throws IOException
    {
        // The reader is created here, so it is this method's job to release it.
        try (BufferedReader br = openConn(url))
        {
            return scrapePage(br);
        }
    }

    /**
     * Reads every remaining line from a {@code BufferedReader}, and returns them as a single
     * {@code java.lang.String}.
     * 
     * <BR /><BR /><B>NOTE:</B> Each line is terminated with a {@code '\n'} in the returned
     * {@code String}, whichever line-terminator the source used.  The reader is not closed by
     * this method.
     * 
     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
     * output text/html data.
     * 
     * @return The text/html data - returned inside a {@code String}
     */
    public static String scrapePage(BufferedReader br) throws IOException
    {
        StringBuilder page = new StringBuilder();

        for (String line = br.readLine(); line != null; line = br.readLine())
        {
            page.append(line).append("\n");
        }

        return page.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     * <BR />Closes: The {@code BufferedReader} that was opened, whether or not the read succeeds.
     */
    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
        throws IOException
    {
        // The reader is created here, so it is this method's job to release it.
        try (BufferedReader br = openConn(url))
        {
            return scrapePageToVector(br, includeNewLine);
        }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} that was opened, whether or not the read succeeds.
     */
    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
        throws IOException
    {
        // The reader is created here, so it is this method's job to release it.
        try (BufferedReader br = openConn(url))
        {
            return scrapePageToVector(br, includeNewLine);
        }
    }

    /**
     * Reads every remaining line from a {@code BufferedReader} into a {@code Vector<String>},
     * one element per line.  Lines are split wherever {@code BufferedReader.readLine()} splits
     * them.
     * 
     * @param br  This is the input source of the HTML page.  It is read until it reports
     * end-of-stream, and is not closed by this method.
     * 
     * @param includeNewLine When {@code TRUE}, a {@code '\n'} character is appended to the end of
     * each {@code String} in the {@code Vector}.  When {@code FALSE}, the lines are stored
     * without any line-terminator.
     * 
     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
     * web-page.
     * 
     * @see #scrapePageToVector(String, boolean)
     */
    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
        throws IOException
    {
        Vector<String> lines = new Vector<>();

        for (String line = br.readLine(); line != null; line = br.readLine())
        {
            lines.add(includeNewLine ? (line + '\n') : line);
        }

        return lines;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Main HTML scrape functions - used by main class of "HTMLPage.getPageTokens()"
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Reads text/HTML from a {@code BufferedReader}, optionally limited to the region that
     * begins with {@code 'startTag'} and ends with {@code 'endTag'}.  Every line that is kept is
     * terminated with a {@code '\n'} in the result.
     * 
     * @param br The source of the text/HTML data.  It is read line-by-line.
     * 
     * @param startTag  If this is null, the scrape will begin with the first character received.
     * If this contains a {@code String}, the scrape will not include any text/HTML data that
     * occurs prior to the first occurrence of {@code 'startTag'}.  The tag itself is included.
     * 
     * @param endTag  If this is null, the scrape will read the entire remaining contents of the
     * {@code BufferedReader br} parameter.  If this contains a {@code String}, then data
     * will be read and included in the result until {@code 'endTag'} is received.  The tag
     * itself is included, and the rest of its line is discarded.
     * 
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String.} 
     * 
     * @throws ScrapeException If {@code 'startTag'} is non-null and no line contains it, or if
     * {@code 'endTag'} is non-null and no line read after the start contains it.
     */
    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag)
        throws IOException
    {
        StringBuffer    out = new StringBuffer();
        String          line;

        // Set when the end-tag shows up on the very same line as the start-tag.  This happens
        // when a page is transmitted as giant lines without line-breaks.

        boolean endTagOnStartLine = false;

        // PHASE 1: Discard everything that precedes the start-tag (if one was requested).
        if (startTag != null)
        {
            int startPos = -1;

            while ((line = br.readLine()) != null)
            {
                startPos = line.indexOf(startTag);

                if (startPos < 0) continue;

                // Keep the line from the start-tag onward
                String tail = line.substring(startPos);

                int endPos = (endTag == null) ? -1 : tail.indexOf(endTag);

                if (endPos >= 0)
                {
                    tail                = tail.substring(0, endPos + endTag.length());
                    endTagOnStartLine   = true;
                }

                out.append(tail).append("\n");
                break;
            }

            // Still negative: the reader ran out before any line contained the start-tag.
            if (startPos < 0) throw new ScrapeException
                ("Start Tag: '" + startTag + "' was Not Found on Page.");
        }

        // PHASE 2a: No end-tag requested, so everything that remains is kept.
        if (endTag == null)
        {
            while ((line = br.readLine()) != null) out.append(line).append("\n");

            return out;
        }

        // PHASE 2b: The end-tag was already consumed along with the start-tag's line.
        if (endTagOnStartLine) return out;

        // PHASE 2c: Keep lines until one of them contains the end-tag.
        while ((line = br.readLine()) != null)
        {
            int endPos = line.indexOf(endTag);

            if (endPos >= 0)
            {
                out.append(line, 0, endPos + endTag.length()).append("\n");

                return out;
            }

            out.append(line).append("\n");
        }

        throw new ScrapeException("End Tag: '" + endTag + "' was Not Found on Page.");
    }


    /**
     * Reads text/HTML from a {@code BufferedReader}, limited to a range of line-numbers.  Line
     * numbers start at {@code 1}, and both ends of the range are inclusive.  Every line that is
     * kept is terminated with a {@code '\n'} in the result.
     * 
     * @param br The source of the text/HTML data.  It is read line-by-line.
     * 
     * @param startLineNum  If this is {@code '0'} or {@code '1'}, the scrape will begin with the
     * first line received.  If this is a larger integer, the first
     * {@code startLineNum - 1} lines are read and discarded.
     * 
     * @param endLineNum  If this is negative, the scrape will read the entire remaining contents
     * of the {@code BufferedReader br} parameter (until {@code EOF} is encountered).  If this
     * is a positive integer, line number {@code endLineNum} is the last line that is included
     * in the result.  This includes the value {@code 1}, which requests exactly the first line.
     * 
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Call {@code toString()} on the return value to retrieve that {@code String}
     * 
     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative, if
     * {@code 'endLineNum'} is zero, or if {@code 'endLineNum'} is positive and smaller than
     * {@code 'startLineNum'}.
     * 
     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
     * parameter to be consistent with the values in {@code 'startLineNum'} and
     * {@code 'endLineNum'}
     */
    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
        throws IOException
    {
        if (startLineNum < 0) throw new IllegalArgumentException(
            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
            "allowed."
        );

        if (endLineNum == 0) throw new IllegalArgumentException
            ("The parameter endLineNum is zero, but this is not allowed.");

        // A dedicated flag is used for "read until EOF".  Re-using the value 1 as a sentinel
        // made an explicit endLineNum of 1 indistinguishable from a negative endLineNum.

        final boolean readToEOF = endLineNum < 0;

        if (startLineNum == 0) startLineNum = 1;

        if ((! readToEOF) && (endLineNum < startLineNum)) throw new IllegalArgumentException(
            "The parameter startLineNum is: " + startLineNum + "\n" +
            "The parameter endLineNum is: " + endLineNum + "\n" +
            "It is required that the latter is at least as large as the former, " +
            "or it must be negative to signify read until EOF."
        );

        // Number of lines successfully consumed from 'br' so far (kept or discarded).
        int linesRead = 0;

        // Discard the lines that precede startLineNum
        while (linesRead < (startLineNum - 1))
        {
            if (br.readLine() == null) throw new ScrapeException(
                "The HTML Page that was given didn't even have enough lines to read " +
                "quantity in variable startLineNum.\nstartLineNum = " + startLineNum + 
                " and read " + linesRead + " line(s) before EOF."
            );

            linesRead++;
        }

        StringBuffer    html = new StringBuffer();
        String          s;

        if (readToEOF)
        {
            while ((s = br.readLine()) != null) html.append(s).append('\n');

            return html;
        }

        // Keep the lines numbered startLineNum through endLineNum, both inclusive.
        while (linesRead < endLineNum)
        {
            if ((s = br.readLine()) == null) throw new ScrapeException(
                "The HTML Page that was read didn't have enough lines to read to quantity in " +
                "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
                linesRead + " line(s) before EOF."
            );

            html.append(s).append('\n');
            linesRead++;
        }

        return html;
    }
}