// HTMLPage.java.html

package Torello.HTML;

import java.io.*;
import java.util.Vector;
import java.net.URL;

import Torello.JavaDoc.StaticFunctional;
import Torello.JavaDoc.JDHeaderBackgroundImg;
import Torello.JavaDoc.Excuse;
import Torello.Java.UnreachableError;

/**
 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
 * of {@link HTMLNode}.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
 * 
 * @see Scrape#getHTML(BufferedReader, int, int)
 * @see Scrape#getHTML(BufferedReader, String, String)
 * @see HTMLPageMWT
 */
@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
@JDHeaderBackgroundImg
public class HTMLPage
{
    /** Private and empty: every member of this class is {@code static}, so no instance is needed. */
    private HTMLPage()
    { }

    /**
     * A function-pointer / lambda-target that (could) potentially be used to replace this
     * library's current regular-expression based parser with something possibly faster or even
     * more efficient.
     * 
     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
     * @see #parser
     */
    @FunctionalInterface
    public static interface Parser
    {
        /**
         * Parse html source-text into a {@code Vector<HTMLNode>}.
         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
         * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> If you have decided to implement a parser,
         * and you wish to ingore this parameter (and don't want to output such a file) - <I>it is
         * (hopefully) obvious that you may skip this step!</I>
         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
         */
        public Vector<HTMLNode> parse(
                CharSequence    html,
                boolean         eliminateHTMLTags,
                String          rawHTMLFile,
                String          matchesFile,
                String          justTextFile
            )
        throws IOException;
    }

    /**
     * The {@link Parser} instance that every {@code getPageTokens} method in this class
     * (directly or indirectly) invokes to convert HTML source-text into a
     * {@code Vector<HTMLNode>}.
     * 
     * <BR /><BR />If needing to "swap a proprietary parser" comes up, this is possible: assign a
     * different implementation to this field.  The replacement just needs to accept the same
     * parameters as the current parser, and produce a {@code Vector<HTMLNode>}.  This is not an
     * advised step to take, but if an alternative parser has been tested and happens to be
     * generating different results, it can be easily 'swapped out' for the one used now.
     * 
     * <BR /><BR />Because this field is {@code static} and not {@code final}, an assignment to it
     * affects all subsequent {@code getPageTokens} invocations.
     * @see Parser
     * @see Parser#parse
     */
    public static Parser parser = Torello.HTML.HelperPackages.parse.ParserRE::parsePageTokens;


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags)
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
            { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
            { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
        {
            return getPageTokens
                (br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
        }
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
        {
            return getPageTokens(
                br, eliminateHTMLTags,
                null, null,
                rawHTMLFile, matchesFile, justTextFile
            );
        }
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
        {
            return getPageTokens(
                br, eliminateHTMLTags,
                startTag, endTag,
                rawHTMLFile, matchesFile, justTextFile
            );
        }
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     * <BR />Closes: The {@code BufferedReader} obtained from {@code openConn}, once parsing has
     * either completed or thrown.
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // The reader is opened by this method, so this method is responsible for releasing it.
        // Closing a reader that has already been closed has no effect, so this stays safe even
        // if the invoked overload were to close it as well.

        try (BufferedReader br = Scrape.openConn(url))
        {
            return getPageTokens(
                br, eliminateHTMLTags,
                startLineNum, endLineNum,
                rawHTMLFile, matchesFile, justTextFile
            );
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source is from a CharSequence
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     * The three output-file parameters of {@link Parser#parse} are all passed null.
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</SPAN></B> This method does not throw any
     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
     * method that neither invokes the file-system, nor the web.  Should the parser throw an
     * {@code IOException} anyway, it is re-thrown wrapped in an {@code UnreachableError}.
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags)
        // NO IOException... NO I/O!
    {
        final Vector<HTMLNode> tokens;

        try
            { tokens = parser.parse(html, eliminateHTMLTags, null, null, null); }

        // Not expected to happen: the text is already in memory (no URL, no BufferedReader),
        // and no output-files were requested.

        catch (IOException unexpected)
            { throw new UnreachableError(unexpected); }

        return tokens;
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
     *      String, String, String, String, String)}
     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
     * possible!  If one is thrown anyway, it is wrapped into an {@code UnreachableError}.
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
    // NO IOException... NO I/O!
    {
        final String            noFile = null;
        final Vector<HTMLNode>  tokens;

        try
        {
            tokens = getPageTokens
                (html, eliminateHTMLTags, startTag, endTag, noFile, noFile, noFile);
        }

        // Not expected to happen: the text is already in memory (no URL, no BufferedReader),
        // and no output-files were requested.

        catch (IOException unexpected)
            { throw new UnreachableError(unexpected); }

        return tokens;
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
     *      int, int, String, String, String)}
     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
     * possible!  If one is thrown anyway, it is wrapped into an {@code UnreachableError}.
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        // NO IOException... NO I/O!
    {
        final String            noFile = null;
        final Vector<HTMLNode>  tokens;

        try
        {
            tokens = getPageTokens
                (html, eliminateHTMLTags, startLineNum, endLineNum, noFile, noFile, noFile);
        }

        // Not expected to happen: the text is already in memory (no URL, no BufferedReader),
        // and no output-files were requested.

        catch (IOException unexpected)
            { throw new UnreachableError(unexpected); }

        return tokens;
    }

    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     * All five parameters are handed, unmodified, to the current {@link #parser}.
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        final Vector<HTMLNode> tokens =
            parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile);

        return tokens;
    }

    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     * 
     * <BR /><BR />Only a sub-section of {@code 'html'} is handed to the parser.  The section
     * begins at the first occurrence of {@code 'startTag'}, and ends immediately after the first
     * occurrence of {@code 'endTag'} that is found at or after that beginning position.
     * 
     * <BR /><BR />The {@code BufferedReader} and {@code URL} convenience-overloads of this method
     * pass null to these two parameters, and for consistency with them null is accepted here too:
     * a null {@code 'startTag'} means the section begins at the first character of
     * {@code 'html'}, and a null {@code 'endTag'} means the section extends to the last character.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws IllegalArgumentException If a non-null {@code 'startTag'} is not found in
     * {@code 'html'}, or if a non-null {@code 'endTag'} is not found at or after the position
     * where the section begins.
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        final String htmlStr = html.toString();

        // Without a 'startTag', the section begins at the very first character
        int sPos = 0;

        if (startTag != null)
        {
            sPos = htmlStr.indexOf(startTag);

            if (sPos == -1) throw new IllegalArgumentException
                ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
        }

        // Without an 'endTag', the section runs through the very last character
        int ePos = htmlStr.length();

        if (endTag != null)
        {
            // NOTE: The search starts at the beginning of the 'startTag' match (not after it)
            ePos = htmlStr.indexOf(endTag, sPos);

            if (ePos == -1) throw new IllegalArgumentException
                ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");

            // The text of the 'endTag' itself is included in the parsed section
            ePos += endTag.length();
        }

        return parser.parse(
            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
            rawHTMLFile, matchesFile, justTextFile
        );
    }
    
    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Wraps: {@code 'html'} into a {@code StringReader}, and that into a
     * {@code BufferedReader}
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        ) 
        throws IOException
    {
        // Line-number selection is implemented by the BufferedReader variant, so the text is
        // presented to it as a reader.

        final Reader            source  = new StringReader(html.toString());
        final BufferedReader    br      = new BufferedReader(source);

        return getPageTokens(
            br, eliminateHTMLTags,
            startLineNum, endLineNum,
            rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The next 6 functions presume that the input is from a BufferedReader
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags)
        throws IOException
    {
        final String noTag  = null;
        final String noFile = null;

        return getPageTokens(br, eliminateHTMLTags, noTag, noTag, noFile, noFile, noFile);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
        throws IOException
    {
        final String noFile = null;

        return getPageTokens(br, eliminateHTMLTags, startTag, endTag, noFile, noFile, noFile);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        throws IOException
    {
        final String noFile = null;

        return getPageTokens
            (br, eliminateHTMLTags, startLineNum, endLineNum, noFile, noFile, noFile);
    }


    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />And-Accepts: {@code rawHTMLFile, matchesFile & justTextFile}
     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        final String noTag = null;

        return getPageTokens(
            br, eliminateHTMLTags,
            noTag, noTag,
            rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The 2 functions below read from a BufferedReader, and are invoked by the overloads above
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.  The text is first
     * retrieved using {@link Scrape#getHTML(BufferedReader, String, String)}, and then handed to
     * the current {@link #parser}.
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // Step 1: Retrieve the text from the reader
        final CharSequence html = Scrape.getHTML(br, startTag, endTag);

        // Step 2: Tokenize that text
        return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile);
    }

    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.  The text is first
     * retrieved using {@link Scrape#getHTML(BufferedReader, int, int)}, and then handed to the
     * current {@link #parser}.
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // Step 1: Retrieve the text from the reader
        final CharSequence html = Scrape.getHTML(br, startLineNum, endLineNum);

        // Step 2: Tokenize that text
        return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile);
    }
}