// Surrounding.java.html

package Torello.HTML;

import Torello.HTML.NodeSearch.*;

import static Torello.Java.C.*;

import Torello.Java.FileRW;
import Torello.Java.C;
import Torello.HTML.HelperPackages.parse.HTMLTagCounter;

import java.util.*;

import java.util.function.Predicate;
import java.net.URL;
import java.io.IOException;

/**
 * Class for finding ancestor &amp; parent nodes of any selected {@link HTMLNode}.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING>
 */
@Torello.JavaDoc.StaticFunctional
public class Surrounding
{
    // Static-only utility class: instantiation is not permitted.
    private Surrounding() { }

    /**
     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
     * with its closing element - as a {@code DotPair} - that matches.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * 
     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
     * Java-Script DOM Tree term).
     * 
     * @param htmlTags If this list is empty, we shall look for any ancestor node.  Since this
     * method returns the first, if this list is left empty, and the index-node is surrounded by
     * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is
     * returned.  If this list is left non-empty, then only ancestor nodes whose HTML Element
     * Tag (usually referred to as "the Element") matches a tag from this list shall be returned.
     *
     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as
     * values to this parameter - <I>the search loop would skip over all ancestors that were not
     * HTML divider, paragraph or anchor elements</I> before selecting a result.
     * 
     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
     * constructed by this method.
     * 
     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
     * vectorized-html parameter {@code 'html'}
     * 
     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
     * 
     * @see #FIRST(Vector, int, HTMLTagCounter)
     * @see ARGCHECK#index(Vector, int)
     */
    public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags)
    {
        return FIRST(
            html, ARGCHECK.index(html, index),
            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST)
        );
    }

    /**
     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
     * with its closing element - as a {@code DotPair} - that matches the input-parameter
     * {@code 'htmlTags'}.  In this case, the term {@code 'except'} shall mean that any matches
     * whose HTML Token is among the list in parameter {@code String... htmlTags} will be
     * <B>skipped</B>, and a "higher-level" ancestor will be returned instead.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * 
     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
     * Java-Script {@code DOM Tree} term).
     * 
     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
     * loop will skip over ancestor nodes that are among the members of this var-args parameter
     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
     * loop will return the first ancestor node identified.
     *
     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
     * this method, then the search-loop will continue looking for higher-level ancestors -
     * <I>until one is found that is not an HTML {@code 'bold'} or {@code 'paragraph'} element 
     * {@code DotPair}.</I>
     * 
     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
     * constructed by this method.
     * 
     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
     * vectorized-html parameter {@code 'html'}
     * 
     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
     * 
     * @see #FIRST(Vector, int, HTMLTagCounter)
     * @see ARGCHECK#index(Vector, int)
     */
    public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
    {
        return FIRST(
            html, ARGCHECK.index(html, index),
            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST)
        );
    }

    /**
     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
     * parameter {@code String... htmlTags} is empty, all HTML elements will be considered.  If
     * this parameter contains any elements, then only those elements shall be considered as a
     * match in the ancestor hierarchy tree.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * 
     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
     * Java-Script {@code DOM Tree} term).
     * 
     * @param htmlTags If this list is empty, we shall look for <B><I>all ancestor nodes.</I></B> 
     * If this list is left non-empty, then only ancestor nodes whose HTML Element Tag (usually
     * referred to as "the token") are members of this varargs {@code String} parameter list
     * shall be considered eligible for inclusion in the result returned by this method.
     *
     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the
     * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor
     * elements</I> when collecting results.
     * 
     * @return This shall return <B><I>every</I></B> sub-list, as a {@code 'DotPair'}
     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
     * constructed by this method.
     * 
     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
     * vectorized-html parameter {@code 'html'}
     * 
     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of
     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
     * 
     * @see #ALL(Vector, int, HTMLTagCounter)
     * @see ARGCHECK#index(Vector, int)
     */
    public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags)
    { 
        return ALL(
            html, ARGCHECK.index(html, index),
            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL)
        );
    }

    /**
     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
     * parameter {@code String... htmlTags} is empty, all HTML elements will be considered.  If
     * this parameter contains any elements, then those elements
     * <B><I>shall not be considered</I></B> as a match in the ancestor hierarchy tree.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * 
     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
     * Java-Script {@code DOM Tree} term).
     * 
     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
     * loop will skip over ancestor nodes that are among the members of this var-args parameter
     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
     * loop will return all ancestor nodes of the index node.
     *
     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
     * this method, then the search-loop which is saving all ancestor matches to its result-set,
     * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}.
     * 
     * @return This shall return <B><I>every</I></B> sub-list, as a {@code 'DotPair'}
     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
     * constructed by this method.
     * 
     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
     * vectorized-html parameter {@code 'html'}
     * 
     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
     * 
     * @see #ALL(Vector, int, HTMLTagCounter)
     * @see ARGCHECK#index(Vector, int)
     */
    public static Vector<DotPair> allExcept
        (Vector<? extends HTMLNode> html, int index, String... htmlTags)
    {
        return ALL(
            html, ARGCHECK.index(html, index),
            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL)
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // FIND INTERNAL METHODS
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair.
     * 
     * <BR /><BR />The vector is scanned backwards, starting at the position immediately before
     * {@code 'index'}.  The scan stops early once {@code tagCounter.allBanned()} reports
     * {@code TRUE}.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * @param index This is any index within the bounds of the {@code 'html'} parameter.
     * @param tagCounter Any internally used counter, to optimize the search routine.
     * 
     * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 
     * {@code 'DotPair'}, or null if the scan finishes without finding one.
     * 
     * @see TagNode
     * @see HTMLNode
     * @see DotPair
     * @see DotPair#isInside(int)
     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
     */
    protected static DotPair FIRST
        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
    {
        final int size = html.size();

        for (int i = index - 1; (i >= 0) && (! tagCounter.allBanned()); i--)
        {
            // Only nodes for which openTag() yields a TagNode are candidate ancestors.
            TagNode tn = html.elementAt(i).openTag();
            if (tn == null) continue;

            // Let the counter decide whether this tag is currently an eligible match.
            if (! tagCounter.check(tn)) continue;

            // Look up the start/end pair for the element opened at position 'i'.
            DotPair ret = Util.Inclusive.dotPairOPT(html, i, size);

            // isInside(...) should never fail here, but it is retained as a safeguard that
            // guarantees an erroneous answer is never returned.
            if ((ret != null) && ret.isInside(index))

                // If there is a match, return that match, and exit immediately.
                return ret;
        }

        return null;
    }

    /**
     * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pairs.
     * 
     * <BR /><BR />The vector is scanned backwards, starting at the position immediately before
     * {@code 'index'}.  The scan stops early once {@code tagCounter.allBanned()} reports
     * {@code TRUE}.  Whenever an accepted tag does not produce a pair enclosing {@code 'index'},
     * that tag's token is reported to the counter via {@code reportFailed}.
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
     * @param index This is any index within the bounds of the {@code 'html'} parameter.
     * @param tagCounter Any internally used counter, to optimize the search routine.
     * 
     * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs
     * inside a {@code Vector<DotPair>}.  The {@code Vector} is empty when nothing matched.
     * 
     * @see TagNode
     * @see HTMLNode
     * @see DotPair
     * @see DotPair#isInside(int)
     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
     */
    protected static Vector<DotPair> ALL
        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
    {
        final int       size = html.size();
        Vector<DotPair> ret  = new Vector<>();

        for (int i = index - 1; (i >= 0) && (! tagCounter.allBanned()); i--)
        {
            HTMLNode n = html.elementAt(i);

            // Anything that is not a TagNode cannot be an ancestor element.
            if (! n.isTagNode()) continue;

            // Note that, unlike FIRST, every TagNode at this position is handed to the counter.
            TagNode tn = (TagNode) n;
            if (! tagCounter.check(tn)) continue;

            DotPair dp = Util.Inclusive.dotPairOPT(html, i, size);

            // isInside(...) should never fail here, but it is retained as a safeguard that
            // guarantees an erroneous answer is never recorded.
            if ((dp != null) && dp.isInside(index))
                ret.addElement(dp);

            else
                // If finding a token match fails, just ignore that token from now on...
                tagCounter.reportFailed(tn.tok);
        }

        return ret;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Tester, leave it here!  It's not doing you no harm.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Ad-hoc tester.  Retrieves the page at {@code 'urlStr'}, locates a text-node using the
     * search-string {@code "a count of how many"}, and then prints the results of
     * {@link #firstExcept(Vector, int, String[])} and {@link #allExcept(Vector, int, String[])}
     * for that node to {@code System.out}.
     * 
     * @param urlStr The URL (as a {@code String}) of the page to retrieve and search.  An example
     * that has been used with this tester is:
     * {@code http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/Torello/HTML/NodeSearch/CommentNodeCount.html}
     * 
     * @param fileName When non-null, the printed report is also written to this file after being
     * passed through {@code C.toHTML}.  When null, no file is written.
     * 
     * @throws IOException Declared by this method; presumably raised when retrieving the page or
     * writing the output file fails.
     */
    static void test(String urlStr, String fileName) throws IOException
    {
        StringBuilder       sb      = new StringBuilder();
        URL                 url     = new URL(urlStr);
        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(url, false);

        // If the search text is absent and 'pos' is not a valid index, the ARGCHECK.index call
        // inside firstExcept is documented to reject it with ArrayIndexOutOfBoundsException.
        int     pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many");
        DotPair dp  = Surrounding.firstExcept(page, pos, "li", "body", "div");

        // firstExcept returns null when no ancestor qualifies.  String concatenation renders a
        // null reference as "null", so no explicit toString() call is made on 'dp' here.
        sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n");
        sb.append("Index Found: " + pos + ", DotPair Found: " + dp + "\n");

        // There is no sub-list to print when no ancestor was found.
        if (dp != null) sb.append(Debug.print(page, dp, Debug::J) + "\n");

        Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div");

        // ALL only ever stores non-null pairs, so each element may be dereferenced directly.
        for (DotPair pair : allDP) sb.append(
            BCYAN + 
            "************************************************************\n" +
            "************************************************************\n" + RESET +
            "Index Found: " + pos + ", DotPair Found: " + pair.toString() + "\n" +
            "Starting Node: " + BRED + page.elementAt(pair.start).str + RESET + "\n" +
            "Ending Node:" + BRED + page.elementAt(pair.end).str + RESET + "\n"
        );

        String s = sb.toString();
        System.out.println(s);

        // Escape the angle-brackets so that the HTML in the report stays visible as text.
        if (fileName != null)
            FileRW.writeFile(C.toHTML(s.replace("<", "&lt;").replace(">", "&gt;")), fileName);
    }
}