001package Torello.HTML;
002
003import Torello.HTML.NodeSearch.*;
004
005import Torello.Java.FileRW;
006import Torello.Java.Shell.C;
007import Torello.HTML.parse.HTMLTagCounter;
008
009import java.util.*;
010
011import java.util.function.Predicate;
012import java.net.URL;
013import java.io.IOException;
014
015/**
016 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}.
017 * 
018 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID=SURROUNDING>
019 */
020@Torello.HTML.Tools.JavaDoc.StaticFunctional
021public class Surrounding
022{
023    private Surrounding() { }
024
025    /**
026     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
027     * with it's closing element - as a {@code DotPair} - that matches.
028     * 
029     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
030     * 
031     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
032     * Java-Script DOM Tree term).
033     * 
034     * @param htmlTags If this list is empty, we shall look for any ancestor node.  Since this
035     * method returns the first, if this list is left empty, and the index-node is surrounded by
036     * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is
037     * returned.  If this list is left non-empty, then the only ancestor nodes whose HTML Element
038     * Tag (usually referred to as "the Element") matches a tag from this list shall be returned.
039     *
040     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as
041     * values to this parameter - <I>the search loop would skip over all ancestors that were not
042     * HTML divider, paragraph or anchor elements</I> before selecting a result.
043     * 
044     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
045     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
046     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
047     * constructed by this method.
048     * 
049     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
050     * vectorized-html parameter {@code 'html'}
051     * 
052     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
053     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
054     * 
055     * @see #FIRST(Vector, int, HTMLTagCounter)
056     * @see ARGCHECK#index(Vector, int)
057     */
058    public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags)
059    {
060        return FIRST(
061            html, ARGCHECK.index(html, index),
062            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST)
063        );
064    }
065
066    /**
067     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
068     * with it's closing element - as a {@code DotPair} - that matches the input-parameter
069     * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose
070     * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>,
071     * and a "higher-level" ancestor will be returned instead.
072     * 
073     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
074     * 
075     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
076     * Java-Script {@code DOM Tree} term).
077     * 
078     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
079     * loop will skip over ancestor nodes that are among the members of this var-args parameter
080     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
081     * loop will return the first anestor node identified.
082     *
083     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
084     * this method, then the search-loop will continue looking for higher-level ancestors -
085     * <I>until one was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element 
086     * {@code DotPair}.</I>
087     * 
088     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
089     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
090     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
091     * constructed by this method.
092     * 
093     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
094     * vectorized-html parameter {@code 'html'}
095     * 
096     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
097     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
098     * 
099     * @see #FIRST(Vector, int, HTMLTagCounter)
100     * @see ARGCHECK#index(Vector, int)
101     */
102    public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
103    {
104        return FIRST(
105            html, ARGCHECK.index(html, index),
106            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST)
107        );
108    }
109
110    /**
111     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
112     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
113     * parameter contains any elements, then only those elements shall be considered as match in
114     * the ancestor hierarchy tree.
115     * 
116     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
117     * 
118     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
119     * Java-Script {@code DOM Tree} term).
120     * 
121     * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 
122     * Since this method returns the first ancestor node-pair found, f this list is left non-empty,
123     * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are
124     * members of this varargs {@code String} parameter list shall be considered eligible as a
125     * return result for this method.
126     *
127     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the
128     * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor
129     * elements</I> before selecting a result.
130     * 
131     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
132     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
133     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
134     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
135     * constructed by this method.
136     * 
137     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
138     * vectorized-html parameter {@code 'html'}
139     * 
140     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of
141     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
142     * 
143     * @see #ALL(Vector, int, HTMLTagCounter)
144     * @see ARGCHECK#index(Vector, int)
145     */
146    public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags)
147    { 
148        return ALL(
149            html, ARGCHECK.index(html, index),
150            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL)
151        );
152    }
153
154    /**
155     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
156     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
157     * parameter contains any elements, then those elements <B><I>shall not be considered</B></I>
158     * as a match in the ancestor hierarchy tree.
159     * 
160     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
161     * 
162     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
163     * Java-Script {@code DOM Tree} term).
164     * 
165     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
166     * loop will skip over ancestor nodes that are among the members of this var-args parameter
167     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
168     * loop will return all ancestor nodes of the index node.
169     *
170     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
171     * this method, then the search-loop which is saving all ancestor matches to it's result-set,
172     * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}.
173     * 
174     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
175     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
176     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
177     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
178     * constructed by this method.
179     * 
180     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
181     * vectorized-html parameter {@code 'html'}
182     * 
183     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
184     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
185     * 
186     * @see #ALL(Vector, int, HTMLTagCounter)
187     * @see ARGCHECK#index(Vector, int)
188     */
189    public static Vector<DotPair> allExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
190    {
191        return ALL(
192            html, ARGCHECK.index(html, index),
193            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL)
194        );
195    }
196
197    // ********************************************************************************************
198    // FIND INTERNAL METHODS
199    // ********************************************************************************************
200
201    /**
202     * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair.
203     * 
204     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
205     * 
206     * @param index This is any index within the bounds of the {@code 'html'} parameter.
207     * 
208     * @param tagCounter Any internally used counter, to optimize the search routine.
209     * 
210     * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 
211     * {@code 'DotPair'}.
212     * 
213     * @see TagNode
214     * @see HTMLNode
215     * @see DotPair
216     * @see DotPair#isInside(int)
217     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
218     */
219    protected static DotPair FIRST(Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
220    {
221        int     size = html.size();
222        TagNode tn;
223        DotPair ret;
224
225        for (   int i=(index-1);
226                (i >= 0) && (! tagCounter.allBanned());
227                i--
228        )
229
230            if (    ((tn = html.elementAt(i).openTag()) != null)
231                    && tagCounter.check(tn)
232                    && ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null)
233                    && ret.isInside(index)  // isInside(...) Should never fail, but 
234            )                               // This guarantees to prevent erroneous answers
235
236                // If there is a match, return that match, and exit immediately.
237                return ret;
238
239        return null;
240    }
241
242    /**
243     * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs.
244     * 
245     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
246     * 
247     * @param index This is any index within the bounds of the {@code 'html'} parameter.
248     * 
249     * @param tagCounter Any internally used counter, to optimize the search routine.
250     * 
251     * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs
252     * inside a {@code Vector<DotPair>}
253     * 
254     * @see TagNode
255     * @see HTMLNode
256     * @see DotPair
257     * @see DotPair#isInside(int)
258     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
259     */
260    protected static Vector<DotPair> ALL(Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
261    {
262        HTMLNode n;     TagNode tn;     DotPair dp;     int size = html.size();
263        Vector<DotPair> ret = new Vector<>();
264
265        for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--)
266
267            if (    (n = html.elementAt(i)).isTagNode()
268                &&  tagCounter.check(tn = (TagNode) n)
269            )
270            {
271                if (    ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null)
272                    &&  dp.isInside(index)
273                )                                   // isInside(...) Should never fail, but 
274                                                    // This guarantees to prevent erroneous answers
275                    ret.addElement(dp);
276                else
277                    tagCounter.reportFailed(tn.tok);    // If finding a token match fails, just ignore that token from now
278                                                        // on...
279            }
280
281        return ret;
282    }
283
284    // *****************************************************************************************************************************
285    // Tester, leave it here!  It's not doing anybody any harm.
286    // *****************************************************************************************************************************
287
288    static void test(String urlStr, String fileName) throws IOException
289    {
290        // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/Torello/HTML/NodeSearch/CommentNodeCount.html";
291    
292        StringBuilder       sb      = new StringBuilder();
293        URL                 url     = new URL(urlStr);
294        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(url, false);
295        int                 pos     = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many");
296        DotPair             dp      = Surrounding.firstExcept(page, pos, "li", "body", "div");
297
298        sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n");
299        sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n");
300        sb.append(Debug.print(page, dp, Debug::J) + "\n");
301
302        Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div");
303
304        for (DotPair l : allDP)
305            sb.append(C.BCYAN + 
306                "************************************************************\n" +
307                "************************************************************\n" + C.RESET +
308                "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" +
309                "Starting Node: " + C.BRED + page.elementAt(l.start).str + C.RESET + "\n" +
310                "Ending Node:" + C.BRED + page.elementAt(l.end).str + C.RESET + "\n" );
311
312        String s = sb.toString();
313        System.out.println(s);
314
315        if (fileName != null)
316            FileRW.writeFile(C.toHTML(s.replace("<", "&lt;").replace(">", "&gt;")), fileName);
317    }
318}