001package Torello.HTML;
002
003import Torello.HTML.NodeSearch.*;
004
005import static Torello.Java.C.*;
006
007import Torello.Java.FileRW;
008import Torello.Java.C;
009import Torello.HTML.HelperPackages.parse.HTMLTagCounter;
010
011import java.util.*;
012
013import java.util.function.Predicate;
014import java.net.URL;
015import java.io.IOException;
016
017/**
018 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}.
019 * 
020 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING>
021 */
022@Torello.JavaDoc.StaticFunctional
023public class Surrounding
024{
025    private Surrounding() { }
026
027    /**
028     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
029     * with it's closing element - as a {@code DotPair} - that matches.
030     * 
031     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
032     * 
033     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
034     * Java-Script DOM Tree term).
035     * 
036     * @param htmlTags If this list is empty, we shall look for any ancestor node.  Since this
037     * method returns the first, if this list is left empty, and the index-node is surrounded by
038     * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is
039     * returned.  If this list is left non-empty, then the only ancestor nodes whose HTML Element
040     * Tag (usually referred to as "the Element") matches a tag from this list shall be returned.
041     *
042     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as
043     * values to this parameter - <I>the search loop would skip over all ancestors that were not
044     * HTML divider, paragraph or anchor elements</I> before selecting a result.
045     * 
046     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
047     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
048     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
049     * constructed by this method.
050     * 
051     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
052     * vectorized-html parameter {@code 'html'}
053     * 
054     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
055     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
056     * 
057     * @see #FIRST(Vector, int, HTMLTagCounter)
058     * @see ARGCHECK#index(Vector, int)
059     */
060    public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags)
061    {
062        return FIRST(
063            html, ARGCHECK.index(html, index),
064            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST)
065        );
066    }
067
068    /**
069     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
070     * with it's closing element - as a {@code DotPair} - that matches the input-parameter
071     * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose
072     * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>,
073     * and a "higher-level" ancestor will be returned instead.
074     * 
075     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
076     * 
077     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
078     * Java-Script {@code DOM Tree} term).
079     * 
080     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
081     * loop will skip over ancestor nodes that are among the members of this var-args parameter
082     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
083     * loop will return the first anestor node identified.
084     *
085     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
086     * this method, then the search-loop will continue looking for higher-level ancestors -
087     * <I>until one was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element 
088     * {@code DotPair}.</I>
089     * 
090     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
091     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
092     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
093     * constructed by this method.
094     * 
095     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
096     * vectorized-html parameter {@code 'html'}
097     * 
098     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
099     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
100     * 
101     * @see #FIRST(Vector, int, HTMLTagCounter)
102     * @see ARGCHECK#index(Vector, int)
103     */
104    public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
105    {
106        return FIRST(
107            html, ARGCHECK.index(html, index),
108            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST)
109        );
110    }
111
112    /**
113     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
114     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
115     * parameter contains any elements, then only those elements shall be considered as match in
116     * the ancestor hierarchy tree.
117     * 
118     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
119     * 
120     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
121     * Java-Script {@code DOM Tree} term).
122     * 
123     * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 
124     * Since this method returns the first ancestor node-pair found, f this list is left non-empty,
125     * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are
126     * members of this varargs {@code String} parameter list shall be considered eligible as a
127     * return result for this method.
128     *
129     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the
130     * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor
131     * elements</I> before selecting a result.
132     * 
133     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
134     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
135     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
136     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
137     * constructed by this method.
138     * 
139     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
140     * vectorized-html parameter {@code 'html'}
141     * 
142     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of
143     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
144     * 
145     * @see #ALL(Vector, int, HTMLTagCounter)
146     * @see ARGCHECK#index(Vector, int)
147     */
148    public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags)
149    { 
150        return ALL(
151            html, ARGCHECK.index(html, index),
152            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL)
153        );
154    }
155
156    /**
157     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
158     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
159     * parameter contains any elements, then those elements <B><I>shall not be considered</B></I>
160     * as a match in the ancestor hierarchy tree.
161     * 
162     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
163     * 
164     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
165     * Java-Script {@code DOM Tree} term).
166     * 
167     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
168     * loop will skip over ancestor nodes that are among the members of this var-args parameter
169     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
170     * loop will return all ancestor nodes of the index node.
171     *
172     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
173     * this method, then the search-loop which is saving all ancestor matches to it's result-set,
174     * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}.
175     * 
176     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
177     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
178     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
179     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
180     * constructed by this method.
181     * 
182     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
183     * vectorized-html parameter {@code 'html'}
184     * 
185     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
186     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
187     * 
188     * @see #ALL(Vector, int, HTMLTagCounter)
189     * @see ARGCHECK#index(Vector, int)
190     */
191    public static Vector<DotPair> allExcept
192        (Vector<? extends HTMLNode> html, int index, String... htmlTags)
193    {
194        return ALL(
195            html, ARGCHECK.index(html, index),
196            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL)
197        );
198    }
199
200
201    // ********************************************************************************************
202    // ********************************************************************************************
203    // FIND INTERNAL METHODS
204    // ********************************************************************************************
205    // ********************************************************************************************
206
207
208    /**
209     * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair.
210     * 
211     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
212     * @param index This is any index within the bounds of the {@code 'html'} parameter.
213     * @param tagCounter Any internally used counter, to optimize the search routine.
214     * 
215     * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 
216     * {@code 'DotPair'}.
217     * 
218     * @see TagNode
219     * @see HTMLNode
220     * @see DotPair
221     * @see DotPair#isInside(int)
222     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
223     */
224    protected static DotPair FIRST
225        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
226    {
227        int     size = html.size();
228        TagNode tn;
229        DotPair ret;
230
231        for (   int i=(index-1);
232                (i >= 0) && (! tagCounter.allBanned());
233                i--
234        )
235
236            if (    ((tn = html.elementAt(i).openTag()) != null)
237                &&  tagCounter.check(tn)
238                &&  ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null)
239                &&  ret.isInside(index)
240                    // isInside(...) Should never fail, but 
241            )       // This guarantees to prevent erroneous answers
242
243                // If there is a match, return that match, and exit immediately.
244                return ret;
245
246        return null;
247    }
248
249    /**
250     * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs.
251     * 
252     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
253     * @param index This is any index within the bounds of the {@code 'html'} parameter.
254     * @param tagCounter Any internally used counter, to optimize the search routine.
255     * 
256     * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs
257     * inside a {@code Vector<DotPair>}
258     * 
259     * @see TagNode
260     * @see HTMLNode
261     * @see DotPair
262     * @see DotPair#isInside(int)
263     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
264     */
265    protected static Vector<DotPair> ALL
266        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
267    {
268        HTMLNode n;     TagNode tn;     DotPair dp;     int size = html.size();
269        Vector<DotPair> ret = new Vector<>();
270
271        for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--)
272
273            if (    (n = html.elementAt(i)).isTagNode()
274                &&  tagCounter.check(tn = (TagNode) n)
275            )
276            {
277                if (    ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null)
278                    &&  dp.isInside(index)
279                )           // isInside(...) Should never fail, but 
280                            // This guarantees to prevent erroneous answers
281                    ret.addElement(dp);
282
283                else
284                    // If finding a token match fails, just ignore that token from now on...
285                    tagCounter.reportFailed(tn.tok);
286
287            }
288
289        return ret;
290    }
291
292
293    // ********************************************************************************************
294    // ********************************************************************************************
295    // Tester, leave it here!  It's not doing you no harm.
296    // ********************************************************************************************
297    // ********************************************************************************************
298
299
300    static void test(String urlStr, String fileName) throws IOException
301    {
302        // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" +
303        //      "Torello/HTML/NodeSearch/CommentNodeCount.html";
304    
305        StringBuilder       sb      = new StringBuilder();
306        URL                 url     = new URL(urlStr);
307        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(url, false);
308
309        int     pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many");
310        DotPair dp  = Surrounding.firstExcept(page, pos, "li", "body", "div");
311
312        sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n");
313        sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n");
314        sb.append(Debug.print(page, dp, Debug::J) + "\n");
315
316        Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div");
317
318        for (DotPair l : allDP) sb.append(
319            BCYAN + 
320            "************************************************************\n" +
321            "************************************************************\n" + RESET +
322            "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" +
323            "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" +
324            "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n"
325        );
326
327        String s = sb.toString();
328        System.out.println(s);
329
330        if (fileName != null)
331            FileRW.writeFile(C.toHTML(s.replace("<", "&lt;").replace(">", "&gt;")), fileName);
332    }
333}