001package Torello.HTML;
002
003import Torello.HTML.NodeSearch.*;
004
005import static Torello.Java.C.*;
006
007import Torello.Java.FileRW;
008import Torello.Java.C;
009
010import java.util.*;
011
012import java.util.function.Predicate;
013import java.net.URL;
014import java.io.IOException;
015
016/**
017 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}.
018 * 
019 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING>
020 */
021@Torello.JavaDoc.StaticFunctional
022public class Surrounding
023{
024    private Surrounding() { }
025
026    /**
027     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
028     * with it's closing element - as a {@code DotPair} - that matches.
029     * 
030     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
031     * 
032     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
033     * Java-Script DOM Tree term).
034     * 
035     * @param htmlTags If this list is empty, we shall look for any ancestor node.  Since this
036     * method returns the first, if this list is left empty, and the index-node is surrounded by
037     * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is
038     * returned.  If this list is left non-empty, then the only ancestor nodes whose HTML Element
039     * Tag (usually referred to as "the Element") matches a tag from this list shall be returned.
040     *
041     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as
042     * values to this parameter - <I>the search loop would skip over all ancestors that were not
043     * HTML divider, paragraph or anchor elements</I> before selecting a result.
044     * 
045     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
046     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
047     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
048     * constructed by this method.
049     * 
050     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
051     * vectorized-html parameter {@code 'html'}
052     * 
053     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
054     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
055     * 
056     * @see #FIRST(Vector, int, HTMLTagCounter)
057     * @see ARGCHECK#index(Vector, int)
058     */
059    public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags)
060    {
061        return FIRST(
062            html, ARGCHECK.index(html, index),
063            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST)
064        );
065    }
066
067    /**
068     * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along
069     * with it's closing element - as a {@code DotPair} - that matches the input-parameter
070     * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose
071     * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>,
072     * and a "higher-level" ancestor will be returned instead.
073     * 
074     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
075     * 
076     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
077     * Java-Script {@code DOM Tree} term).
078     * 
079     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
080     * loop will skip over ancestor nodes that are among the members of this var-args parameter
081     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
082     * loop will return the first anestor node identified.
083     *
084     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
085     * this method, then the search-loop will continue looking for higher-level ancestors -
086     * <I>until one was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element 
087     * {@code DotPair}.</I>
088     * 
089     * @return This shall return the first sub-list, as a {@code 'DotPair'} (start &amp; end index
090     * pair). If no matches are found, null will return.  This sublist is nearly identical to the
091     * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are
092     * constructed by this method.
093     * 
094     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
095     * vectorized-html parameter {@code 'html'}
096     * 
097     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
098     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
099     * 
100     * @see #FIRST(Vector, int, HTMLTagCounter)
101     * @see ARGCHECK#index(Vector, int)
102     */
103    public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags)
104    {
105        return FIRST(
106            html, ARGCHECK.index(html, index),
107            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST)
108        );
109    }
110
111    /**
112     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
113     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
114     * parameter contains any elements, then only those elements shall be considered as match in
115     * the ancestor hierarchy tree.
116     * 
117     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
118     * 
119     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
120     * Java-Script {@code DOM Tree} term).
121     * 
122     * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 
123     * Since this method returns the first ancestor node-pair found, f this list is left non-empty,
124     * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are
125     * members of this varargs {@code String} parameter list shall be considered eligible as a
126     * return result for this method.
127     *
128     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the
129     * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor
130     * elements</I> before selecting a result.
131     * 
132     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
133     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
134     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
135     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
136     * constructed by this method.
137     * 
138     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
139     * vectorized-html parameter {@code 'html'}
140     * 
141     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of
142     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
143     * 
144     * @see #ALL(Vector, int, HTMLTagCounter)
145     * @see ARGCHECK#index(Vector, int)
146     */
147    public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags)
148    { 
149        return ALL(
150            html, ARGCHECK.index(html, index),
151            new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL)
152        );
153    }
154
155    /**
156     * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index.  If
157     * parameter {@code String... htmlTags} is null, all HTML elements will be considered.  If this
158     * parameter contains any elements, then those elements <B><I>shall not be considered</B></I>
159     * as a match in the ancestor hierarchy tree.
160     * 
161     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
162     * 
163     * @param index This is the index of the node for whose "ancestors" we are searching (to use a
164     * Java-Script {@code DOM Tree} term).
165     * 
166     * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search
167     * loop will skip over ancestor nodes that are among the members of this var-args parameter
168     * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search
169     * loop will return all ancestor nodes of the index node.
170     *
171     * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to
172     * this method, then the search-loop which is saving all ancestor matches to it's result-set,
173     * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}.
174     * 
175     * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'}
176     * (start &amp; end index pair).  If no matches are found, an empty {@code Vector} of
177     * zero-elements shall return.  These sublists are nearly identical to the Java-Script
178     * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are
179     * constructed by this method.
180     * 
181     * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 
182     * vectorized-html parameter {@code 'html'}
183     * 
184     * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 
185     * {@code class HTMLTags} - specifically if they are not valid HTML Elements.
186     * 
187     * @see #ALL(Vector, int, HTMLTagCounter)
188     * @see ARGCHECK#index(Vector, int)
189     */
190    public static Vector<DotPair> allExcept
191        (Vector<? extends HTMLNode> html, int index, String... htmlTags)
192    {
193        return ALL(
194            html, ARGCHECK.index(html, index),
195            new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL)
196        );
197    }
198
199
200    // ********************************************************************************************
201    // ********************************************************************************************
202    // FIND INTERNAL METHODS
203    // ********************************************************************************************
204    // ********************************************************************************************
205
206
207    /**
208     * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair.
209     * 
210     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
211     * @param index This is any index within the bounds of the {@code 'html'} parameter.
212     * @param tagCounter Any internally used counter, to optimize the search routine.
213     * 
214     * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 
215     * {@code 'DotPair'}.
216     * 
217     * @see TagNode
218     * @see HTMLNode
219     * @see DotPair
220     * @see DotPair#isInside(int)
221     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
222     */
223    protected static DotPair FIRST
224        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
225    {
226        int     size = html.size();
227        TagNode tn;
228        DotPair ret;
229
230        for (   int i=(index-1);
231                (i >= 0) && (! tagCounter.allBanned());
232                i--
233        )
234
235            if (    ((tn = html.elementAt(i).openTag()) != null)
236                &&  tagCounter.check(tn)
237                &&  ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null)
238                &&  ret.isInside(index)
239                    // isInside(...) Should never fail, but 
240            )       // This guarantees to prevent erroneous answers
241
242                // If there is a match, return that match, and exit immediately.
243                return ret;
244
245        return null;
246    }
247
248    /**
249     * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs.
250     * 
251     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC>
252     * @param index This is any index within the bounds of the {@code 'html'} parameter.
253     * @param tagCounter Any internally used counter, to optimize the search routine.
254     * 
255     * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs
256     * inside a {@code Vector<DotPair>}
257     * 
258     * @see TagNode
259     * @see HTMLNode
260     * @see DotPair
261     * @see DotPair#isInside(int)
262     * @see Util.Inclusive#dotPairOPT(Vector, int, int)
263     */
264    protected static Vector<DotPair> ALL
265        (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter)
266    {
267        HTMLNode n;     TagNode tn;     DotPair dp;     int size = html.size();
268        Vector<DotPair> ret = new Vector<>();
269
270        for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--)
271
272            if (    (n = html.elementAt(i)).isTagNode()
273                &&  tagCounter.check(tn = (TagNode) n)
274            )
275            {
276                if (    ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null)
277                    &&  dp.isInside(index)
278                )           // isInside(...) Should never fail, but 
279                            // This guarantees to prevent erroneous answers
280                    ret.addElement(dp);
281
282                else
283                    // If finding a token match fails, just ignore that token from now on...
284                    tagCounter.reportFailed(tn.tok);
285
286            }
287
288        return ret;
289    }
290
291
292    // ********************************************************************************************
293    // ********************************************************************************************
294    // Tester, leave it here!  It's not doing you no harm.
295    // ********************************************************************************************
296    // ********************************************************************************************
297
298
299    static void test(String urlStr, String fileName) throws IOException
300    {
301        // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" +
302        //      "Torello/HTML/NodeSearch/CommentNodeCount.html";
303    
304        StringBuilder       sb      = new StringBuilder();
305        URL                 url     = new URL(urlStr);
306        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(url, false);
307
308        int     pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many");
309        DotPair dp  = Surrounding.firstExcept(page, pos, "li", "body", "div");
310
311        sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n");
312        sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n");
313        sb.append(Debug.print(page, dp, Debug::J) + "\n");
314
315        Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div");
316
317        for (DotPair l : allDP) sb.append(
318            BCYAN + 
319            "************************************************************\n" +
320            "************************************************************\n" + RESET +
321            "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" +
322            "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" +
323            "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n"
324        );
325
326        String s = sb.toString();
327        System.out.println(s);
328
329        if (fileName != null)
330            FileRW.writeFile(C.toHTML(s.replace("<", "&lt;").replace(">", "&gt;")), fileName);
331    }
332}