001package Torello.HTML; 002 003import Torello.HTML.NodeSearch.*; 004 005import static Torello.Java.C.*; 006 007import Torello.Java.FileRW; 008import Torello.Java.C; 009 010import java.util.*; 011 012import java.util.function.Predicate; 013import java.net.URL; 014import java.io.IOException; 015 016/** 017 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}. 018 * 019 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING> 020 */ 021@Torello.JavaDoc.StaticFunctional 022public class Surrounding 023{ 024 private Surrounding() { } 025 026 /** 027 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 028 * with it's closing element - as a {@code DotPair} - that matches. 029 * 030 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 031 * 032 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 033 * Java-Script DOM Tree term). 034 * 035 * @param htmlTags If this list is empty, we shall look for any ancestor node. Since this 036 * method returns the first, if this list is left empty, and the index-node is surrounded by 037 * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is 038 * returned. If this list is left non-empty, then the only ancestor nodes whose HTML Element 039 * Tag (usually referred to as "the Element") matches a tag from this list shall be returned. 040 * 041 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as 042 * values to this parameter - <I>the search loop would skip over all ancestors that were not 043 * HTML divider, paragraph or anchor elements</I> before selecting a result. 044 * 045 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 046 * pair). If no matches are found, null will return. This sublist is nearly identical to the 047 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 048 * constructed by this method. 049 * 050 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 051 * vectorized-html parameter {@code 'html'} 052 * 053 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 054 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 055 * 056 * @see #FIRST(Vector, int, HTMLTagCounter) 057 * @see ARGCHECK#index(Vector, int) 058 */ 059 public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags) 060 { 061 return FIRST( 062 html, ARGCHECK.index(html, index), 063 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST) 064 ); 065 } 066 067 /** 068 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 069 * with it's closing element - as a {@code DotPair} - that matches the input-parameter 070 * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose 071 * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>, 072 * and a "higher-level" ancestor will be returned instead. 073 * 074 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 075 * 076 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 077 * Java-Script {@code DOM Tree} term). 078 * 079 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 080 * loop will skip over ancestor nodes that are among the members of this var-args parameter 081 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 082 * loop will return the first anestor node identified. 083 * 084 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to 085 * this method, then the search-loop will continue looking for higher-level ancestors - 086 * <I>until one was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element 087 * {@code DotPair}.</I> 088 * 089 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 090 * pair). If no matches are found, null will return. This sublist is nearly identical to the 091 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 092 * constructed by this method. 093 * 094 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 095 * vectorized-html parameter {@code 'html'} 096 * 097 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 098 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 099 * 100 * @see #FIRST(Vector, int, HTMLTagCounter) 101 * @see ARGCHECK#index(Vector, int) 102 */ 103 public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags) 104 { 105 return FIRST( 106 html, ARGCHECK.index(html, index), 107 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST) 108 ); 109 } 110 111 /** 112 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 113 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 114 * parameter contains any elements, then only those elements shall be considered as match in 115 * the ancestor hierarchy tree. 116 * 117 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 118 * 119 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 120 * Java-Script {@code DOM Tree} term). 121 * 122 * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 123 * Since this method returns the first ancestor node-pair found, f this list is left non-empty, 124 * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are 125 * members of this varargs {@code String} parameter list shall be considered eligible as a 126 * return result for this method. 127 * 128 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the 129 * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor 130 * elements</I> before selecting a result. 131 * 132 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 133 * (start & end index pair). If no matches are found, an empty {@code Vector} of 134 * zero-elements shall return. These sublists are nearly identical to the Java-Script 135 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 136 * constructed by this method. 137 * 138 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 139 * vectorized-html parameter {@code 'html'} 140 * 141 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 142 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 143 * 144 * @see #ALL(Vector, int, HTMLTagCounter) 145 * @see ARGCHECK#index(Vector, int) 146 */ 147 public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags) 148 { 149 return ALL( 150 html, ARGCHECK.index(html, index), 151 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL) 152 ); 153 } 154 155 /** 156 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 157 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 158 * parameter contains any elements, then those elements <B><I>shall not be considered</B></I> 159 * as a match in the ancestor hierarchy tree. 160 * 161 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 162 * 163 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 164 * Java-Script {@code DOM Tree} term). 165 * 166 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 167 * loop will skip over ancestor nodes that are among the members of this var-args parameter 168 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 169 * loop will return all ancestor nodes of the index node. 170 * 171 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to 172 * this method, then the search-loop which is saving all ancestor matches to it's result-set, 173 * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}. 174 * 175 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 176 * (start & end index pair). If no matches are found, an empty {@code Vector} of 177 * zero-elements shall return. These sublists are nearly identical to the Java-Script 178 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 179 * constructed by this method. 180 * 181 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 182 * vectorized-html parameter {@code 'html'} 183 * 184 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 185 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 186 * 187 * @see #ALL(Vector, int, HTMLTagCounter) 188 * @see ARGCHECK#index(Vector, int) 189 */ 190 public static Vector<DotPair> allExcept 191 (Vector<? extends HTMLNode> html, int index, String... htmlTags) 192 { 193 return ALL( 194 html, ARGCHECK.index(html, index), 195 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL) 196 ); 197 } 198 199 200 // ******************************************************************************************** 201 // ******************************************************************************************** 202 // FIND INTERNAL METHODS 203 // ******************************************************************************************** 204 // ******************************************************************************************** 205 206 207 /** 208 * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair. 209 * 210 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 211 * @param index This is any index within the bounds of the {@code 'html'} parameter. 212 * @param tagCounter Any internally used counter, to optimize the search routine. 213 * 214 * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 215 * {@code 'DotPair'}. 216 * 217 * @see TagNode 218 * @see HTMLNode 219 * @see DotPair 220 * @see DotPair#isInside(int) 221 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 222 */ 223 protected static DotPair FIRST 224 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 225 { 226 int size = html.size(); 227 TagNode tn; 228 DotPair ret; 229 230 for ( int i=(index-1); 231 (i >= 0) && (! tagCounter.allBanned()); 232 i-- 233 ) 234 235 if ( ((tn = html.elementAt(i).openTag()) != null) 236 && tagCounter.check(tn) 237 && ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null) 238 && ret.isInside(index) 239 // isInside(...) Should never fail, but 240 ) // This guarantees to prevent erroneous answers 241 242 // If there is a match, return that match, and exit immediately. 243 return ret; 244 245 return null; 246 } 247 248 /** 249 * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs. 250 * 251 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 252 * @param index This is any index within the bounds of the {@code 'html'} parameter. 253 * @param tagCounter Any internally used counter, to optimize the search routine. 254 * 255 * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs 256 * inside a {@code Vector<DotPair>} 257 * 258 * @see TagNode 259 * @see HTMLNode 260 * @see DotPair 261 * @see DotPair#isInside(int) 262 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 263 */ 264 protected static Vector<DotPair> ALL 265 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 266 { 267 HTMLNode n; TagNode tn; DotPair dp; int size = html.size(); 268 Vector<DotPair> ret = new Vector<>(); 269 270 for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--) 271 272 if ( (n = html.elementAt(i)).isTagNode() 273 && tagCounter.check(tn = (TagNode) n) 274 ) 275 { 276 if ( ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null) 277 && dp.isInside(index) 278 ) // isInside(...) Should never fail, but 279 // This guarantees to prevent erroneous answers 280 ret.addElement(dp); 281 282 else 283 // If finding a token match fails, just ignore that token from now on... 284 tagCounter.reportFailed(tn.tok); 285 286 } 287 288 return ret; 289 } 290 291 292 // ******************************************************************************************** 293 // ******************************************************************************************** 294 // Tester, leave it here! It's not doing you no harm. 295 // ******************************************************************************************** 296 // ******************************************************************************************** 297 298 299 static void test(String urlStr, String fileName) throws IOException 300 { 301 // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" + 302 // "Torello/HTML/NodeSearch/CommentNodeCount.html"; 303 304 StringBuilder sb = new StringBuilder(); 305 URL url = new URL(urlStr); 306 Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false); 307 308 int pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many"); 309 DotPair dp = Surrounding.firstExcept(page, pos, "li", "body", "div"); 310 311 sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n"); 312 sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n"); 313 sb.append(Debug.print(page, dp, Debug::J) + "\n"); 314 315 Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div"); 316 317 for (DotPair l : allDP) sb.append( 318 BCYAN + 319 "************************************************************\n" + 320 "************************************************************\n" + RESET + 321 "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" + 322 "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" + 323 "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n" 324 ); 325 326 String s = sb.toString(); 327 System.out.println(s); 328 329 if (fileName != null) 330 FileRW.writeFile(C.toHTML(s.replace("<", "<").replace(">", ">")), fileName); 331 } 332}