001package Torello.HTML; 002 003import Torello.HTML.NodeSearch.*; 004 005import static Torello.Java.C.*; 006 007import Torello.Java.FileRW; 008import Torello.Java.C; 009import Torello.HTML.parse.HTMLTagCounter; 010 011import java.util.*; 012 013import java.util.function.Predicate; 014import java.net.URL; 015import java.io.IOException; 016 017/** 018 * Class for finding ancestor & parent nodes of any selected {@link HTMLNode}. 019 * 020 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SURROUNDING> 021 */ 022@Torello.JavaDoc.StaticFunctional 023public class Surrounding 024{ 025 private Surrounding() { } 026 027 /** 028 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 029 * with it's closing element - as a {@code DotPair} - that matches. 030 * 031 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 032 * 033 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 034 * Java-Script DOM Tree term). 035 * 036 * @param htmlTags If this list is empty, we shall look for any ancestor node. Since this 037 * method returns the first, if this list is left empty, and the index-node is surrounded by 038 * even a bold "{@code <B>...</B>}" then that will be the {@code DotPair} result that is 039 * returned. If this list is left non-empty, then the only ancestor nodes whose HTML Element 040 * Tag (usually referred to as "the Element") matches a tag from this list shall be returned. 041 * 042 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "div", "p"}, and {@code "a"} were provided as 043 * values to this parameter - <I>the search loop would skip over all ancestors that were not 044 * HTML divider, paragraph or anchor elements</I> before selecting a result. 045 * 046 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 047 * pair). If no matches are found, null will return. This sublist is nearly identical to the 048 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 049 * constructed by this method. 050 * 051 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 052 * vectorized-html parameter {@code 'html'} 053 * 054 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 055 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 056 * 057 * @see #FIRST(Vector, int, HTMLTagCounter) 058 * @see ARGCHECK#index(Vector, int) 059 */ 060 public static DotPair first(Vector<? extends HTMLNode> html, int index, String... htmlTags) 061 { 062 return FIRST( 063 html, ARGCHECK.index(html, index), 064 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.FIRST) 065 ); 066 } 067 068 /** 069 * This will return the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> node - along 070 * with it's closing element - as a {@code DotPair} - that matches the input-parameter 071 * {@code 'htmlTags'} In this case, the term {@code 'except'} shall mean that any matches whose 072 * HTML Token is among the list in parameter {@code String... htmlTags} will be <B>skipped</B>, 073 * and a "higher-level" ancestor will be returned instead. 074 * 075 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 076 * 077 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 078 * Java-Script {@code DOM Tree} term). 079 * 080 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 081 * loop will skip over ancestor nodes that are among the members of this var-args parameter 082 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 083 * loop will return the first anestor node identified. 084 * 085 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to 086 * this method, then the search-loop will continue looking for higher-level ancestors - 087 * <I>until one was found that was not an HTML {@code 'bold'} or {@code 'paragraph'} element 088 * {@code DotPair}.</I> 089 * 090 * @return This shall return the first sub-list, as a {@code 'DotPair'} (start & end index 091 * pair). If no matches are found, null will return. This sublist is nearly identical to the 092 * Java-Script <B STYLE="color: red">DOM Tree</B> concept of ancestor-node, though no trees are 093 * constructed by this method. 094 * 095 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 096 * vectorized-html parameter {@code 'html'} 097 * 098 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 099 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 100 * 101 * @see #FIRST(Vector, int, HTMLTagCounter) 102 * @see ARGCHECK#index(Vector, int) 103 */ 104 public static DotPair firstExcept(Vector<? extends HTMLNode> html, int index, String... htmlTags) 105 { 106 return FIRST( 107 html, ARGCHECK.index(html, index), 108 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.FIRST) 109 ); 110 } 111 112 /** 113 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 114 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 115 * parameter contains any elements, then only those elements shall be considered as match in 116 * the ancestor hierarchy tree. 117 * 118 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 119 * 120 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 121 * Java-Script {@code DOM Tree} term). 122 * 123 * @param htmlTags If this list is empty, we shall look for <I><B>all ancestor nodes.</I></B> 124 * Since this method returns the first ancestor node-pair found, f this list is left non-empty, 125 * then the only ancestor nodes whose HTML Element Tag (usually referred to as "the token") are 126 * members of this varargs {@code String} parameter list shall be considered eligible as a 127 * return result for this method. 128 * 129 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "DIV", "P"}, and {@code "A"} were listed - <I>the 130 * search loop would skip over all ancestors that were not HTML divider, paragraph or anchor 131 * elements</I> before selecting a result. 132 * 133 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 134 * (start & end index pair). If no matches are found, an empty {@code Vector} of 135 * zero-elements shall return. These sublists are nearly identical to the Java-Script 136 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 137 * constructed by this method. 138 * 139 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 140 * vectorized-html parameter {@code 'html'} 141 * 142 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 143 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 144 * 145 * @see #ALL(Vector, int, HTMLTagCounter) 146 * @see ARGCHECK#index(Vector, int) 147 */ 148 public static Vector<DotPair> all(Vector<? extends HTMLNode> html, int index, String... htmlTags) 149 { 150 return ALL( 151 html, ARGCHECK.index(html, index), 152 new HTMLTagCounter(htmlTags, HTMLTagCounter.NORMAL, HTMLTagCounter.ALL) 153 ); 154 } 155 156 /** 157 * This will find all <B><SPAN STYLE="color: red;">ancestors</SPAN></B> of a given index. If 158 * parameter {@code String... htmlTags} is null, all HTML elements will be considered. If this 159 * parameter contains any elements, then those elements <B><I>shall not be considered</B></I> 160 * as a match in the ancestor hierarchy tree. 161 * 162 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 163 * 164 * @param index This is the index of the node for whose "ancestors" we are searching (to use a 165 * Java-Script {@code DOM Tree} term). 166 * 167 * @param htmlTags When this list is non-empty (contains <I>at least one token</I>), the search 168 * loop will skip over ancestor nodes that are among the members of this var-args parameter 169 * list. If this method is invoked <I>and this parameter is an empty list</I>, then the search 170 * loop will return all ancestor nodes of the index node. 171 * 172 * <BR /><BR /><B>FOR INSTANCE:</B> If {@code "B"} and {@code "P"} were passed as parameters to 173 * this method, then the search-loop which is saving all ancestor matches to it's result-set, 174 * would skip over any HTML {@code 'bold'} or {@code 'paragraph'} {@code DotPair's}. 175 * 176 * @return This shall return <I><B>every</I></B> sub-list, as a {@code 'DotPair'} 177 * (start & end index pair). If no matches are found, an empty {@code Vector} of 178 * zero-elements shall return. These sublists are nearly identical to the Java-Script 179 * <B STYLE="color: red">DOM Tree</B> concept of ancestor-nodes, though no trees are 180 * constructed by this method. 181 * 182 * @throws ArrayIndexOutOfBoundsException If index is not within the bounds of the passed 183 * vectorized-html parameter {@code 'html'} 184 * 185 * @throws HTMLTokException If any of the tags passed are null, or not found in the table of 186 * {@code class HTMLTags} - specifically if they are not valid HTML Elements. 187 * 188 * @see #ALL(Vector, int, HTMLTagCounter) 189 * @see ARGCHECK#index(Vector, int) 190 */ 191 public static Vector<DotPair> allExcept 192 (Vector<? extends HTMLNode> html, int index, String... htmlTags) 193 { 194 return ALL( 195 html, ARGCHECK.index(html, index), 196 new HTMLTagCounter(htmlTags, HTMLTagCounter.EXCEPT, HTMLTagCounter.ALL) 197 ); 198 } 199 200 201 // ******************************************************************************************** 202 // ******************************************************************************************** 203 // FIND INTERNAL METHODS 204 // ******************************************************************************************** 205 // ******************************************************************************************** 206 207 208 /** 209 * Finds the first <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding") node pair. 210 * 211 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 212 * @param index This is any index within the bounds of the {@code 'html'} parameter. 213 * @param tagCounter Any internally used counter, to optimize the search routine. 214 * 215 * @return The matching <B STYLE="color: red;">ancestor</B> node's start-and-end index as a 216 * {@code 'DotPair'}. 217 * 218 * @see TagNode 219 * @see HTMLNode 220 * @see DotPair 221 * @see DotPair#isInside(int) 222 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 223 */ 224 protected static DotPair FIRST 225 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 226 { 227 int size = html.size(); 228 TagNode tn; 229 DotPair ret; 230 231 for ( int i=(index-1); 232 (i >= 0) && (! tagCounter.allBanned()); 233 i-- 234 ) 235 236 if ( ((tn = html.elementAt(i).openTag()) != null) 237 && tagCounter.check(tn) 238 && ((ret = Util.Inclusive.dotPairOPT(html, i, size)) != null) 239 && ret.isInside(index) 240 // isInside(...) Should never fail, but 241 ) // This guarantees to prevent erroneous answers 242 243 // If there is a match, return that match, and exit immediately. 244 return ret; 245 246 return null; 247 } 248 249 /** 250 * Finds all <B><SPAN STYLE="color: red;">ancestor</SPAN></B> ("surrounding"} node pairs. 251 * 252 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 253 * @param index This is any index within the bounds of the {@code 'html'} parameter. 254 * @param tagCounter Any internally used counter, to optimize the search routine. 255 * 256 * @return All matching <B STYLE="color: red;">ancestor</B> nodes' start-and-end index pairs 257 * inside a {@code Vector<DotPair>} 258 * 259 * @see TagNode 260 * @see HTMLNode 261 * @see DotPair 262 * @see DotPair#isInside(int) 263 * @see Util.Inclusive#dotPairOPT(Vector, int, int) 264 */ 265 protected static Vector<DotPair> ALL 266 (Vector<? extends HTMLNode> html, int index, HTMLTagCounter tagCounter) 267 { 268 HTMLNode n; TagNode tn; DotPair dp; int size = html.size(); 269 Vector<DotPair> ret = new Vector<>(); 270 271 for (int i=(index-1); (i >= 0) && (! tagCounter.allBanned()); i--) 272 273 if ( (n = html.elementAt(i)).isTagNode() 274 && tagCounter.check(tn = (TagNode) n) 275 ) 276 { 277 if ( ((dp = Util.Inclusive.dotPairOPT(html, i, size)) != null) 278 && dp.isInside(index) 279 ) // isInside(...) Should never fail, but 280 // This guarantees to prevent erroneous answers 281 ret.addElement(dp); 282 283 else 284 // If finding a token match fails, just ignore that token from now on... 285 tagCounter.reportFailed(tn.tok); 286 287 } 288 289 return ret; 290 } 291 292 293 // ******************************************************************************************** 294 // ******************************************************************************************** 295 // Tester, leave it here! It's not doing you no harm. 296 // ******************************************************************************************** 297 // ******************************************************************************************** 298 299 300 static void test(String urlStr, String fileName) throws IOException 301 { 302 // String url = "http://developer.torello.directory/JavaHTML/Version%201/1.4/javadoc/" + 303 // "Torello/HTML/NodeSearch/CommentNodeCount.html"; 304 305 StringBuilder sb = new StringBuilder(); 306 URL url = new URL(urlStr); 307 Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false); 308 309 int pos = TextNodeFind.first(page, TextComparitor.CN_CI, "a count of how many"); 310 DotPair dp = Surrounding.firstExcept(page, pos, "li", "body", "div"); 311 312 sb.append("Text Node Found: [" + page.elementAt(pos) + "]\n"); 313 sb.append("Index Found: " + pos + ", DotPair Found: " + dp.toString() + "\n"); 314 sb.append(Debug.print(page, dp, Debug::J) + "\n"); 315 316 Vector<DotPair> allDP = Surrounding.allExcept(page, pos, "body", "html", "div"); 317 318 for (DotPair l : allDP) sb.append( 319 BCYAN + 320 "************************************************************\n" + 321 "************************************************************\n" + RESET + 322 "Index Found: " + pos + ", DotPair Found: " + l.toString() + "\n" + 323 "Starting Node: " + BRED + page.elementAt(l.start).str + RESET + "\n" + 324 "Ending Node:" + BRED + page.elementAt(l.end).str + RESET + "\n" 325 ); 326 327 String s = sb.toString(); 328 System.out.println(s); 329 330 if (fileName != null) 331 FileRW.writeFile(C.toHTML(s.replace("<", "<").replace(">", ">")), fileName); 332 } 333}