001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006 007import java.util.function.Predicate; 008 009import Torello.HTML.NodeSearch.*; 010import Torello.Java.*; 011 012/** 013 * A long list of utilities for searching, finding, extracting and removing HTML from 014 * Vectorized-HTML. 015 * 016 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=UTIL> 017 */ 018@Torello.JavaDoc.StaticFunctional 019public class Util 020{ 021 private Util() { } 022 023 024 // ******************************************************************************************** 025 // ******************************************************************************************** 026 // Trim TextNode Strings 027 // ******************************************************************************************** 028 // ******************************************************************************************** 029 030 031 /** 032 * Convenience Method. 033 * <BR />Invokes: {@link #trimTextNodes(Vector, int, int, boolean)} 034 */ 035 public static int trimTextNodes(Vector<HTMLNode> page, boolean deleteZeroLengthStrings) 036 { return trimTextNodes(page, 0, -1, deleteZeroLengthStrings); } 037 038 /** 039 * Convenience Method. 040 * <BR />Receives: {@code DotPair} 041 * <BR />Invokes: {@link #trimTextNodes(Vector, int, int, boolean)} 042 */ 043 public static int trimTextNodes 044 (Vector<HTMLNode> page, DotPair dp, boolean deleteZeroLengthStrings) 045 { return trimTextNodes(page, dp.start, dp.end + 1, deleteZeroLengthStrings); } 046 047 /** 048 * This will iterate through the entire {@code Vector<HTMLNode>}, and invoke 049 * {@code java.lang.String.trim()} on each {@code TextNode} on the page. If this invocation 050 * results in a reduction of {@code String.length()}, then a new {@code TextNode} will be 051 * instantiated whose {@code TextNode.str} field is set to the result of the 052 * {@code String.trim(old_node.str)} operation. 053 * 054 * @param deleteZeroLengthStrings If a {@code TextNode's} length is zero (before or after 055 * {@code trim()} is called) and when this parameter is {@code TRUE}, that {@code TextNode} 056 * must be removed from the {@code Vector}. 057 * 058 * @return Any node that is trimmed or deleted will increment the counter. This counter 059 * final-value is returned 060 */ 061 public static int trimTextNodes 062 (Vector<HTMLNode> page, int sPos, int ePos, boolean deleteZeroLengthStrings) 063 { 064 int counter = 0; 065 IntStream.Builder b = deleteZeroLengthStrings ? IntStream.builder() : null; 066 HTMLNode n = null; 067 LV l = new LV(page, sPos, ePos); 068 069 for (int i=l.start; i < l.end; i++) 070 071 if ((n = page.elementAt(i)).isTextNode()) 072 { 073 String trimmed = n.str.trim(); 074 int trimmedLength = trimmed.length(); 075 076 if ((trimmedLength == 0) && deleteZeroLengthStrings) 077 { b.add(i); counter++; } 078 079 else if (trimmedLength < n.str.length()) 080 { page.setElementAt(new TextNode(trimmed), i); counter++; } 081 } 082 083 if (deleteZeroLengthStrings) Util.Remove.nodesOPT(page, b.build().toArray()); 084 085 return counter; 086 } 087 088 089 // ******************************************************************************************** 090 // ******************************************************************************************** 091 // Vectorized-HTML To-String Methods 092 // ******************************************************************************************** 093 // ******************************************************************************************** 094 095 096 /** 097 * Convenience Method. 098 * <BR />Invokes: {@link #rangeToString(Vector, int, int)} 099 */ 100 public static String pageToString(Vector<? extends HTMLNode> html) 101 { return rangeToString(html, 0, -1); } 102 103 /** 104 * Convenience Method. 105 * <BR />Receives: {@code DotPair} 106 * <BR />Invokes: {@link #rangeToString(Vector, int, int)} 107 */ 108 public static String rangeToString(Vector<? extends HTMLNode> html, DotPair dp) 109 { return rangeToString(html, dp.start, dp.end + 1); } 110 111 /** 112 * The purpose of this method/function is to convert a portion of the contents of an HTML-Page, 113 * currently being represented as a {@code Vector} of {@code HTMLNode's} into a {@code String.} 114 * Two {@code 'int'} parameters are provided in this method's signature to define a sub-list 115 * of a page to be converted to a {@code java.lang.String} 116 * 117 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 118 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 119 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 120 * 121 * @return The {@code Vector} converted into a {@code String}. 122 * 123 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 124 * 125 * @see #pageToString(Vector) 126 * @see #rangeToString(Vector, DotPair) 127 */ 128 public static String rangeToString(Vector<? extends HTMLNode> html, int sPos, int ePos) 129 { 130 StringBuilder ret = new StringBuilder(); 131 LV l = new LV(html, sPos, ePos); 132 133 for (int i=l.start; i < l.end; i++) ret.append(html.elementAt(i).str); 134 135 return ret.toString(); 136 } 137 138 139 // ******************************************************************************************** 140 // ******************************************************************************************** 141 // Vectorized-HTML TextNode To-String Methods 142 // ******************************************************************************************** 143 // ******************************************************************************************** 144 145 146 /** 147 * Convenience Method. 148 * <BR />Invokes: {@link #textNodesString(Vector, int, int)} 149 */ 150 public static String textNodesString(Vector<? extends HTMLNode> html) 151 { return textNodesString(html, 0, -1); } 152 153 /** 154 * Convenience Method. 155 * <BR />Receives: {@code DotPair} 156 * <BR />Invokes: {@link #textNodesString(Vector, int, int)} 157 */ 158 public static String textNodesString(Vector<? extends HTMLNode> html, DotPair dp) 159 { return textNodesString(html, dp.start, dp.end + 1); } 160 161 /** 162 * This will return a {@code String} that is comprised of ONLY the {@code TextNode's} contained 163 * within the input {@code Vector} - <I>and furthermore, only nodes that are situated between 164 * index {@code int 'sPos'} and index {@code int 'ePos'} in that {@code Vector.}</I> 165 * 166 * <BR /><BR />The {@code for-loop} that iterates the input-{@code Vector} parameter will 167 * simply skip an instance of {@code 'TagNode'} and {@code 'CommentNode'} when building the 168 * output return {@code String.}. 169 * 170 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 171 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 172 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 173 * 174 * @return This will return a {@code String} that is comprised of the text-only elements in the 175 * web-page or sub-page. Only text between the requested {@code Vector}-indices is included. 176 * 177 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 178 * 179 * @see #textNodesString(Vector, DotPair) 180 * @see #textNodesString(Vector) 181 */ 182 public static String textNodesString(Vector<? extends HTMLNode> html, int sPos, int ePos) 183 { 184 StringBuilder sb = new StringBuilder(); 185 LV l = new LV(html, sPos, ePos); 186 HTMLNode n; 187 188 for (int i=l.start; i < l.end; i++) 189 if ((n = html.elementAt(i)).isTextNode()) 190 sb.append(n.str); 191 192 return sb.toString(); 193 } 194 195 196 // ******************************************************************************************** 197 // ******************************************************************************************** 198 // TextNode Modification Operations - "Escape Text Nodes" 199 // ******************************************************************************************** 200 // ******************************************************************************************** 201 202 203 /** 204 * Convenience Method. 205 * <BR />Invokes: {@link #escapeTextNodes(Vector, int, int)} 206 */ 207 public static int escapeTextNodes(Vector<HTMLNode> html) 208 { return escapeTextNodes(html, 0, -1); } 209 210 /** 211 * Convenience Method. 212 * <BR />Receives: {@code DotPair} 213 * <BR />Invokes: {@link #escapeTextNodes(Vector, int, int)} 214 */ 215 public static int escapeTextNodes(Vector<HTMLNode> html, DotPair dp) 216 { return escapeTextNodes(html, dp.start, dp.end + 1); } 217 218 /** 219 * Will call {@code HTML.Escape.replaceAll} on each {@code TextNode} in the range of 220 * {@code sPos ... ePos} 221 * 222 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 223 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 224 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 225 * 226 * @return The number of {@code TextNode's} that changed as a result of the 227 * {@code Escape.replaceAll(n.str)} loop. 228 * 229 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 230 * 231 * @see Escape#replaceAll(String) 232 */ 233 public static int escapeTextNodes(Vector<HTMLNode> html, int sPos, int ePos) 234 { 235 LV l = new LV(html, sPos, ePos); 236 HTMLNode n = null; 237 String s = null; 238 int counter = 0; 239 240 for (int i=l.start; i < l.end; i++) 241 242 if ((n = html.elementAt(i)).isTextNode()) 243 if (! (s = Escape.replace(n.str)).equals(n.str)) 244 { 245 html.setElementAt(new TextNode(s), i); 246 counter++; 247 } 248 249 return counter; 250 } 251 252 253 // ******************************************************************************************** 254 // ******************************************************************************************** 255 // Clone HTML Vectors 256 // ******************************************************************************************** 257 // ******************************************************************************************** 258 259 260 /** 261 * Convenience Method. 262 * <BR />Invokes: {@link #cloneRange(Vector, int, int)} 263 */ 264 public static Vector<HTMLNode> clone(Vector<? extends HTMLNode> html) 265 { return cloneRange(html, 0, -1); } 266 267 /** 268 * Convenience Method. 269 * <BR />Receives: {@code DotPair} 270 * <BR />Invokes: {@link #cloneRange(Vector, int, int)} 271 */ 272 public static Vector<HTMLNode> cloneRange(Vector<? extends HTMLNode> html, DotPair dp) 273 { return cloneRange(html, dp.start, dp.end + 1); } 274 275 /** 276 * Copies (clones!) a sub-range of the HTML page, stores the results in a {@code Vector}, and 277 * returns it. 278 * 279 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 280 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 281 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 282 * 283 * @return The "cloned" (copied) sub-range specified by {@code 'sPos'} and {@code 'ePos'.} 284 * 285 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 286 * 287 * @see #cloneRange(Vector, DotPair) 288 */ 289 public static Vector<HTMLNode> cloneRange(Vector<? extends HTMLNode> html, int sPos, int ePos) 290 { 291 LV l = new LV(html, sPos, ePos); 292 Vector<HTMLNode> ret = new Vector<>(l.size()); 293 294 // Copy the range specified into the return vector 295 // 296 // HOW THIS WAS DONE BEFORE NOTICING Vector.subList 297 // 298 // for (int i = l.start; i < l.end; i++) ret.addElement(html.elementAt(i)); 299 300 ret.addAll(html.subList(l.start, l.end)); 301 302 return ret; 303 } 304 305 306 307 // ******************************************************************************************** 308 // ******************************************************************************************** 309 // String Length of the TextNode's 310 // ******************************************************************************************** 311 // ******************************************************************************************** 312 313 314 /** 315 * Convenience Method. 316 * <BR />Receives: {@code DotPair} 317 * <BR />Invokes: {@link #textStrLength(Vector, int, int)} 318 */ 319 public static int textStrLength(Vector<? extends HTMLNode> html, DotPair dp) 320 { return textStrLength(html, dp.start, dp.end + 1); } 321 322 /** 323 * Convenience Method. 324 * <BR />Invokes: {@link #textStrLength(Vector, int, int)} 325 */ 326 public static int textStrLength(Vector<? extends HTMLNode> html) 327 { return textStrLength(html, 0, -1); } 328 329 /** 330 * This method will return the length of the strings <I><B>contained by all/only instances of 331 * {@code 'TextNode'}</B></I> among the nodes of the input HTML-{@code Vector}. This is 332 * identical to the behavior of the method with the same name, but includes starting and ending 333 * bounds on the html {@code Vector}: {@code 'sPos'} & {@code 'ePos'}. 334 * 335 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 336 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 337 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 338 * 339 * @return The sum of the lengths of the text contained by text-nodes in the {@code Vector} 340 * between {@code 'sPos'} and {@code 'ePos'}. 341 * 342 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 343 */ 344 public static int textStrLength(Vector<? extends HTMLNode> html, int sPos, int ePos) 345 { 346 HTMLNode n; 347 int sum = 0; 348 LV l = new LV(html, sPos, ePos); 349 350 // Counts the length of each "String" in a "TextNode" between sPos and ePos 351 for (int i=l.start; i < l.end; i++) 352 353 if ((n = html.elementAt(i)).isTextNode()) 354 sum += n.str.length(); 355 356 return sum; 357 } 358 359 360 // ******************************************************************************************** 361 // ******************************************************************************************** 362 // Compact Adjacent / Adjoining TextNode's 363 // ******************************************************************************************** 364 // ******************************************************************************************** 365 366 367 /** 368 * Convenience Method. 369 * <BR />Invokes: {@link #compactTextNodes(Vector, int, int)} 370 */ 371 public static int compactTextNodes(Vector<HTMLNode> html) 372 { return compactTextNodes(html, 0, html.size()); } 373 374 /** 375 * Convenience Method. 376 * <BR />Receives: {@code DotPair} 377 * <BR />Invokes: {@link #compactTextNodes(Vector, int, int)} 378 */ 379 public static int compactTextNodes(Vector<HTMLNode> html, DotPair dp) 380 { return compactTextNodes(html, dp.start, dp.end + 1); } 381 382 /** 383 * Occasionally, when removing instances of {@code TagNode} from a vectorized-html 384 * page, certain instances of {@code TextNode} which were not adjacent / neighbours in 385 * the {@code Vector}, all of a sudden become adjacent. Although there are no major problems 386 * with contiguous instances of {@code TextNode} from the Search Algorithm's perspective, 387 * for programmer's, it can sometimes be befuddling to realize that the output text that 388 * is returned from a call to {@code Util.pageToString(html)} is not being found because 389 * the text that is left is broken amongst multiple instances of adjacent TextNodes. 390 * 391 * <BR /><BR />This method merely combines "Adjacent" instances of {@code class TextNode} 392 * in the {@code Vector} into single instances of {@code class TextNode} 393 * 394 * @param html Any vectorized-html web-page. If this page contain any contiguously placed 395 * {@code TextNode's}, the extra's will be eliminated, and the internal-string's inside the 396 * node's ({@code TextNode.str}) will be combined. This action will reduce the size of the 397 * actual html-{@code Vector}. 398 * 399 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 400 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 401 * 402 * @return The number of nodes that were eliminated after being combined, or 0 if there 403 * were no text-nodes that were removed. 404 * 405 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 406 * 407 * @see HTMLNode#str 408 * @see TextNode 409 */ 410 public static int compactTextNodes(Vector<HTMLNode> html, int sPos, int ePos) 411 { 412 LV l = new LV(html, sPos, ePos); 413 boolean compacting = false; 414 int firstPos = -1; 415 int delta = 0; 416 417 for (int i=l.start; i < (l.end - delta); i++) 418 419 if (html.elementAt(i).isTextNode()) 420 { 421 if (compacting) continue; // Not in "Compacting Mode" 422 compacting = true; // Start "Compacting Mode" - this is a TextNode 423 firstPos = i; 424 } 425 426 else if (compacting && (firstPos < (i-1))) // Else - Must be a TagNode or CommentNode 427 { 428 // Save compacted TextNode String's into this StringBuilder 429 StringBuilder compacted = new StringBuilder(); 430 431 // Iterate all TextNodes that were adjacent, put them together into StringBuilder 432 for (int j=firstPos; j < i; j++) compacted.append(html.elementAt(j).str); 433 434 // Place this new "aggregate TextNode" at location of the first TextNode that 435 // was compacted into this StringBuilder 436 437 html.setElementAt(new TextNode(compacted.toString()), firstPos); 438 439 // Remove the rest of the positions in the Vector that had TextNode's. These have 440 // all been put together into the "Aggregate TextNode" at position "firstPos" 441 442 Util.Remove.range(html, firstPos + 1, i); 443 444 // The change in the size of the Vector needs to be accounted for. 445 delta += (i - firstPos - 1); 446 447 // Change the loop-counter variable, too, since the size of the Vector has changed. 448 i = firstPos + 1; 449 450 // Since we just hit a CommentNode, or TagNode, exit "Compacting Mode." 451 compacting = false; 452 453 } 454 455 // NOTE: This, ALSO, MUST BE a TagNode or CommentNode (just like the previous 456 // if-else branch !) 457 // TRICKY: Don't forget this 'else' ! 458 459 else compacting = false; 460 461 // Added - Don't forget the case where the Vector ends with a series of TextNodes 462 // TRICKY TOO! (Same as the HTML Parser... The ending or 'trailing' nodes must be parsed 463 464 int lastNodePos = html.size() - 1; 465 466 if (html.elementAt(lastNodePos).isTextNode()) if (compacting && (firstPos < lastNodePos)) 467 { 468 StringBuilder compacted = new StringBuilder(); 469 470 // Compact the TextNodes that were identified at the end of the Vector range. 471 for (int j=firstPos; j <= lastNodePos; j++) compacted.append(html.elementAt(j).str); 472 473 // Replace the group of TextNode's at the end of the Vector, with the single, aggregate 474 html.setElementAt(new TextNode(compacted.toString()), firstPos); 475 Util.Remove.range(html, firstPos + 1, lastNodePos + 1); 476 } 477 478 return delta; 479 } 480 481 482 // ******************************************************************************************** 483 // ******************************************************************************************** 484 // String-Length Operations 485 // ******************************************************************************************** 486 // ******************************************************************************************** 487 488 489 /** 490 * Convenience Method. 491 * <BR />Invokes: {@link #strLength(Vector, int, int)} 492 */ 493 public static int strLength(Vector<? extends HTMLNode> html) 494 { return strLength(html, 0, -1); } 495 496 /** 497 * Convenience Method. 498 * <BR />Receives: {@code DotPair} 499 * <BR />Invokes: {@link #strLength(Vector, int, int)} 500 */ 501 public static int strLength(Vector<? extends HTMLNode> html, DotPair dp) 502 { return strLength(html, dp.start, dp.end + 1); } 503 504 /** 505 * This method simply adds / sums the {@code String}-length of every {@code HTMLNode.str } 506 * field in the passed page-{@code Vector}. It only counts nodes between parameters 507 * {@code sPos} (inclusive) and {@code ePos} (exclusive). 508 * 509 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 510 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 511 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 512 * 513 * @return The total length <B><I>- in characters -</I></B> of the sub-page of HTML between 514 * {@code 'sPos'} and {@code 'ePos'} 515 * 516 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 517 * 518 * @see #strLength(Vector) 519 */ 520 public static int strLength(Vector<? extends HTMLNode> html, int sPos, int ePos) 521 { 522 int ret = 0; 523 LV l = new LV(html, sPos, ePos); 524 525 for (int i=l.start; i < l.end; i++) ret += html.elementAt(i).str.length(); 526 527 return ret; 528 } 529 530 531 // ******************************************************************************************** 532 // ******************************************************************************************** 533 // Hash-Code Operations 534 // ******************************************************************************************** 535 // ******************************************************************************************** 536 537 538 /** 539 * Convenience Method. 540 * <BR />Invokes: {@link #hashCode(Vector, int, int)} 541 */ 542 public static int hashCode(Vector<? extends HTMLNode> html) 543 { return hashCode(html, 0, -1); } 544 545 /** 546 * Convenience Method. 547 * <BR />Receives: {@code DotPair} 548 * <BR />Invokes: {@link #hashCode(Vector, int, int)} 549 */ 550 public static int hashCode(Vector<? extends HTMLNode> html, DotPair dp) 551 { return hashCode(html, dp.start, dp.end + 1); } 552 553 /** 554 * Generates a hash-code for a vectorized html page-{@code Vector}. 555 * 556 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 557 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 558 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 559 * 560 * @return Returns the {@code String.hashCode()} of the <I><B>partial HTML-page</B></i> as if 561 * it were not being stored as a {@code Vector}, but rather as HTML inside of a 562 * Java-{@code String}. 563 * 564 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 565 * 566 * @see #hashCode(Vector) 567 */ 568 public static int hashCode(Vector<? extends HTMLNode> html, int sPos, int ePos) 569 { 570 int h = 0; 571 LV lv = new LV(html, sPos, ePos); 572 573 for (int j=lv.start; j < lv.end; j++) 574 { 575 String s = html.elementAt(j).str; 576 int l = s.length(); 577 578 // This line has been copied from the jdk8/jdk8 "String.hashCode()" method. 579 // The difference is that it iterates over the entire vector 580 581 for (int i=0; i < l; i++) h = 31 * h + s.charAt(i); 582 } 583 584 return h; 585 } 586 587 588 // ******************************************************************************************** 589 // ******************************************************************************************** 590 // JSON Script Nodes 591 // ******************************************************************************************** 592 // ******************************************************************************************** 593 594 595 /** 596 * Convenience Method. 597 * <BR />Invokes: {@link #getJSONScriptBlocks(Vector, int, int)} 598 */ 599 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html) 600 { return getJSONScriptBlocks(html, 0, -1); } 601 602 /** 603 * Convenience Method. 604 * <BR />Receives: {@code DotPair}. 605 * <BR />Invokes: {@link #getJSONScriptBlocks(Vector, int, int)} 606 */ 607 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html, DotPair dp) 608 { return getJSONScriptBlocks(html, dp.start, dp.end + 1); } 609 610 /** 611 * This method shall search for any and all {@code <SCRIPT TYPE="json">} 612 * <I>JSON TEXT</I> {@code </SCRIPT>} block present in a range of Vectorized HTML. The 613 * search method shall simply look for the toke {@code "JSON"} in the {@code TYPE} attribute 614 * of each and every {@code <SCRIPT> TagNode} that is found on the page. The validity of the 615 * {@code JSON} found within such blocks <I>is not checked for validity, nor is it even 616 * guaranteed to be {@code JSON} data!</I> 617 * 618 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 619 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 620 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 621 * 622 * @return This will return a {@code java.util.stream.Stream<String>} of each of the 623 * {@code JSON} elements present in the specified range of the Vectorized HTML passed to 624 * parameter {@code 'html'}. 625 * 626 * <EMBED CLASS='external-html' DATA-FILE-ID=STRMCNVT> 627 * 628 * @see StrTokCmpr#containsIgnoreCase(String, Predicate, String) 629 * @see Util#rangeToString(Vector, int, int) 630 */ 631 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html, int sPos, int ePos) 632 { 633 // Whenever building lists, it is usually easiest to use a Stream.Builder 634 Stream.Builder<String> b = Stream.builder(); 635 636 // This Predicate simply tests that if the substring "json" (CASE INSENSITIVE) is found 637 // in the TYPE attribute of a <SCRIPT TYPE=...> node, that the token-string is, indeed a 638 // word - not a substring of some other word. For instance: TYPE="json" would PASS, but 639 // TYPE="rajsong" would FAIL - because the token string is not surrounded by white-space 640 641 final Predicate<String> tester = (String s) -> 642 StrTokCmpr.containsIgnoreCase 643 (s, (Character c) -> ! Character.isLetterOrDigit(c), "json"); 644 645 // Find all <SCRIPT> node-blocks whose "TYPE" attribute abides by the tester 646 // String-Predicate named above. 647 648 Vector<DotPair> jsonDPList = InnerTagFindInclusive.all 649 (html, sPos, ePos, "script", "type", tester); 650 651 // Convert each of these DotPair element into a java.lang.String 652 // Add the String to the Stream.Builder<String> 653 654 for (DotPair jsonDP : jsonDPList) 655 if (jsonDP.size() > 2) 656 b.accept(Util.rangeToString(html, jsonDP.start + 1, jsonDP.end)); 657 658 // Build the Stream, and return it. 659 return b.build(); 660 } 661 662 663 // ******************************************************************************************** 664 // ******************************************************************************************** 665 // MISC 666 // ******************************************************************************************** 667 // ******************************************************************************************** 668 669 670 /** 671 * Inserts nodes, and allows a 'varargs' parameter. 672 * 673 * @param html Any HTML Page 674 * 675 * @param pos The position in the original {@code Vector} where the nodes shall be inserted. 676 * 677 * @param nodes A list of nodes to insert. 678 */ 679 public static void insertNodes(Vector<HTMLNode> html, int pos, HTMLNode... nodes) 680 { 681 Vector<HTMLNode> nodesVec = new Vector<>(nodes.length); 682 for (HTMLNode node : nodes) nodesVec.addElement(node); 683 html.addAll(pos, nodesVec); 684 } 685 686 /** 687 * Convenience Method. 688 * <BR />Invokes: {@link #replaceRange(Vector, int, int, Vector)} 689 */ 690 public static void replaceRange 691 (Vector<HTMLNode> page, DotPair range, Vector<HTMLNode> newNodes) 692 { replaceRange(page, range.start, range.end+1, newNodes); } 693 694 /** 695 * Replaces any all and all {@code HTMLNode's} located between the {@code Vector} locations 696 * {@code 'sPos'} (inclusive) and {@code 'ePos'} (exclusive). By exclusive, this means that 697 * the {@code HTMLNode} located at positon {@code 'ePos'} <B><I>will not</I></B> be replaced, 698 * but the one at {@code 'sPos'} <I><B>is replaced</B></I>. 699 * 700 * <BR /><BR />The size of the {@code Vector} will change by {@code newNodes.size() - 701 * (ePos + sPos)}. The contents situated between {@code Vector} location {@code sPos} and 702 * {@code sPos + newNodes.size()} will, indeed, be the contents of the {@code 'newNodes'} 703 * parameter. 704 * 705 * @param page Any Java HTML page, constructed of {@code HTMLNode (TagNode & TextNode)} 706 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 707 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 708 * @param newNodes Any Java HTML page-{@code Vector} of {@code HTMLNode}. 709 * 710 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 711 * 712 * @see #pollRange(Vector, int, int) 713 * @see Remove#range(Vector, int, int) 714 * @see #replaceRange(Vector, DotPair, Vector) 715 */ 716 public static void replaceRange 717 (Vector<HTMLNode> page, int sPos, int ePos, Vector<HTMLNode> newNodes) 718 { 719 // Torello.Java.LV 720 LV l = new LV(sPos, ePos, page); 721 722 int oldSize = ePos - sPos; 723 int newSize = newNodes.size(); 724 int insertPos = sPos; 725 int i = 0; 726 727 while ((i < newSize) && (i < oldSize)) 728 page.setElementAt(newNodes.elementAt(i++), insertPos++); 729 730 731 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 732 // CASE ONE: 733 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 734 735 if (newSize == oldSize) return; 736 737 738 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 739 // CASE TWO: 740 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 741 // 742 // The new Vector is SMALLER than the old sub-range 743 // The rest of the nodes just need to be trashed 744 // 745 // OLD-WAY: (Before realizing what Vector.subList is actually doing) 746 // Util.removeRange(page, insertPos, ePos); 747 748 if (newSize < oldSize) page.subList(insertPos, ePos).clear(); 749 750 751 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 752 // CASE THREE: 753 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 754 // 755 // The new Vector is BIGGER than the old sub-range 756 // There are still more nodes to insert. 757 758 else page.addAll(ePos, newNodes.subList(i, newSize)); 759 } 760 761 /** 762 * Java's {@code java.util.Vector} class does not allow public access to the 763 * {@code removeRange(start, end)} function. It is listed as {@code 'protected'} in Java's 764 * Documentation about the {@code class Vector.} This method upstages that, and performs the 765 * {@code 'Poll'} operation, where the nodes are first removed, stored, and then return as a 766 * function result. 767 * 768 * <BR /><BR /><B CLASS=JDDescLabel>Poll a Range:</B> 769 * 770 * <BR />The nodes that are removed are placed in a separate return {@code Vector}, and 771 * returned as a result to this method. 772 * 773 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 774 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 775 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 776 * 777 * @return A complete list ({@code Vector<HTMLNode>}) of the nodes that were removed. 778 * 779 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 780 * 781 * @see Remove#range(Vector, int, int) 782 * @see Remove#range(Vector, DotPair) 783 * @see #pollRange(Vector, DotPair) 784 */ 785 public static Vector<HTMLNode> pollRange(Vector<? extends HTMLNode> html, int sPos, int ePos) 786 { 787 // The original version of this method is preserved inside comments at the bottom of this 788 // method. Prior to seeing the Sun-Oracle Docs explaining that the return from the SubList 789 // operation "mirrors changes" back to to the original vector, the code in the comments is 790 // how this method was accomplished. 791 792 LV l = new LV(html, sPos, ePos); 793 Vector<HTMLNode> ret = new Vector<HTMLNode>(l.end - l.start); 794 List<? extends HTMLNode> list = html.subList(l.start, l.end); 795 796 // Copy the Nodes into the return Vector that the end-user receives 797 ret.addAll(list); 798 799 // Clear the nodes out of the original Vector. The Sun-Oracle Docs 800 // state that the returned sub-list is "mirrored back into" the original 801 802 list.clear(); 803 804 // Return the Vector to the user. Note that the List<HTMLNode> CANNOT be returned, 805 // because of it's mirror-qualities, and because this method expects a vector. 806 807 return ret; 808 809 /* 810 // BEFORE READING ABOUT Vector.subList(...), this is how this was accomplished: 811 // NOTE: It isn't so clear how the List<HTMLNode> works - likely it doesn't actually 812 // create any new memory-allocated arrays, it is just an "overlay" 813 814 // Copy the elements from the input vector into the return vector 815 for (int i=l.start; i < l.end; i++) ret.add(html.elementAt(i)); 816 817 // Remove the range from the input vector (this is the meaning of 'poll') 818 Util.removeRange(html, sPos, ePos); 819 820 return ret; 821 */ 822 } 823 824 /** 825 * Convenience Method. 826 * <BR />Receives: {@code DotPair} 827 * <BR />Invokes: {@link #pollRange(Vector, int, int)}. 828 */ 829 public static Vector<HTMLNode> pollRange(Vector<? extends HTMLNode> html, DotPair dp) 830 { return pollRange(html, dp.start, dp.end + 1); } 831 832 /** 833 * This removes every element from the {@code Vector} beginning at position 0, all the way to 834 * position {@code 'pos'} (exclusive). The {@code elementAt(pos)} remains in the original page 835 * input-{@code Vector}. This is the definition of 'exclusive'. 836 * 837 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 838 * 839 * @param pos Any position within the range of the input {@code Vector}. 840 * 841 * @return The elements in the {@code Vector} from position: {@code 0 ('zero')} all the way to 842 * position: {@code 'pos'} 843 */ 844 public static Vector<HTMLNode> split(Vector<? extends HTMLNode> html, int pos) 845 { return pollRange(html, 0, pos); } 846 847 848 // ******************************************************************************************** 849 // ******************************************************************************************** 850 // Static Inner-Class: Count 851 // ******************************************************************************************** 852 // ******************************************************************************************** 853 854 855 @Torello.JavaDoc.StaticFunctional 856 public static class Count 857 { 858 private Count() { } 859 860 861 // **************************************************************************************** 862 // **************************************************************************************** 863 // Count TextNode's 864 // **************************************************************************************** 865 // **************************************************************************************** 866 867 868 /** 869 * Convenience Method. 870 * <BR />Invokes: {@link #textNodes(Vector, int, int)} 871 */ 872 public static int textNodes(Vector<HTMLNode> page) 873 { return textNodes(page, 0, -1); } 874 875 /** 876 * Convenience Method. 877 * <BR />Receives: {@code DotPair} 878 * <BR />Invokes: {@link #textNodes(Vector, int, int)} 879 */ 880 public static int textNodes(Vector<HTMLNode> page, DotPair dp) 881 { return textNodes(page, dp.start, dp.end + 1); } 882 883 /** 884 * Counts the number of {@code TextNode's} in a {@code Vector<HTMLNode>} between the 885 * demarcated array / {@code Vector} positions, {@code 'sPos'} and {@code 'ePos'} 886 * 887 * @param page Any HTML page. 888 * 889 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 890 * 891 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 892 * 893 * @return The number of {@code TextNode's} in the {@code Vector} between the demarcated 894 * indices. 895 * 896 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 897 */ 898 public static int textNodes(Vector<HTMLNode> page, int sPos, int ePos) 899 { 900 int counter = 0; 901 LV l = new LV(page, sPos, ePos); 902 903 // Iterates the entire page between sPos and ePos, incrementing the count for every 904 // instance of text-node. 905 906 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTextNode()) counter++; 907 908 return counter; 909 } 910 911 912 // **************************************************************************************** 913 // **************************************************************************************** 914 // Count CommentNode's 915 // **************************************************************************************** 916 // **************************************************************************************** 917 918 919 /** 920 * Convenience Method. 921 * <BR />Invokes: {@link #commentNodes(Vector, int, int)} 922 */ 923 public static int commentNodes(Vector<HTMLNode> page) 924 { return commentNodes(page, 0, -1); } 925 926 /** 927 * Convenience Method. 928 * <BR />Receives: {@code DotPair} 929 * <BR />Invokes: {@link #commentNodes(Vector, int, int)} 930 */ 931 public static int commentNodes(Vector<HTMLNode> page, DotPair dp) 932 { return commentNodes(page, dp.start, dp.end + 1); } 933 934 /** 935 * Counts the number of {@code CommentNode's} in an {@code Vector<HTMLNode>} between the 936 * demarcated array / {@code Vector} positions. 937 * 938 * @param page Any HTML page. 939 * 940 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 941 * 942 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 943 * 944 * @return The number of {@code CommentNode's} in the {@code Vector} between the demarcated 945 * indices. 946 * 947 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 948 */ 949 public static int commentNodes(Vector<HTMLNode> page, int sPos, int ePos) 950 { 951 int counter = 0; 952 LV l = new LV(page, sPos, ePos); 953 954 // Iterates the entire page between sPos and ePos, incrementing the count for every 955 // instance of comment-node. 956 957 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isCommentNode()) counter++; 958 959 return counter; 960 } 961 962 963 // **************************************************************************************** 964 // **************************************************************************************** 965 // Count TagNode's 966 // **************************************************************************************** 967 // **************************************************************************************** 968 969 970 /** 971 * Convenience Method. 972 * <BR />Invokes: {@link #tagNodes(Vector, int, int)} 973 */ 974 public static int tagNodes(Vector<HTMLNode> page) 975 { return tagNodes(page, 0, -1); } 976 977 /** 978 * Convenience Method. 979 * <BR />Receives: {@code DotPair} 980 * <BR />Invokes: {@link #tagNodes(Vector, int, int)} 981 */ 982 public static int tagNodes(Vector<HTMLNode> page, DotPair dp) 983 { return tagNodes(page, dp.start, dp.end + 1); } 984 985 /** 986 * Counts the number of {@code TagNode's} in a {@code Vector<HTMLNode>} between the 987 * demarcated array / {@code Vector} positions. 988 * 989 * @param page Any HTML page. 990 * 991 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 992 * 993 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 994 * 995 * @return The number of {@code TagNode's} in the {@code Vector}. 996 * 997 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 998 */ 999 public static int tagNodes(Vector<HTMLNode> page, int sPos, int ePos) 1000 { 1001 int counter = 0; 1002 LV l = new LV(page, sPos, ePos); 1003 1004 // Iterates the entire page between sPos and ePos, incrementing the count for every 1005 // instance of TagNode. 1006 1007 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTagNode()) counter++; 1008 1009 return counter; 1010 } 1011 1012 1013 // **************************************************************************************** 1014 // **************************************************************************************** 1015 // Count New Lines 1016 // **************************************************************************************** 1017 // **************************************************************************************** 1018 1019 1020 /** 1021 * Convenience Method. 1022 * <BR />Invokes: {@link #newLines(Vector, int, int)} 1023 */ 1024 public static int newLines(Vector<? extends HTMLNode> html) 1025 { return newLines(html, 0, -1); } 1026 1027 /** 1028 * Convenience Method. 1029 * <BR />Receives: {@code DotPair} 1030 * <BR />Invokes: {@link #newLines(Vector, int, int)} 1031 */ 1032 public static int newLines(Vector<? extends HTMLNode> html, DotPair dp) 1033 { return newLines(html, dp.start, dp.end + 1); } 1034 1035 1036 /** 1037 * This will count the number of new-line symbols present <B><I>- on the partial HTML 1038 * page</I></B>. The count will include a sum of every {@code HTMLNode.str} that 1039 * contains the standard new-line symbols: {@code \r\n, \r, \n}, meaning that UNIX, MSFT, 1040 * Apple, etc. forms of text-line rendering should all be treated equally. 1041 * 1042 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1043 * 1044 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1045 * 1046 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1047 * 1048 * @return The number of new-line characters in all of the {@code HTMLNode's} that occur 1049 * between vectorized-page positions {@code 'sPos'} and {@code 'ePos.'} 1050 * 1051 * <BR /><BR /><B>NOTE:</B> The regular-expression used here 'NEWLINEP' is as follows: 1052 * 1053 * <DIV CLASS="SNIP">{@code 1054 * private static final Pattern NEWLINEP = Pattern.compile("\\r\\n|\\r|\\n"); 1055 * }</DIV> 1056 * 1057 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1058 * 1059 * @see StringParse#NEWLINEP 1060 */ 1061 public static int newLines(Vector<? extends HTMLNode> html, int sPos, int ePos) 1062 { 1063 int newLineCount = 0; 1064 LV l = new LV(html, sPos, ePos); 1065 1066 for (int i=l.start; i < l.end; i++) 1067 1068 // Uses the Torello.Java.StringParse "New Line RegEx" 1069 for ( Matcher m = StringParse.NEWLINEP.matcher(html.elementAt(i).str); 1070 m.find(); 1071 newLineCount++); 1072 1073 return newLineCount; 1074 } 1075 } 1076 1077 1078 // ******************************************************************************************** 1079 // ******************************************************************************************** 1080 // Static Inner-Class: Remove 1081 // ******************************************************************************************** 1082 // ******************************************************************************************** 1083 1084 1085 @Torello.JavaDoc.StaticFunctional 1086 public static class Remove 1087 { 1088 private Remove() { } 1089 1090 1091 // **************************************************************************************** 1092 // **************************************************************************************** 1093 // TextNode Removal Operations 1094 // **************************************************************************************** 1095 // **************************************************************************************** 1096 1097 1098 /** 1099 * Convenience Method. 1100 * <BR />Invokes: {@link #allTextNodes(Vector, int, int)} 1101 */ 1102 public static int allTextNodes(Vector<HTMLNode> page) 1103 { return allTextNodes(page, 0, -1); } 1104 1105 /** 1106 * Convenience Method. 1107 * <BR />Receives: {@code DotPair} 1108 * <BR />Invokes: {@link #allTextNodes(Vector, int, int)} 1109 */ 1110 public static int allTextNodes(Vector<HTMLNode> page, DotPair dp) 1111 { return allTextNodes(page, dp.start, dp.end + 1); } 1112 1113 /** 1114 * Takes a sub-section of an HTML {@code Vector} and removes all {@code TextNode} present 1115 * 1116 * @param page Any HTML page 1117 * 1118 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1119 * 1120 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1121 * 1122 * @return The number of HTML {@code TextNode's} that were removed 1123 * 1124 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1125 * 1126 * @see TextNode 1127 * @see #nodesOPT(Vector, int[]) 1128 */ 1129 public static int allTextNodes(Vector<HTMLNode> page, int sPos, int ePos) 1130 { 1131 IntStream.Builder b = IntStream.builder(); 1132 LV l = new LV(page, sPos, ePos); 1133 1134 // Use Java-Streams to build the list of nodes that are valid text-nodes. 1135 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTextNode()) b.add(i); 1136 1137 // Build the stream and convert it to an int[] (integer-array) 1138 int[] posArr = b.build().toArray(); 1139 1140 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1141 nodesOPT(page, posArr); 1142 1143 return posArr.length; 1144 } 1145 1146 1147 // **************************************************************************************** 1148 // **************************************************************************************** 1149 // TagNode Removal Operations 1150 // **************************************************************************************** 1151 // **************************************************************************************** 1152 1153 1154 /** 1155 * Convenience Method. 1156 * <BR />Invokes: {@link #allTagNodes(Vector, int, int)} 1157 */ 1158 public static int allTagNodes(Vector<HTMLNode> page) 1159 { return allTagNodes(page, 0, -1); } 1160 1161 /** 1162 * Convenience Method. 1163 * <BR />Receives: {@code DotPair} 1164 * <BR />Invokes: {@link #allTagNodes(Vector, int, int)} 1165 */ 1166 public static int allTagNodes(Vector<HTMLNode> page, DotPair dp) 1167 { return allTagNodes(page, dp.start, dp.end + 1); } 1168 1169 /** 1170 * Takes a sub-section of an HTML {@code Vector} and removes all {@code TagNode} present 1171 * 1172 * @param page Any HTML page 1173 * 1174 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1175 * 1176 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1177 * 1178 * @return The number of HTML {@code TagNode's} that were removed 1179 * 1180 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1181 * 1182 * @see TagNode 1183 * @see #nodesOPT(Vector, int[]) 1184 */ 1185 public static int allTagNodes(Vector<HTMLNode> page, int sPos, int ePos) 1186 { 1187 IntStream.Builder b = IntStream.builder(); 1188 LV l = new LV(page, sPos, ePos); 1189 1190 // Use Java-Streams to build the list of nodes that are valid tag-nodes. 1191 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTagNode()) b.add(i); 1192 1193 // Build the stream and convert it to an int[] (integer-array) 1194 int[] posArr = b.build().toArray(); 1195 1196 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1197 nodesOPT(page, posArr); 1198 1199 return posArr.length; 1200 } 1201 1202 1203 // **************************************************************************************** 1204 // **************************************************************************************** 1205 // CommentNode Removal Operations 1206 // **************************************************************************************** 1207 // **************************************************************************************** 1208 1209 1210 /** 1211 * Convenience Method. 1212 * <BR />Invokes: {@link #allCommentNodes(Vector, int, int)} 1213 */ 1214 public static int allCommentNodes(Vector<HTMLNode> page) 1215 { return allCommentNodes(page, 0, -1); } 1216 1217 /** 1218 * Convenience Method. 1219 * <BR />Receives: {@code DotPair} 1220 * <BR />Invokes: {@link #allCommentNodes(Vector, int, int)} 1221 */ 1222 public static int allCommentNodes(Vector<HTMLNode> page, DotPair dp) 1223 { return allCommentNodes(page, dp.start, dp.end + 1); } 1224 1225 /** 1226 * Takes a sub-section of an HTML {@code Vector} and removes all {@code CommentNode} 1227 * present 1228 * 1229 * @param page Any HTML page 1230 * 1231 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1232 * 1233 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1234 * 1235 * @return The number of HTML {@code CommentNode's} that were removed 1236 * 1237 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1238 * 1239 * @see CommentNode 1240 * @see #nodesOPT(Vector, int[]) 1241 */ 1242 public static int allCommentNodes(Vector<HTMLNode> page, int sPos, int ePos) 1243 { 1244 IntStream.Builder b = IntStream.builder(); 1245 LV l = new LV(page, sPos, ePos); 1246 1247 // Use Java-Streams to build the list of nodes that are valid comment-nodes. 1248 for (int i=l.start; i < l.end; i++) 1249 if (page.elementAt(i).isCommentNode()) 1250 b.add(i); 1251 1252 // Build the stream and convert it to an int[] (integer-array) 1253 int[] posArr = b.build().toArray(); 1254 1255 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1256 nodesOPT(page, posArr); 1257 1258 return posArr.length; 1259 } 1260 1261 1262 // **************************************************************************************** 1263 // **************************************************************************************** 1264 // Remove All Inner Tags 1265 // **************************************************************************************** 1266 // **************************************************************************************** 1267 1268 1269 /** 1270 * Convenience Method. 1271 * <BR />Invokes: {@link #allInnerTags(Vector, int, int)} 1272 */ 1273 public static int allInnerTags(Vector<HTMLNode> html) 1274 { return allInnerTags(html, 0, -1); } 1275 1276 /** 1277 * Convenience Method. 1278 * <BR />Receives: {@code DotPair} 1279 * <BR />Invokes: {@link #allInnerTags(Vector, int, int)} 1280 */ 1281 public static int allInnerTags(Vector<? super TagNode> html, DotPair dp) 1282 { return allInnerTags(html, dp.start, dp.end + 1); } 1283 1284 /** 1285 * This method removes all inner-tags (all attributes) from every {@link TagNode} inside of 1286 * an HTML page. It does this by replacing every {@code TagNode} in the {@code Vector} 1287 * with the pre-instantiated, publicly-available {@code TagNode} which can be obtained by a 1288 * call to the class {@code HTMLTags.hasTag(token, TC)}. 1289 * 1290 * <BR /><BR /><B CLASS=JDDescLabel>Replacing {@code TagNode's:}</B> 1291 * 1292 * <BR />This method determines whether a fresh {@link TagNode} is to be inserted by 1293 * measuring the length of the internal {@link TagNode#str} field (a {@code String} field). 1294 * If the length {@code TagNode.str} is not equal to the HTML token {@link TagNode#tok} 1295 * length <B><I>plus 2</I></B>, then a fresh, pre-instantiated, node is replaced. 1296 * 1297 * <BR /><BR />The {@code '+2'} figure comes from the additional characters {@code '<'} and 1298 * {@code '>'} that start and end every HTML {@code TagNode} 1299 * 1300 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1301 * 1302 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1303 * 1304 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1305 * 1306 * @return The number of {@code TagNode} elements that have were replaced with 1307 * zero-attribute HTML Element Tags. 1308 * 1309 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1310 * 1311 * @throws ClassCastException If {@code 'html'} contains references that do not inherit 1312 * {@code HTMLNode}. 1313 */ 1314 @SuppressWarnings("unchecked") 1315 public static int allInnerTags(Vector<? super TagNode> html, int sPos, int ePos) 1316 { 1317 int ret = 0; 1318 LV l = new LV(sPos, ePos, html); 1319 TagNode tn; 1320 1321 for (int i = (l.end-1); i >= l.start; i--) 1322 1323 if ((tn = ((HTMLNode) html.elementAt(i)).openTagPWA()) != null) 1324 1325 { 1326 ret++; 1327 1328 // HTMLTags.hasTag(tok, TC) gets an empty and pre-instantiated TagNode, 1329 // where TagNode.tok == 'tn.tok' and TagNode.isClosing = false 1330 1331 html.setElementAt(HTMLTags.hasTag(tn.tok, TC.OpeningTags), i); 1332 } 1333 1334 return ret; 1335 } 1336 1337 1338 // **************************************************************************************** 1339 // **************************************************************************************** 1340 // Style-Node & Script-Node Block Removal Operations 1341 // **************************************************************************************** 1342 // **************************************************************************************** 1343 1344 1345 /** 1346 * Removes all HTML {@code 'style'} Node blocks. 1347 * 1348 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1349 * 1350 * @return The number of {@code <STYLE>}-Node Blocks that were removed 1351 */ 1352 public static int styleNodeBlocks(Vector<? extends HTMLNode> html) 1353 { 1354 int removeCount = 0; 1355 1356 while (TagNodeRemoveInclusive.first(html, "style") > 0) removeCount++; 1357 1358 return removeCount; 1359 } 1360 1361 /** 1362 * Removes all {@code 'script'} Node blocks. 1363 * 1364 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1365 * 1366 * @return The number of {@code SCRIPT}-Node Blocks that were removed 1367 */ 1368 public static int scriptNodeBlocks(Vector<? extends HTMLNode> html) 1369 { 1370 int removeCount = 0; 1371 1372 while (TagNodeRemoveInclusive.first(html, "script") > 0) removeCount++; 1373 1374 return removeCount; 1375 } 1376 1377 1378 // **************************************************************************************** 1379 // **************************************************************************************** 1380 // Remove a Sub-Range of nodes 1381 // **************************************************************************************** 1382 // **************************************************************************************** 1383 1384 1385 /** 1386 * Java's {@code java.util.Vector} class does not allow public access to the 1387 * {@code removeRange(start, end)} function. It is protected in Java's Documentation about 1388 * the {@code Vector} class. This method does exactly that, nothing else. 1389 * 1390 * @param page Any Java HTML page, constructed of {@code HTMLNode (TagNode & TextNode)} 1391 * 1392 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1393 * 1394 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1395 * 1396 * @return the number of nodes removed. 1397 * 1398 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1399 * 1400 * @see #pollRange(Vector, int, int) 1401 * @see #range(Vector, DotPair) 1402 */ 1403 public static <T extends HTMLNode> int range(Vector<T> page, int sPos, int ePos) 1404 { 1405 // Torello.Java.LV 1406 LV l = new LV(sPos, ePos, page); 1407 1408 // According to the Sun-Oracle Docs, the returned sublist "mirros" the original vector, 1409 // which means that when it is changed, so is the original vector. 1410 1411 page.subList(l.start, l.end).clear(); 1412 1413 return l.size(); 1414 } 1415 1416 /** 1417 * Convenience Method. 1418 * <BR />Receives: {@code DotPair} 1419 * <BR />Invokes: {@link #range(Vector, int, int)} 1420 */ 1421 public static int range(Vector<? extends HTMLNode> html, DotPair dp) 1422 { return range(html, dp.start, dp.end + 1); } 1423 1424 1425 // **************************************************************************************** 1426 // **************************************************************************************** 1427 // Remove Specified Nodes by Vector-Index 1428 // **************************************************************************************** 1429 // **************************************************************************************** 1430 1431 1432 /** 1433 * <SPAN STYLE="color: red;"><B>OPT: Optimized</B></SPAN> 1434 * 1435 * <BR /><BR />This method does the same thing as 1436 * {@link Remove#nodes(boolean, Vector, int[])}, but all error checking is skipped, and the 1437 * input integer array is presumed to have been sorted. There are no guarantees about the 1438 * behavior of this method if the input array {@code 'posArr'} is not sorted, 1439 * <I>least-to-greatest,</I> or if there are duplicate or negative values in this array. 1440 * 1441 * <BR /><BR /><B CLASS=JDDescLabel>Empty Var-Args:</B> 1442 * 1443 * <BR />If the var-args input integer-array parameter is empty, this method shall exit 1444 * gracefully (and immediately). 1445 * 1446 * @param page Any HTML-Page, usually ones generated by {@code HTMLPage.getPageTokens}, but 1447 * these may be obtained or created in any fashion so necessary. 1448 * 1449 * @param posArr An array of integers which list/identify the nodes in the page to be 1450 * removed. Because this implementation has been optimized, no error checking will be 1451 * performed on this input. It is presumed to be sorted, least-to-greatest, and that all 1452 * values in the array are valid-indices into the vectorized-html parameter {@code 'page'} 1453 */ 1454 public static <T extends HTMLNode> void nodesOPT(Vector<T> page, int... posArr) 1455 { 1456 if (posArr.length == 0) return; 1457 1458 int endingInsertPos = page.size() - posArr.length; 1459 int posArrIndex = 0; 1460 int insertPos = posArr[0]; 1461 int retrievePos = posArr[0]; 1462 1463 // There is very little that can be documented about these two loops. Took 3 hours 1464 // to figure out. Read the variables names for "best documentation" 1465 1466 while (insertPos < endingInsertPos) 1467 { 1468 // This inner-loop is necessary for when the posArr has consecutive-elements that 1469 // are *ALSO* consecutive-pointers. 1470 // 1471 // For instance, this invokation: 1472 // Util.removeNodes(page, 4, 5, 6); ... 1473 // where 4, 5, and 6 are consecutive - the inner while-loop is required. 1474 // 1475 // For this invokation: 1476 // Util.removeNodes(page, 2, 4, 6); 1477 // the inner-loop is not entered. 1478 1479 while ((posArrIndex < posArr.length) && (retrievePos == posArr[posArrIndex])) 1480 { retrievePos++; posArrIndex++; } 1481 1482 page.setElementAt(page.elementAt(retrievePos++), insertPos++); 1483 } 1484 1485 // Remove all remaining elements in the tail of the array. 1486 page.setSize(page.size() - posArr.length); 1487 } 1488 1489 1490 /** 1491 * This method remove each HTMLNode from the passed-parameter {@code 'page'} 1492 * listed/identified by the input array {@code 'nodeList'}. 1493 * 1494 * <BR /><BR /><B CLASS=JDDescLabel>Empty Var-Args:</B> 1495 * 1496 * <BR />If the var-args input integer-array parameter is empty, this method shall exit 1497 * gracefully (and immediately). 1498 * 1499 * @param preserveInputArray This is a convenience input parameter that allows a programmer 1500 * to "preserve" the original input-parameter integer-array that is passed to this method. 1501 * It could be argued this parameter is "superfluous" - however, keep in mind that the 1502 * passed parameter {@code 'nodeList'} <B><I>must be sorted</I></B> before this method is 1503 * able function properly. There is a sort that's performed within the body of this method. 1504 * Just in case that the original order of the integer-array input-parameter must be 1505 * preserved, its possible to request for the sort to operate on "a clone" of the 1506 * input-parameter integer-array, instead of the original integer-array {@code 'nodeList'} 1507 * itself. 1508 * 1509 * @param page Any HTML-Page, usually ones generated by 1510 * {@code HTMLPage.getPageTokens(...)}, but these may be obtained or created in any fashion 1511 * so necessary. 1512 * 1513 * @param nodeList An array of integers which list/identify the nodes in the page to be 1514 * removed. 1515 * 1516 * @throws IllegalArgumentException If the {@code 'nodeList'} contains duplicate entries. 1517 * Obviously, no {@code HTMLNode} may be removed from the {@code Vector<HTMLNode>} more 1518 * than once. 1519 * 1520 * @throws IndexOutOfBoundsException If the nodeList contains index-pointers / items that 1521 * are not within the bounds of the passed HTML-Page {@code Vector}. 1522 */ 1523 public static <T extends HTMLNode> void nodes 1524 (boolean preserveInputArray, Vector<T> page, int... nodeList) 1525 { 1526 if (nodeList.length == 0) return; 1527 1528 // @Safe Var Args 1529 int[] posArr = preserveInputArray ? nodeList.clone() : nodeList; 1530 int len = posArr.length; 1531 1532 Arrays.sort(posArr); 1533 1534 // Check for duplicates in the nodeList, no HTMLNode may be removed twice! 1535 for (int i=0; i < (len - 1); i++) 1536 1537 if (posArr[i] == posArr[i+1]) throw new IllegalArgumentException( 1538 "The input array contains duplicate items, this is not allowed.\n" + 1539 "This is since each array-entry is intended to be a pointer/index for items " + 1540 "to be removed.\nNo item can possibly be removed twice.!" 1541 ); 1542 1543 // Make sure all nodes are within the bounds of the original Vector. (no negative 1544 // indexes, no indexes greater than the size of the Vector) 1545 1546 if ((posArr[0] < 0) || (posArr[len - 1] >= page.size())) 1547 1548 throw new IndexOutOfBoundsException ( 1549 "The input array contains entries which are not within the bounds of the " + 1550 "original-passed Vector.\nHTMLPage Vector has: " + page.size() + 1551 " elements.\n" + 1552 "Maximum element in the nodeList is [" + posArr[len - 1] + "], and the " + 1553 "minimum element is: [" + posArr[0] + "]" 1554 ); 1555 1556 int endingInsertPos = page.size() - posArr.length; 1557 int posArrIndex = 0; 1558 int insertPos = posArr[0]; 1559 int retrievePos = posArr[0]; 1560 1561 // There is very little that can be documented about these two loops. Took 3 hours 1562 // to figure out. Read the variables names for "best documentation" 1563 1564 while (insertPos < endingInsertPos) 1565 { 1566 // This inner-loop is necessary for when the posArr has consecutive-elements that 1567 // are *ALSO* consecutive-pointers. 1568 // 1569 // For instance, this invocation: 1570 // Util.removeNodes(page, 4, 5, 6); 1571 // where 4, 5, and 6 are consecutive - the inner while-loop is required. 1572 // 1573 // For this invocation: 1574 // Util.removeNodes(page, 2, 4, 6); 1575 // the inner-loop is not entered. 1576 1577 while ((posArrIndex < posArr.length) && (retrievePos == posArr[posArrIndex])) 1578 { retrievePos++; posArrIndex++; } 1579 1580 page.setElementAt(page.elementAt(retrievePos++), insertPos++); 1581 } 1582 1583 // Remove all remaining elements in the tail of the array. 1584 page.setSize(page.size() - posArr.length); 1585 } 1586 1587 1588 // **************************************************************************************** 1589 // **************************************************************************************** 1590 // Inclusive-Empty Removal Operations 1591 // **************************************************************************************** 1592 // **************************************************************************************** 1593 1594 1595 /** 1596 * Convenience Method. 1597 * <BR />Invokes: {@link #inclusiveEmpty(Vector, int, int, String[])} 1598 */ 1599 public static int inclusiveEmpty(Vector<HTMLNode> page, String... htmlTags) 1600 { return inclusiveEmpty(page, 0, -1, htmlTags); } 1601 1602 /** 1603 * Convenience Method. 1604 * <BR />Receives: {@code DotPair} 1605 * <BR />Invokes: {@link #inclusiveEmpty(Vector, int, int, String[])} 1606 */ 1607 public static int inclusiveEmpty(Vector<HTMLNode> page, DotPair dp, String... htmlTags) 1608 { return inclusiveEmpty(page, dp.start, dp.end + 1, htmlTags); } 1609 1610 /** 1611 * This will do an "Inclusive Search" using the standard class 1612 * {@link TagNodeInclusiveIterator} in the {@code package NodeSearch}. Then it will 1613 * inspect the contents of the subsections. Any subsections that do not contain any 1614 * instances of {@code HTMLNode} in between them, or any subsections that only contain 1615 * "blank-text" (white-space) between them shall be removed. 1616 * 1617 * <BR /><BR /><B CLASS=JDDescLabel>Recursive Method:</B> 1618 * 1619 * <BR />The search logic shall perform multiple <I><B>recursive iterations</B></I> of 1620 * itself, such that if, for instance, the user requested that all empty HTML divider 1621 * ({@code <DIV>}) elements be removed, if after removing a set a dividers resulted in more 1622 * empty ones (nested {@code <DIV>} elements), then an additional removal shall be called. 1623 * <I>This recursion shall continue until there are no empty HTML elements of the types 1624 * listed by</I> {@code 'htmlTags'} 1625 * 1626 * @param page Any vectorized-html page or sub-page. 1627 * 1628 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1629 * 1630 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1631 * 1632 * @param htmlTags The list of <I>inclusive</I> (non-singleton) html elements to search for 1633 * possibly being empty container tags. 1634 * 1635 * @return The number of {@code HTMLNode's} that were removed. 1636 * 1637 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1638 */ 1639 public static int inclusiveEmpty 1640 (Vector<HTMLNode> page, int sPos, int ePos, String... htmlTags) 1641 { 1642 DotPair subList; 1643 1644 int removed = 0; 1645 HNLIInclusive iter = TagNodeInclusiveIterator.iter(page, htmlTags); 1646 LV l = new LV(page, sPos, ePos); 1647 1648 iter.restrictCursor(l); 1649 1650 TOP: 1651 while (iter.hasNext()) 1652 1653 // If there is only the opening & closing pair, with nothing in between, 1654 // then the pair must be removed because it is "Empty" (Inclusive Empty) 1655 1656 if ((subList = iter.nextDotPair()).size() == 2) 1657 { 1658 iter.remove(); 1659 ePos -= subList.size(); 1660 removed += subList.size(); 1661 } 1662 1663 else 1664 { 1665 // If there is any TagNode in between the start-end pair, then this is NOT 1666 // EMPTY. In this case, skip to the next start-end opening-closing pair. 1667 1668 for (int i=(subList.start + 1); i < subList.end; i++) 1669 if (! page.elementAt(i).isTextNode()) 1670 continue TOP; 1671 1672 // If there were only TextNode's between an opening-closing TagNode Pair.... 1673 // **AND** those TextNode's are only white-space, then this also considered 1674 // Inclusively Empty. (Get all TextNode's, and if .trim() reduces the length() 1675 // to zero, then it was only white-space. 1676 1677 if (Util.textNodesString(page, subList).trim().length() == 0) 1678 { 1679 iter.remove(); 1680 ePos -= subList.size(); 1681 removed += subList.size(); 1682 } 1683 } 1684 1685 // This process must be continued recursively, because if any inner, for instance, 1686 // <DIV> ... </DIV> was removed, then the outer list must be re-checked... 1687 1688 if (removed > 0) 1689 return removed + Remove.inclusiveEmpty(page, sPos, ePos, htmlTags); 1690 else 1691 return 0; 1692 } 1693 1694 1695 // **************************************************************************************** 1696 // **************************************************************************************** 1697 // Miscellaneous Removal Operations 1698 // **************************************************************************************** 1699 // **************************************************************************************** 1700 1701 1702 /** 1703 * Removes the first and last element of a vectorized-HTML web-page, or sub-page. 1704 * Generally, this could be used to remove the surrounding tag's {@code '<DIV>'} ... 1705 * {@code '</DIV>'}, or something similar. 1706 * 1707 * <BR /><BR />This method <B STYLE="color: red;">WILL NOT CHECK</B> whether there are 1708 * matching HTML open-and-close tags at the end beginning and end of this sub-section. 1709 * Generally, though, that is how this method is intended to be used. 1710 * 1711 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1712 * 1713 * @throws IllegalArgumentException If the {@code Vector} has fewer than two elements. 1714 */ 1715 public static void firstLast(Vector<? extends HTMLNode> html) 1716 { 1717 int size = html.size(); 1718 1719 if (size < 2) throw new IllegalArgumentException( 1720 "You have requested that the first and last elements the input 'page' parameter " + 1721 "(a vector) be removed. However, the vector size is only [" + size + "], so " + 1722 "this cannot be performed." 1723 ); 1724 1725 // NOTE: *** This removes elementAt(0) and elementAt(size-1) 1726 // *** NOT ALL ELEMENTS BETWEEN 0 and (size-1) 1727 1728 Util.Remove.nodesOPT(html, 0, size-1); 1729 } 1730 1731 } 1732 1733 1734 // ******************************************************************************************** 1735 // ******************************************************************************************** 1736 // Static Inner-Class: Inclusive 1737 // ******************************************************************************************** 1738 // ******************************************************************************************** 1739 1740 1741 /** 1742 * Tools for finding the matching-closing tag of any open {@link TagNode}. 1743 * 1744 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=UTILINCL> 1745 */ 1746 @Torello.JavaDoc.StaticFunctional 1747 public static class Inclusive 1748 { 1749 private Inclusive() { } 1750 1751 1752 // **************************************************************************************** 1753 // **************************************************************************************** 1754 // Inclusive Find/Get 1755 // **************************************************************************************** 1756 // **************************************************************************************** 1757 1758 /** 1759 * This finds the closing HTML {@code 'TagNode'} match for a given opening 1760 * {@code 'TagNode'} in a given-input html page or sub-section. 1761 * 1762 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1763 * 1764 * @param nodeIndex An index into that {@code Vector}. This index must point to an 1765 * {@code HTMLNode} element that is: 1766 * 1767 * <BR /><BR /><OL CLASS=JDOL> 1768 * <LI>An instance of {@code TagNode}</LI> 1769 * <LI>A {@code TagNode} whose {@code 'isClosing'} field is {@code FALSE}</LI> 1770 * <LI>Is not a {@code 'singleton'} HTML element-token 1771 * (i.e. {@code <IMG>, <BR>, <H1>, etc...}) 1772 * </LI> 1773 * </OL> 1774 * 1775 * @return An "inclusive search" finds {@code OpeningTag} and {@code ClosingTag} pairs - 1776 * <I>and returns all the elements between them in the contents of a 1777 * return-{@code Vector}, or {@code Vector DotPair}-end-point value</I>. This method 1778 * will take a particular node of a {@code Vector}, and (as long it has a match) 1779 * find it's <I><B>closing {@code HTMLNode} match.</B></I> The integer returned will 1780 * be the index into this page of the closing, matching {@code TagNode.} 1781 * 1782 * @throws TagNodeExpectedException If the node in the {@code Vector}-parameter 1783 * {@code 'html'} contained at index {@code 'nodeIndex'} is not an instance of 1784 * {@code TagNode}, then this exception is thrown. 1785 * 1786 * @throws OpeningTagNodeExpectedException If the node in the {@code Vector}-parameter 1787 * {@code 'html'} at index {@code 'nodeIndex'} is a closing version of the HTML element, 1788 * then this exception shall throw. 1789 * 1790 * @throws InclusiveException If the node in {@code Vector}-parameter {@code 'html'}, 1791 * pointed-to by index {@code 'nodeIndex'} is an HTML {@code 'Singleton'} / Self-Closing 1792 * Tag, then this exception will be thrown. 1793 * 1794 * @see TagNode 1795 * @see TagNode#tok 1796 * @see TagNode#isClosing 1797 * @see HTMLNode 1798 */ 1799 public static int find(Vector<? extends HTMLNode> html, int nodeIndex) 1800 { 1801 TagNode tn = null; 1802 HTMLNode n = null; 1803 String tok = null; 1804 1805 if (! html.elementAt(nodeIndex).isTagNode()) 1806 1807 throw new TagNodeExpectedException ( 1808 "You have attempted to find a closing tag to match an opening one, " + 1809 "but the 'nodeIndex' (" + nodeIndex + ") you have passed doesn't contain " + 1810 "an instance of TagNode." 1811 ); 1812 1813 else tn = (TagNode) html.elementAt(nodeIndex); 1814 1815 if (tn.isClosing) throw new OpeningTagNodeExpectedException( 1816 "The TagNode indicated by 'nodeIndex' = " + nodeIndex + " has its 'isClosing' " + 1817 "boolean as TRUE - this is not an opening TagNode, but it must be to continue." 1818 ); 1819 1820 // Checks to ensure this token is not a 'self-closing' or 'singleton' tag. 1821 // If it is an exception shall throw. 1822 InclusiveException.check(tok = tn.tok); 1823 1824 int end = html.size(); 1825 int openCount = 1; 1826 1827 for (int pos = (nodeIndex+1); pos < end; pos++) 1828 1829 if ((n = html.elementAt(pos)).isTagNode()) 1830 if ((tn = ((TagNode) n)).tok.equals(tok)) 1831 { 1832 // This keeps a "Depth Count" - where "depth" is just the number of 1833 // opened tags, for which a matching, closing tag hasn't been found yet. 1834 1835 openCount += (tn.isClosing ? -1 : 1); 1836 1837 // When all open-tags of the specified HTML Element 'tok' have been 1838 // found, search has finished. 1839 1840 if (openCount == 0) return pos; 1841 } 1842 1843 // The closing-matching tag was not found 1844 return -1; 1845 } 1846 1847 /** 1848 * Convenience Method. 1849 * <BR />Invokes: {@link #find(Vector, int)} 1850 * <BR />Converts: output to <B><CODE>'GET'</CODE></B> format ({@code Vector}-sublist) 1851 * <BR />Using: {@link Util#cloneRange(Vector, int, int)} 1852 */ 1853 public static Vector<HTMLNode> get(Vector<? extends HTMLNode> html, int nodeIndex) 1854 { 1855 int endPos = find(html, nodeIndex); 1856 1857 return (endPos == -1) ? null : cloneRange(html, nodeIndex, endPos + 1); 1858 } 1859 1860 /** 1861 * Convenience Method. 1862 * <BR />Invokes: {@link #find(Vector, int)} 1863 * <BR />Converts: output to <B><CODE>'PEEK'</CODE></B> format ({@code SubSection}) 1864 * <BR />Using: {@link Util#cloneRange(Vector, int, int)} 1865 */ 1866 public static SubSection peek(Vector<? extends HTMLNode> html, int nodeIndex) 1867 { 1868 int endPos = find(html, nodeIndex); 1869 1870 return (endPos == -1) ? null : new SubSection( 1871 new DotPair(nodeIndex, endPos), 1872 cloneRange(html, nodeIndex, endPos + 1) 1873 ); 1874 } 1875 1876 /** 1877 * Convenience Method. 1878 * <BR />Invokes: {@link #find(Vector, int)} 1879 * <BR />Converts: output to <B><CODE>'POLL'</CODE></B> format ({@code Vector}-sublist), 1880 * <BR />Using: {@link Util#pollRange(Vector, int, int)} 1881 * <BR />Removes: The requested Sub-List 1882 */ 1883 public static Vector<HTMLNode> poll(Vector<? extends HTMLNode> html, int nodeIndex) 1884 { 1885 int endPos = find(html, nodeIndex); 1886 1887 return (endPos == -1) ? null : pollRange(html, nodeIndex, endPos + 1); 1888 } 1889 1890 /** 1891 * Convenience Method. 1892 * <BR />Invokes: {@link #find(Vector, int)} 1893 * <BR />Converts: output to <B><CODE>'REMOVE'</CODE></B> format ({@code int} - number 1894 * of nodes removed) 1895 * <BR />Using: {@link Remove#range(Vector, int, int)} 1896 * <BR />Removes: The requested Sub-List 1897 */ 1898 public static int remove(Vector<? extends HTMLNode> html, int nodeIndex) 1899 { 1900 int endPos = find(html, nodeIndex); 1901 1902 return (endPos == -1) ? 0 : Util.Remove.range(html, nodeIndex, endPos + 1); 1903 } 1904 1905 1906 // **************************************************************************************** 1907 // **************************************************************************************** 1908 // Optimized Methods, Inclusive Find/Get/Subsection 1909 // **************************************************************************************** 1910 // **************************************************************************************** 1911 1912 /** 1913 * Convenience Method. 1914 * <BR />Invokes: {@link #dotPairOPT(Vector, int)} 1915 * <BR />Converts: output to {@code Vector<HTMLNode>} 1916 */ 1917 public static Vector<HTMLNode> vectorOPT(Vector<? extends HTMLNode> html, int tagPos) 1918 { 1919 DotPair dp = dotPairOPT(html, tagPos); 1920 1921 if (dp == null) return null; 1922 else return Util.cloneRange(html, dp.start, dp.end + 1); 1923 } 1924 1925 /** 1926 * Convenience Method. 1927 * <BR />Invokes: {@link #dotPairOPT(Vector, int)} 1928 * <BR />Converts: output to {@code SubSection} 1929 */ 1930 public static SubSection subSectionOPT(Vector<? extends HTMLNode> html, int tagPos) 1931 { 1932 DotPair dp = dotPairOPT(html, tagPos); 1933 1934 if (dp == null) return null; 1935 else return new SubSection(dp, Util.cloneRange(html, dp.start, dp.end + 1)); 1936 } 1937 1938 /** 1939 * 1940 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILIOPT> 1941 * <!-- Inclusive Opt Description --> 1942 * 1943 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1944 * 1945 * @param tagPos <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTTP> 1946 * 1947 * @return A <B>'DotPair'</B> version of an inclusive, end-to-end HTML tag-element. 1948 * 1949 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTJSN> 1950 * <!-- Note on JS-DOM Tree innerHTML --> 1951 * 1952 * @see TagNode 1953 * @see TagNode#isClosing 1954 * @see TagNode#tok 1955 * @see DotPair 1956 */ 1957 public static DotPair dotPairOPT(Vector<? extends HTMLNode> html, int tagPos) 1958 { 1959 // Temp Variables 1960 HTMLNode n; TagNode tn; int openCount = 1; 1961 1962 int len = html.size(); 1963 1964 // This is the name (token) of the "Opening HTML Element", we are searching for 1965 // the matching, closing element 1966 1967 String tok = ((TagNode) html.elementAt(tagPos)).tok; 1968 1969 for (int i = (tagPos+1); i < len; i++) 1970 1971 if ((n = html.elementAt(i)).isTagNode()) 1972 if ((tn = (TagNode) n).tok.equals(tok)) 1973 { 1974 // This keeps a "Depth Count" - where "depth" is just the number of 1975 // opened tags, for which a matching, closing tag hasn't been found yet. 1976 1977 openCount += (tn.isClosing ? -1 : 1); 1978 1979 // When all open-tags of the specified HTML Element 'tok' have been 1980 // found, search has finished. 1981 1982 if (openCount == 0) return new DotPair(tagPos, i); 1983 } 1984 1985 // Was not found 1986 return null; 1987 } 1988 1989 /** 1990 * Convenience Method. 1991 * <BR />Invokes: {@link #dotPairOPT(Vector, int, int)} 1992 * <BR />Converts: output to {@code Vector<HTMLNode>} 1993 */ 1994 public static Vector<HTMLNode> vectorOPT 1995 (Vector<? extends HTMLNode> html, int tagPos, int end) 1996 { 1997 DotPair dp = dotPairOPT(html, tagPos, end); 1998 1999 if (dp == null) return null; 2000 else return Util.cloneRange(html, dp.start, dp.end + 1); 2001 } 2002 2003 /** 2004 * Convenience Method. 2005 * <BR />Invokes: {@link #dotPairOPT(Vector, int, int)} 2006 * <BR />Converts: output to {@code SubSection} 2007 */ 2008 public static SubSection subSectionOPT 2009 (Vector<? extends HTMLNode> html, int tagPos, int end) 2010 { 2011 DotPair dp = dotPairOPT(html, tagPos, end); 2012 2013 if (dp == null) return null; 2014 else return new SubSection(dp, Util.cloneRange(html, dp.start, dp.end + 1)); 2015 } 2016 2017 /** 2018 * 2019 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILIOPT> 2020 * <!-- Inclusive Opt Description --> 2021 * 2022 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 2023 * 2024 * @param tagPos <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTTP> 2025 * 2026 * @param end <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTEND> 2027 * 2028 * @return A <B>'DotPair'</B> version of an inclusive, end-to-end HTML tag-element. 2029 * 2030 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTJSN> 2031 * <!-- Note on JS-DOM Tree innerHTML --> 2032 * 2033 * @see TagNode 2034 * @see TagNode#isClosing 2035 * @see TagNode#tok 2036 * @see DotPair 2037 */ 2038 public static DotPair dotPairOPT(Vector<? extends HTMLNode> html, int tagPos, int end) 2039 { 2040 // Temp Variables 2041 HTMLNode n; TagNode tn; int openCount = 1; int endPos; 2042 2043 // This is the name (token) of the "Opening HTML Element", we are searching for 2044 // the matching, closing element 2045 String tok = ((TagNode) html.elementAt(tagPos)).tok; 2046 2047 for (endPos = (tagPos+1); endPos < end; endPos++) 2048 2049 if ((n = html.elementAt(endPos)).isTagNode()) 2050 if ((tn = (TagNode) n).tok.equals(tok)) 2051 { 2052 // This keeps a "Depth Count" - where "depth" is just the number of 2053 // opened tags, for which a matching, closing tag hasn't been found yet. 2054 openCount += (tn.isClosing ? -1 : 1); 2055 2056 // When all open-tags of the specified HTML Element 'tok' have been 2057 // found, search has finished. 2058 if (openCount == 0) return new DotPair(tagPos, endPos); 2059 } 2060 2061 // The end of the vectorized-html page (or subsection) was reached, but the 2062 // matching-closing element was not found. 2063 return null; // assert(endPos == html.size()); 2064 } 2065 } 2066}