001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006 007import java.util.function.Predicate; 008 009import Torello.HTML.NodeSearch.*; 010import Torello.Java.*; 011 012import Torello.Java.Additional.Ret2; 013 014/** 015 * A long list of utilities for searching, finding, extracting and removing HTML from 016 * Vectorized-HTML. 017 * 018 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=UTIL> 019 */ 020@Torello.JavaDoc.StaticFunctional 021public class Util 022{ 023 private Util() { } 024 025 026 // ******************************************************************************************** 027 // ******************************************************************************************** 028 // Trim TextNode Strings 029 // ******************************************************************************************** 030 // ******************************************************************************************** 031 032 033 /** 034 * Convenience Method. 035 * <BR />Invokes: {@link #trimTextNodes(Vector, int, int, boolean)} 036 */ 037 public static int trimTextNodes(Vector<HTMLNode> page, boolean deleteZeroLengthStrings) 038 { return trimTextNodes(page, 0, -1, deleteZeroLengthStrings); } 039 040 /** 041 * Convenience Method. 042 * <BR />Receives: {@code DotPair} 043 * <BR />Invokes: {@link #trimTextNodes(Vector, int, int, boolean)} 044 */ 045 public static int trimTextNodes 046 (Vector<HTMLNode> page, DotPair dp, boolean deleteZeroLengthStrings) 047 { return trimTextNodes(page, dp.start, dp.end + 1, deleteZeroLengthStrings); } 048 049 /** 050 * This will iterate through the entire {@code Vector<HTMLNode>}, and invoke 051 * {@code java.lang.String.trim()} on each {@code TextNode} on the page. If this invocation 052 * results in a reduction of {@code String.length()}, then a new {@code TextNode} will be 053 * instantiated whose {@code TextNode.str} field is set to the result of the 054 * {@code String.trim(old_node.str)} operation. 055 * 056 * @param deleteZeroLengthStrings If a {@code TextNode's} length is zero (before or after 057 * {@code trim()} is called) and when this parameter is {@code TRUE}, that {@code TextNode} 058 * must be removed from the {@code Vector}. 059 * 060 * @return Any node that is trimmed or deleted will increment the counter. This counter 061 * final-value is returned 062 */ 063 public static int trimTextNodes 064 (Vector<HTMLNode> page, int sPos, int ePos, boolean deleteZeroLengthStrings) 065 { 066 int counter = 0; 067 IntStream.Builder b = deleteZeroLengthStrings ? IntStream.builder() : null; 068 HTMLNode n = null; 069 LV l = new LV(page, sPos, ePos); 070 071 for (int i=l.start; i < l.end; i++) 072 073 if ((n = page.elementAt(i)).isTextNode()) 074 { 075 String trimmed = n.str.trim(); 076 int trimmedLength = trimmed.length(); 077 078 if ((trimmedLength == 0) && deleteZeroLengthStrings) 079 { b.add(i); counter++; } 080 081 else if (trimmedLength < n.str.length()) 082 { page.setElementAt(new TextNode(trimmed), i); counter++; } 083 } 084 085 if (deleteZeroLengthStrings) Util.Remove.nodesOPT(page, b.build().toArray()); 086 087 return counter; 088 } 089 090 091 // ******************************************************************************************** 092 // ******************************************************************************************** 093 // Vectorized-HTML To-String Methods 094 // ******************************************************************************************** 095 // ******************************************************************************************** 096 097 098 /** 099 * Convenience Method. 100 * <BR />Invokes: {@link #rangeToString(Vector, int, int)} 101 */ 102 public static String pageToString(Vector<? extends HTMLNode> html) 103 { return rangeToString(html, 0, -1); } 104 105 /** 106 * Convenience Method. 107 * <BR />Receives: {@code DotPair} 108 * <BR />Invokes: {@link #rangeToString(Vector, int, int)} 109 */ 110 public static String rangeToString(Vector<? extends HTMLNode> html, DotPair dp) 111 { return rangeToString(html, dp.start, dp.end + 1); } 112 113 /** 114 * The purpose of this method/function is to convert a portion of the contents of an HTML-Page, 115 * currently being represented as a {@code Vector} of {@code HTMLNode's} into a {@code String.} 116 * Two {@code 'int'} parameters are provided in this method's signature to define a sub-list 117 * of a page to be converted to a {@code java.lang.String} 118 * 119 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 120 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 121 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 122 * 123 * @return The {@code Vector} converted into a {@code String}. 124 * 125 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 126 * 127 * @see #pageToString(Vector) 128 * @see #rangeToString(Vector, DotPair) 129 */ 130 public static String rangeToString(Vector<? extends HTMLNode> html, int sPos, int ePos) 131 { 132 StringBuilder ret = new StringBuilder(); 133 LV l = new LV(html, sPos, ePos); 134 135 for (int i=l.start; i < l.end; i++) ret.append(html.elementAt(i).str); 136 137 return ret.toString(); 138 } 139 140 141 // ******************************************************************************************** 142 // ******************************************************************************************** 143 // Vectorized-HTML TextNode To-String Methods 144 // ******************************************************************************************** 145 // ******************************************************************************************** 146 147 148 /** 149 * Convenience Method. 150 * <BR />Invokes: {@link #textNodesString(Vector, int, int)} 151 */ 152 public static String textNodesString(Vector<? extends HTMLNode> html) 153 { return textNodesString(html, 0, -1); } 154 155 /** 156 * Convenience Method. 157 * <BR />Receives: {@code DotPair} 158 * <BR />Invokes: {@link #textNodesString(Vector, int, int)} 159 */ 160 public static String textNodesString(Vector<? extends HTMLNode> html, DotPair dp) 161 { return textNodesString(html, dp.start, dp.end + 1); } 162 163 /** 164 * This will return a {@code String} that is comprised of ONLY the {@code TextNode's} contained 165 * within the input {@code Vector} - <I>and furthermore, only nodes that are situated between 166 * index {@code int 'sPos'} and index {@code int 'ePos'} in that {@code Vector.}</I> 167 * 168 * <BR /><BR />The {@code for-loop} that iterates the input-{@code Vector} parameter will 169 * simply skip an instance of {@code 'TagNode'} and {@code 'CommentNode'} when building the 170 * output return {@code String.}. 171 * 172 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 173 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 174 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 175 * 176 * @return This will return a {@code String} that is comprised of the text-only elements in the 177 * web-page or sub-page. Only text between the requested {@code Vector}-indices is included. 178 * 179 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 180 * 181 * @see #textNodesString(Vector, DotPair) 182 * @see #textNodesString(Vector) 183 */ 184 public static String textNodesString(Vector<? extends HTMLNode> html, int sPos, int ePos) 185 { 186 StringBuilder sb = new StringBuilder(); 187 LV l = new LV(html, sPos, ePos); 188 HTMLNode n; 189 190 for (int i=l.start; i < l.end; i++) 191 if ((n = html.elementAt(i)).isTextNode()) 192 sb.append(n.str); 193 194 return sb.toString(); 195 } 196 197 198 // ******************************************************************************************** 199 // ******************************************************************************************** 200 // TextNode Modification Operations - "Escape Text Nodes" 201 // ******************************************************************************************** 202 // ******************************************************************************************** 203 204 205 /** 206 * Convenience Method. 207 * <BR />Invokes: {@link #escapeTextNodes(Vector, int, int)} 208 */ 209 public static int escapeTextNodes(Vector<HTMLNode> html) 210 { return escapeTextNodes(html, 0, -1); } 211 212 /** 213 * Convenience Method. 214 * <BR />Receives: {@code DotPair} 215 * <BR />Invokes: {@link #escapeTextNodes(Vector, int, int)} 216 */ 217 public static int escapeTextNodes(Vector<HTMLNode> html, DotPair dp) 218 { return escapeTextNodes(html, dp.start, dp.end + 1); } 219 220 /** 221 * Will call {@code HTML.Escape.replaceAll} on each {@code TextNode} in the range of 222 * {@code sPos ... ePos} 223 * 224 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 225 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 226 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 227 * 228 * @return The number of {@code TextNode's} that changed as a result of the 229 * {@code Escape.replaceAll(n.str)} loop. 230 * 231 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 232 * 233 * @see Escape#replaceAll(String) 234 */ 235 public static int escapeTextNodes(Vector<HTMLNode> html, int sPos, int ePos) 236 { 237 LV l = new LV(html, sPos, ePos); 238 HTMLNode n = null; 239 String s = null; 240 int counter = 0; 241 242 for (int i=l.start; i < l.end; i++) 243 244 if ((n = html.elementAt(i)).isTextNode()) 245 if (! (s = Escape.replace(n.str)).equals(n.str)) 246 { 247 html.setElementAt(new TextNode(s), i); 248 counter++; 249 } 250 251 return counter; 252 } 253 254 255 // ******************************************************************************************** 256 // ******************************************************************************************** 257 // Clone HTML Vectors 258 // ******************************************************************************************** 259 // ******************************************************************************************** 260 261 262 /** 263 * Convenience Method. 264 * <BR />Invokes: {@link #cloneRange(Vector, int, int)} 265 */ 266 public static Vector<HTMLNode> clone(Vector<? extends HTMLNode> html) 267 { return cloneRange(html, 0, -1); } 268 269 /** 270 * Convenience Method. 271 * <BR />Receives: {@code DotPair} 272 * <BR />Invokes: {@link #cloneRange(Vector, int, int)} 273 */ 274 public static Vector<HTMLNode> cloneRange(Vector<? extends HTMLNode> html, DotPair dp) 275 { return cloneRange(html, dp.start, dp.end + 1); } 276 277 /** 278 * Copies (clones!) a sub-range of the HTML page, stores the results in a {@code Vector}, and 279 * returns it. 280 * 281 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 282 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 283 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 284 * 285 * @return The "cloned" (copied) sub-range specified by {@code 'sPos'} and {@code 'ePos'.} 286 * 287 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 288 * 289 * @see #cloneRange(Vector, DotPair) 290 */ 291 public static Vector<HTMLNode> cloneRange(Vector<? extends HTMLNode> html, int sPos, int ePos) 292 { 293 LV l = new LV(html, sPos, ePos); 294 Vector<HTMLNode> ret = new Vector<>(l.size()); 295 296 // Copy the range specified into the return vector 297 // 298 // HOW THIS WAS DONE BEFORE NOTICING Vector.subList 299 // 300 // for (int i = l.start; i < l.end; i++) ret.addElement(html.elementAt(i)); 301 302 ret.addAll(html.subList(l.start, l.end)); 303 304 return ret; 305 } 306 307 308 309 // ******************************************************************************************** 310 // ******************************************************************************************** 311 // String Length of the TextNode's 312 // ******************************************************************************************** 313 // ******************************************************************************************** 314 315 316 /** 317 * Convenience Method. 318 * <BR />Receives: {@code DotPair} 319 * <BR />Invokes: {@link #textStrLength(Vector, int, int)} 320 */ 321 public static int textStrLength(Vector<? extends HTMLNode> html, DotPair dp) 322 { return textStrLength(html, dp.start, dp.end + 1); } 323 324 /** 325 * Convenience Method. 326 * <BR />Invokes: {@link #textStrLength(Vector, int, int)} 327 */ 328 public static int textStrLength(Vector<? extends HTMLNode> html) 329 { return textStrLength(html, 0, -1); } 330 331 /** 332 * This method will return the length of the strings <I><B>contained by all/only instances of 333 * {@code 'TextNode'}</B></I> among the nodes of the input HTML-{@code Vector}. This is 334 * identical to the behavior of the method with the same name, but includes starting and ending 335 * bounds on the html {@code Vector}: {@code 'sPos'} & {@code 'ePos'}. 336 * 337 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 338 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 339 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 340 * 341 * @return The sum of the lengths of the text contained by text-nodes in the {@code Vector} 342 * between {@code 'sPos'} and {@code 'ePos'}. 343 * 344 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 345 */ 346 public static int textStrLength(Vector<? extends HTMLNode> html, int sPos, int ePos) 347 { 348 HTMLNode n; 349 int sum = 0; 350 LV l = new LV(html, sPos, ePos); 351 352 // Counts the length of each "String" in a "TextNode" between sPos and ePos 353 for (int i=l.start; i < l.end; i++) 354 355 if ((n = html.elementAt(i)).isTextNode()) 356 sum += n.str.length(); 357 358 return sum; 359 } 360 361 362 // ******************************************************************************************** 363 // ******************************************************************************************** 364 // Compact Adjacent / Adjoining TextNode's 365 // ******************************************************************************************** 366 // ******************************************************************************************** 367 368 369 /** 370 * Convenience Method. 371 * <BR />Invokes: {@link #compactTextNodes(Vector, int, int)} 372 */ 373 public static int compactTextNodes(Vector<HTMLNode> html) 374 { return compactTextNodes(html, 0, html.size()); } 375 376 /** 377 * Convenience Method. 378 * <BR />Receives: {@code DotPair} 379 * <BR />Invokes: {@link #compactTextNodes(Vector, int, int)} 380 */ 381 public static int compactTextNodes(Vector<HTMLNode> html, DotPair dp) 382 { return compactTextNodes(html, dp.start, dp.end + 1); } 383 384 /** 385 * Occasionally, when removing instances of {@code TagNode} from a vectorized-html 386 * page, certain instances of {@code TextNode} which were not adjacent / neighbours in 387 * the {@code Vector}, all of a sudden become adjacent. Although there are no major problems 388 * with contiguous instances of {@code TextNode} from the Search Algorithm's perspective, 389 * for programmer's, it can sometimes be befuddling to realize that the output text that 390 * is returned from a call to {@code Util.pageToString(html)} is not being found because 391 * the text that is left is broken amongst multiple instances of adjacent TextNodes. 392 * 393 * <BR /><BR />This method merely combines "Adjacent" instances of {@code class TextNode} 394 * in the {@code Vector} into single instances of {@code class TextNode} 395 * 396 * @param html Any vectorized-html web-page. If this page contain any contiguously placed 397 * {@code TextNode's}, the extra's will be eliminated, and the internal-string's inside the 398 * node's ({@code TextNode.str}) will be combined. This action will reduce the size of the 399 * actual html-{@code Vector}. 400 * 401 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 402 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 403 * 404 * @return The number of nodes that were eliminated after being combined, or 0 if there 405 * were no text-nodes that were removed. 406 * 407 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 408 * 409 * @see HTMLNode#str 410 * @see TextNode 411 */ 412 public static int compactTextNodes(Vector<HTMLNode> html, int sPos, int ePos) 413 { 414 LV l = new LV(html, sPos, ePos); 415 boolean compacting = false; 416 int firstPos = -1; 417 int delta = 0; 418 419 for (int i=l.start; i < (l.end - delta); i++) 420 421 if (html.elementAt(i).isTextNode()) 422 { 423 if (compacting) continue; // Not in "Compacting Mode" 424 compacting = true; // Start "Compacting Mode" - this is a TextNode 425 firstPos = i; 426 } 427 428 else if (compacting && (firstPos < (i-1))) // Else - Must be a TagNode or CommentNode 429 { 430 // Save compacted TextNode String's into this StringBuilder 431 StringBuilder compacted = new StringBuilder(); 432 433 // Iterate all TextNodes that were adjacent, put them together into StringBuilder 434 for (int j=firstPos; j < i; j++) compacted.append(html.elementAt(j).str); 435 436 // Place this new "aggregate TextNode" at location of the first TextNode that 437 // was compacted into this StringBuilder 438 439 html.setElementAt(new TextNode(compacted.toString()), firstPos); 440 441 // Remove the rest of the positions in the Vector that had TextNode's. These have 442 // all been put together into the "Aggregate TextNode" at position "firstPos" 443 444 Util.Remove.range(html, firstPos + 1, i); 445 446 // The change in the size of the Vector needs to be accounted for. 447 delta += (i - firstPos - 1); 448 449 // Change the loop-counter variable, too, since the size of the Vector has changed. 450 i = firstPos + 1; 451 452 // Since we just hit a CommentNode, or TagNode, exit "Compacting Mode." 453 compacting = false; 454 455 } 456 457 // NOTE: This, ALSO, MUST BE a TagNode or CommentNode (just like the previous 458 // if-else branch !) 459 // TRICKY: Don't forget this 'else' ! 460 461 else compacting = false; 462 463 // Added - Don't forget the case where the Vector ends with a series of TextNodes 464 // TRICKY TOO! (Same as the HTML Parser... The ending or 'trailing' nodes must be parsed 465 466 int lastNodePos = html.size() - 1; 467 468 if (html.elementAt(lastNodePos).isTextNode()) if (compacting && (firstPos < lastNodePos)) 469 { 470 StringBuilder compacted = new StringBuilder(); 471 472 // Compact the TextNodes that were identified at the end of the Vector range. 473 for (int j=firstPos; j <= lastNodePos; j++) compacted.append(html.elementAt(j).str); 474 475 // Replace the group of TextNode's at the end of the Vector, with the single, aggregate 476 html.setElementAt(new TextNode(compacted.toString()), firstPos); 477 Util.Remove.range(html, firstPos + 1, lastNodePos + 1); 478 } 479 480 return delta; 481 } 482 483 484 // ******************************************************************************************** 485 // ******************************************************************************************** 486 // String-Length Operations 487 // ******************************************************************************************** 488 // ******************************************************************************************** 489 490 491 /** 492 * Convenience Method. 493 * <BR />Invokes: {@link #strLength(Vector, int, int)} 494 */ 495 public static int strLength(Vector<? extends HTMLNode> html) 496 { return strLength(html, 0, -1); } 497 498 /** 499 * Convenience Method. 500 * <BR />Receives: {@code DotPair} 501 * <BR />Invokes: {@link #strLength(Vector, int, int)} 502 */ 503 public static int strLength(Vector<? extends HTMLNode> html, DotPair dp) 504 { return strLength(html, dp.start, dp.end + 1); } 505 506 /** 507 * This method simply adds / sums the {@code String}-length of every {@code HTMLNode.str } 508 * field in the passed page-{@code Vector}. It only counts nodes between parameters 509 * {@code sPos} (inclusive) and {@code ePos} (exclusive). 510 * 511 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 512 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 513 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 514 * 515 * @return The total length <B><I>- in characters -</I></B> of the sub-page of HTML between 516 * {@code 'sPos'} and {@code 'ePos'} 517 * 518 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 519 * 520 * @see #strLength(Vector) 521 */ 522 public static int strLength(Vector<? extends HTMLNode> html, int sPos, int ePos) 523 { 524 int ret = 0; 525 LV l = new LV(html, sPos, ePos); 526 527 for (int i=l.start; i < l.end; i++) ret += html.elementAt(i).str.length(); 528 529 return ret; 530 } 531 532 533 // ******************************************************************************************** 534 // ******************************************************************************************** 535 // Hash-Code Operations 536 // ******************************************************************************************** 537 // ******************************************************************************************** 538 539 540 /** 541 * Convenience Method. 542 * <BR />Invokes: {@link #hashCode(Vector, int, int)} 543 */ 544 public static int hashCode(Vector<? extends HTMLNode> html) 545 { return hashCode(html, 0, -1); } 546 547 /** 548 * Convenience Method. 549 * <BR />Receives: {@code DotPair} 550 * <BR />Invokes: {@link #hashCode(Vector, int, int)} 551 */ 552 public static int hashCode(Vector<? extends HTMLNode> html, DotPair dp) 553 { return hashCode(html, dp.start, dp.end + 1); } 554 555 /** 556 * Generates a hash-code for a vectorized html page-{@code Vector}. 557 * 558 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 559 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 560 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 561 * 562 * @return Returns the {@code String.hashCode()} of the <I><B>partial HTML-page</B></i> as if 563 * it were not being stored as a {@code Vector}, but rather as HTML inside of a 564 * Java-{@code String}. 565 * 566 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 567 * 568 * @see #hashCode(Vector) 569 */ 570 public static int hashCode(Vector<? extends HTMLNode> html, int sPos, int ePos) 571 { 572 int h = 0; 573 LV lv = new LV(html, sPos, ePos); 574 575 for (int j=lv.start; j < lv.end; j++) 576 { 577 String s = html.elementAt(j).str; 578 int l = s.length(); 579 580 // This line has been copied from the jdk8/jdk8 "String.hashCode()" method. 581 // The difference is that it iterates over the entire vector 582 583 for (int i=0; i < l; i++) h = 31 * h + s.charAt(i); 584 } 585 586 return h; 587 } 588 589 590 // ******************************************************************************************** 591 // ******************************************************************************************** 592 // JSON Script Nodes 593 // ******************************************************************************************** 594 // ******************************************************************************************** 595 596 597 /** 598 * Convenience Method. 599 * <BR />Invokes: {@link #getJSONScriptBlocks(Vector, int, int)} 600 */ 601 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html) 602 { return getJSONScriptBlocks(html, 0, -1); } 603 604 /** 605 * Convenience Method. 606 * <BR />Receives: {@code DotPair}. 607 * <BR />Invokes: {@link #getJSONScriptBlocks(Vector, int, int)} 608 */ 609 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html, DotPair dp) 610 { return getJSONScriptBlocks(html, dp.start, dp.end + 1); } 611 612 /** 613 * This method shall search for any and all {@code <SCRIPT TYPE="json">} 614 * <I>JSON TEXT</I> {@code </SCRIPT>} block present in a range of Vectorized HTML. The 615 * search method shall simply look for the toke {@code "JSON"} in the {@code TYPE} attribute 616 * of each and every {@code <SCRIPT> TagNode} that is found on the page. The validity of the 617 * {@code JSON} found within such blocks <I>is not checked for validity, nor is it even 618 * guaranteed to be {@code JSON} data!</I> 619 * 620 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 621 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 622 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 623 * 624 * @return This will return a {@code java.util.stream.Stream<String>} of each of the 625 * {@code JSON} elements present in the specified range of the Vectorized HTML passed to 626 * parameter {@code 'html'}. 627 * 628 * <EMBED CLASS='external-html' DATA-FILE-ID=STRMCNVT> 629 * 630 * @see StrTokCmpr#containsIgnoreCase(String, Predicate, String) 631 * @see Util#rangeToString(Vector, int, int) 632 */ 633 public static Stream<String> getJSONScriptBlocks(Vector<HTMLNode> html, int sPos, int ePos) 634 { 635 // Whenever building lists, it is usually easiest to use a Stream.Builder 636 Stream.Builder<String> b = Stream.builder(); 637 638 // This Predicate simply tests that if the substring "json" (CASE INSENSITIVE) is found 639 // in the TYPE attribute of a <SCRIPT TYPE=...> node, that the token-string is, indeed a 640 // word - not a substring of some other word. For instance: TYPE="json" would PASS, but 641 // TYPE="rajsong" would FAIL - because the token string is not surrounded by white-space 642 643 final Predicate<String> tester = (String s) -> 644 StrTokCmpr.containsIgnoreCase 645 (s, (Character c) -> ! Character.isLetterOrDigit(c), "json"); 646 647 // Find all <SCRIPT> node-blocks whose "TYPE" attribute abides by the tester 648 // String-Predicate named above. 649 650 Vector<DotPair> jsonDPList = InnerTagFindInclusive.all 651 (html, sPos, ePos, "script", "type", tester); 652 653 // Convert each of these DotPair element into a java.lang.String 654 // Add the String to the Stream.Builder<String> 655 656 for (DotPair jsonDP : jsonDPList) 657 if (jsonDP.size() > 2) 658 b.accept(Util.rangeToString(html, jsonDP.start + 1, jsonDP.end)); 659 660 // Build the Stream, and return it. 661 return b.build(); 662 } 663 664 665 // ******************************************************************************************** 666 // ******************************************************************************************** 667 // MISC 668 // ******************************************************************************************** 669 // ******************************************************************************************** 670 671 672 /** 673 * Inserts nodes, and allows a 'varargs' parameter. 674 * 675 * @param html Any HTML Page 676 * 677 * @param pos The position in the original {@code Vector} where the nodes shall be inserted. 678 * 679 * @param nodes A list of nodes to insert. 680 */ 681 public static void insertNodes(Vector<HTMLNode> html, int pos, HTMLNode... nodes) 682 { 683 Vector<HTMLNode> nodesVec = new Vector<>(nodes.length); 684 for (HTMLNode node : nodes) nodesVec.addElement(node); 685 html.addAll(pos, nodesVec); 686 } 687 688 /** 689 * Convenience Method. 690 * <BR />Invokes: {@link #replaceRange(Vector, int, int, Vector)} 691 */ 692 public static void replaceRange 693 (Vector<HTMLNode> page, DotPair range, Vector<HTMLNode> newNodes) 694 { replaceRange(page, range.start, range.end+1, newNodes); } 695 696 /** 697 * Replaces any all and all {@code HTMLNode's} located between the {@code Vector} locations 698 * {@code 'sPos'} (inclusive) and {@code 'ePos'} (exclusive). By exclusive, this means that 699 * the {@code HTMLNode} located at positon {@code 'ePos'} <B><I>will not</I></B> be replaced, 700 * but the one at {@code 'sPos'} <I><B>is replaced</B></I>. 701 * 702 * <BR /><BR />The size of the {@code Vector} will change by {@code newNodes.size() - 703 * (ePos + sPos)}. The contents situated between {@code Vector} location {@code sPos} and 704 * {@code sPos + newNodes.size()} will, indeed, be the contents of the {@code 'newNodes'} 705 * parameter. 706 * 707 * @param page Any Java HTML page, constructed of {@code HTMLNode (TagNode & TextNode)} 708 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 709 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 710 * @param newNodes Any Java HTML page-{@code Vector} of {@code HTMLNode}. 711 * 712 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 713 * 714 * @see #pollRange(Vector, int, int) 715 * @see Remove#range(Vector, int, int) 716 * @see #replaceRange(Vector, DotPair, Vector) 717 */ 718 public static void replaceRange 719 (Vector<HTMLNode> page, int sPos, int ePos, Vector<HTMLNode> newNodes) 720 { 721 // Torello.Java.LV 722 LV l = new LV(sPos, ePos, page); 723 724 int oldSize = ePos - sPos; 725 int newSize = newNodes.size(); 726 int insertPos = sPos; 727 int i = 0; 728 729 while ((i < newSize) && (i < oldSize)) 730 page.setElementAt(newNodes.elementAt(i++), insertPos++); 731 732 733 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 734 // CASE ONE: 735 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 736 737 if (newSize == oldSize) return; 738 739 740 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 741 // CASE TWO: 742 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 743 // 744 // The new Vector is SMALLER than the old sub-range 745 // The rest of the nodes just need to be trashed 746 // 747 // OLD-WAY: (Before realizing what Vector.subList is actually doing) 748 // Util.removeRange(page, insertPos, ePos); 749 750 if (newSize < oldSize) page.subList(insertPos, ePos).clear(); 751 752 753 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 754 // CASE THREE: 755 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 756 // 757 // The new Vector is BIGGER than the old sub-range 758 // There are still more nodes to insert. 759 760 else page.addAll(ePos, newNodes.subList(i, newSize)); 761 } 762 763 /** 764 * Java's {@code java.util.Vector} class does not allow public access to the 765 * {@code removeRange(start, end)} function. It is listed as {@code 'protected'} in Java's 766 * Documentation about the {@code class Vector.} This method upstages that, and performs the 767 * {@code 'Poll'} operation, where the nodes are first removed, stored, and then return as a 768 * function result. 769 * 770 * <BR /><BR /><B CLASS=JDDescLabel>Poll a Range:</B> 771 * 772 * <BR />The nodes that are removed are placed in a separate return {@code Vector}, and 773 * returned as a result to this method. 774 * 775 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 776 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 777 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 778 * 779 * @return A complete list ({@code Vector<HTMLNode>}) of the nodes that were removed. 780 * 781 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 782 * 783 * @see Remove#range(Vector, int, int) 784 * @see Remove#range(Vector, DotPair) 785 * @see #pollRange(Vector, DotPair) 786 */ 787 public static Vector<HTMLNode> pollRange(Vector<? extends HTMLNode> html, int sPos, int ePos) 788 { 789 // The original version of this method is preserved inside comments at the bottom of this 790 // method. Prior to seeing the Sun-Oracle Docs explaining that the return from the SubList 791 // operation "mirrors changes" back to to the original vector, the code in the comments is 792 // how this method was accomplished. 793 794 LV l = new LV(html, sPos, ePos); 795 Vector<HTMLNode> ret = new Vector<HTMLNode>(l.end - l.start); 796 List<? extends HTMLNode> list = html.subList(l.start, l.end); 797 798 // Copy the Nodes into the return Vector that the end-user receives 799 ret.addAll(list); 800 801 // Clear the nodes out of the original Vector. The Sun-Oracle Docs 802 // state that the returned sub-list is "mirrored back into" the original 803 804 list.clear(); 805 806 // Return the Vector to the user. Note that the List<HTMLNode> CANNOT be returned, 807 // because of it's mirror-qualities, and because this method expects a vector. 808 809 return ret; 810 811 /* 812 // BEFORE READING ABOUT Vector.subList(...), this is how this was accomplished: 813 // NOTE: It isn't so clear how the List<HTMLNode> works - likely it doesn't actually 814 // create any new memory-allocated arrays, it is just an "overlay" 815 816 // Copy the elements from the input vector into the return vector 817 for (int i=l.start; i < l.end; i++) ret.add(html.elementAt(i)); 818 819 // Remove the range from the input vector (this is the meaning of 'poll') 820 Util.removeRange(html, sPos, ePos); 821 822 return ret; 823 */ 824 } 825 826 /** 827 * Convenience Method. 828 * <BR />Receives: {@code DotPair} 829 * <BR />Invokes: {@link #pollRange(Vector, int, int)}. 830 */ 831 public static Vector<HTMLNode> pollRange(Vector<? extends HTMLNode> html, DotPair dp) 832 { return pollRange(html, dp.start, dp.end + 1); } 833 834 /** 835 * This removes every element from the {@code Vector} beginning at position 0, all the way to 836 * position {@code 'pos'} (exclusive). The {@code elementAt(pos)} remains in the original page 837 * input-{@code Vector}. This is the definition of 'exclusive'. 838 * 839 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 840 * 841 * @param pos Any position within the range of the input {@code Vector}. 842 * 843 * @return The elements in the {@code Vector} from position: {@code 0 ('zero')} all the way to 844 * position: {@code 'pos'} 845 */ 846 public static Vector<HTMLNode> split(Vector<? extends HTMLNode> html, int pos) 847 { return pollRange(html, 0, pos); } 848 849 850 // ******************************************************************************************** 851 // ******************************************************************************************** 852 // Static Inner-Class: Count 853 // ******************************************************************************************** 854 // ******************************************************************************************** 855 856 857 @Torello.JavaDoc.StaticFunctional 858 public static class Count 859 { 860 private Count() { } 861 862 863 // **************************************************************************************** 864 // **************************************************************************************** 865 // Count TextNode's 866 // **************************************************************************************** 867 // **************************************************************************************** 868 869 870 /** 871 * Convenience Method. 872 * <BR />Invokes: {@link #textNodes(Vector, int, int)} 873 */ 874 public static int textNodes(Vector<HTMLNode> page) 875 { return textNodes(page, 0, -1); } 876 877 /** 878 * Convenience Method. 879 * <BR />Receives: {@code DotPair} 880 * <BR />Invokes: {@link #textNodes(Vector, int, int)} 881 */ 882 public static int textNodes(Vector<HTMLNode> page, DotPair dp) 883 { return textNodes(page, dp.start, dp.end + 1); } 884 885 /** 886 * Counts the number of {@code TextNode's} in a {@code Vector<HTMLNode>} between the 887 * demarcated array / {@code Vector} positions, {@code 'sPos'} and {@code 'ePos'} 888 * 889 * @param page Any HTML page. 890 * 891 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 892 * 893 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 894 * 895 * @return The number of {@code TextNode's} in the {@code Vector} between the demarcated 896 * indices. 897 * 898 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 899 */ 900 public static int textNodes(Vector<HTMLNode> page, int sPos, int ePos) 901 { 902 int counter = 0; 903 LV l = new LV(page, sPos, ePos); 904 905 // Iterates the entire page between sPos and ePos, incrementing the count for every 906 // instance of text-node. 907 908 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTextNode()) counter++; 909 910 return counter; 911 } 912 913 914 // **************************************************************************************** 915 // **************************************************************************************** 916 // Count CommentNode's 917 // **************************************************************************************** 918 // **************************************************************************************** 919 920 921 /** 922 * Convenience Method. 923 * <BR />Invokes: {@link #commentNodes(Vector, int, int)} 924 */ 925 public static int commentNodes(Vector<HTMLNode> page) 926 { return commentNodes(page, 0, -1); } 927 928 /** 929 * Convenience Method. 930 * <BR />Receives: {@code DotPair} 931 * <BR />Invokes: {@link #commentNodes(Vector, int, int)} 932 */ 933 public static int commentNodes(Vector<HTMLNode> page, DotPair dp) 934 { return commentNodes(page, dp.start, dp.end + 1); } 935 936 /** 937 * Counts the number of {@code CommentNode's} in an {@code Vector<HTMLNode>} between the 938 * demarcated array / {@code Vector} positions. 939 * 940 * @param page Any HTML page. 941 * 942 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 943 * 944 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 945 * 946 * @return The number of {@code CommentNode's} in the {@code Vector} between the demarcated 947 * indices. 948 * 949 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 950 */ 951 public static int commentNodes(Vector<HTMLNode> page, int sPos, int ePos) 952 { 953 int counter = 0; 954 LV l = new LV(page, sPos, ePos); 955 956 // Iterates the entire page between sPos and ePos, incrementing the count for every 957 // instance of comment-node. 958 959 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isCommentNode()) counter++; 960 961 return counter; 962 } 963 964 965 // **************************************************************************************** 966 // **************************************************************************************** 967 // Count TagNode's 968 // **************************************************************************************** 969 // **************************************************************************************** 970 971 972 /** 973 * Convenience Method. 974 * <BR />Invokes: {@link #tagNodes(Vector, int, int)} 975 */ 976 public static int tagNodes(Vector<HTMLNode> page) 977 { return tagNodes(page, 0, -1); } 978 979 /** 980 * Convenience Method. 981 * <BR />Receives: {@code DotPair} 982 * <BR />Invokes: {@link #tagNodes(Vector, int, int)} 983 */ 984 public static int tagNodes(Vector<HTMLNode> page, DotPair dp) 985 { return tagNodes(page, dp.start, dp.end + 1); } 986 987 /** 988 * Counts the number of {@code TagNode's} in a {@code Vector<HTMLNode>} between the 989 * demarcated array / {@code Vector} positions. 990 * 991 * @param page Any HTML page. 992 * 993 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 994 * 995 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 996 * 997 * @return The number of {@code TagNode's} in the {@code Vector}. 998 * 999 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1000 */ 1001 public static int tagNodes(Vector<HTMLNode> page, int sPos, int ePos) 1002 { 1003 int counter = 0; 1004 LV l = new LV(page, sPos, ePos); 1005 1006 // Iterates the entire page between sPos and ePos, incrementing the count for every 1007 // instance of TagNode. 1008 1009 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTagNode()) counter++; 1010 1011 return counter; 1012 } 1013 1014 1015 // **************************************************************************************** 1016 // **************************************************************************************** 1017 // Count TagNode's, put results in a java table/map 1018 // **************************************************************************************** 1019 // **************************************************************************************** 1020 1021 1022 /** 1023 * Convenience Method. 1024 * <BR />Invokes: {@link #tagNodes(Vector, int, int)} 1025 */ 1026 public static Ret2< 1027 Hashtable<String, Integer>, 1028 Hashtable<String, Integer> 1029 > 1030 tagNodesToTable(Vector<HTMLNode> page) 1031 { return tagNodesToTable(page, 0, -1); } 1032 1033 /** 1034 * Convenience Method. 1035 * <BR />Receives: {@code DotPair} 1036 * <BR />Invokes: {@link #tagNodesToTable(Vector, int, int)} 1037 */ 1038 public static Ret2< 1039 Hashtable<String, Integer>, 1040 Hashtable<String, Integer> 1041 > 1042 tagNodesToTable(Vector<HTMLNode> page, DotPair dp) 1043 { return tagNodesToTable(page, dp.start, dp.end + 1); } 1044 1045 /** 1046 * For each tag in HTML-5 (according to class {@link HTMLTags}, this method counts the 1047 * number of instances of each {@code TagNode} contained by a {@code Vector<HTMLNode>}. 1048 * The count is performed on nodes between the parameter-provided array-indices, and the 1049 * results are placed into two {@code Hashtable's}. 1050 * 1051 * @param page Any HTML page. 1052 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1053 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1054 * 1055 * @return The returned {@link Ret2} instance contains the following data: 1056 * 1057 * <BR /><BR /><UL CLASS=JDUL> 1058 * <LI> <B STYLE='color: red;'>{@code ret2.a}:</B> 1059 * 1060 * <BR /><BR />A {@code java.util.Hashtable} that contains one entry for each HTML-Tag 1061 * present within the page's demarcated array-indicies - {@code 'sPos'} and 1062 * {@code 'ePos'}. 1063 * 1064 * <BR /><BR />The keys in this table are Java {@code String's} that contain a 1065 * Lower-Case {@link TagNode#tok Tag-Token} (such as: "div", "p", "span", etc...). 1066 * The values in this table contain a count on <B CLASS='color: red;'>the number of 1067 * Open-Tags that were identified within the page.</B> 1068 * <BR /><BR /></LI> 1069 * 1070 * <LI> <B STYLE='color: red;'>{@code ret2.b}:</B> 1071 * 1072 * <BR /><BR />A {@code java.util.Hashtable} with counts for each and every 1073 * "Closed Tag" on the page, all in an identical manner to that which was described, 1074 * above, for {@code ret2.a} - except the counts in this table are for Closed-Tag's 1075 * rather than Open-Tag's - {@code </div>} tags, rather than {@code <DIV ...>} tags. 1076 * </LI> 1077 * 1078 * </UL> 1079 * 1080 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1081 */ 1082 public static Ret2< 1083 Hashtable<String, Integer>, 1084 Hashtable<String, Integer> 1085 > 1086 tagNodesToTable(Vector<HTMLNode> page, int sPos, int ePos) 1087 { 1088 LV l = new LV(page, sPos, ePos); 1089 TagNode tn = null; 1090 1091 Hashtable<String, Integer> openTags = new Hashtable<>(); 1092 Hashtable<String, Integer> closedTags = new Hashtable<>(); 1093 1094 // Iterates the entire page between sPos and ePos, incrementing the count for every 1095 // instance of TagNode. 1096 1097 for (int i=l.start; i < l.end; i++) 1098 { 1099 if ((tn = page.elementAt(i).ifTagNode()) == null) continue; 1100 1101 Hashtable<String, Integer> ht = tn.isClosing ? closedTags : openTags; 1102 Integer count = ht.get(tn.tok); 1103 1104 if (count == null) count = 1; 1105 else count = count + 1; 1106 1107 ht.put(tn.tok, count); 1108 } 1109 1110 return new Ret2<>(openTags, closedTags); 1111 } 1112 1113 1114 // **************************************************************************************** 1115 // **************************************************************************************** 1116 // Count New Lines 1117 // **************************************************************************************** 1118 // **************************************************************************************** 1119 1120 1121 /** 1122 * Convenience Method. 1123 * <BR />Invokes: {@link #newLines(Vector, int, int)} 1124 */ 1125 public static int newLines(Vector<? extends HTMLNode> html) 1126 { return newLines(html, 0, -1); } 1127 1128 /** 1129 * Convenience Method. 1130 * <BR />Receives: {@code DotPair} 1131 * <BR />Invokes: {@link #newLines(Vector, int, int)} 1132 */ 1133 public static int newLines(Vector<? extends HTMLNode> html, DotPair dp) 1134 { return newLines(html, dp.start, dp.end + 1); } 1135 1136 1137 /** 1138 * This will count the number of new-line symbols present <B><I>- on the partial HTML 1139 * page</I></B>. The count will include a sum of every {@code HTMLNode.str} that 1140 * contains the standard new-line symbols: {@code \r\n, \r, \n}, meaning that UNIX, MSFT, 1141 * Apple, etc. forms of text-line rendering should all be treated equally. 1142 * 1143 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1144 * 1145 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1146 * 1147 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1148 * 1149 * @return The number of new-line characters in all of the {@code HTMLNode's} that occur 1150 * between vectorized-page positions {@code 'sPos'} and {@code 'ePos.'} 1151 * 1152 * <BR /><BR /><B>NOTE:</B> The regular-expression used here 'NEWLINEP' is as follows: 1153 * 1154 * <DIV CLASS="SNIP">{@code 1155 * private static final Pattern NEWLINEP = Pattern.compile("\\r\\n|\\r|\\n"); 1156 * }</DIV> 1157 * 1158 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1159 * 1160 * @see StringParse#NEWLINEP 1161 */ 1162 public static int newLines(Vector<? extends HTMLNode> html, int sPos, int ePos) 1163 { 1164 int newLineCount = 0; 1165 LV l = new LV(html, sPos, ePos); 1166 1167 for (int i=l.start; i < l.end; i++) 1168 1169 // Uses the Torello.Java.StringParse "New Line RegEx" 1170 for ( Matcher m = StringParse.NEWLINEP.matcher(html.elementAt(i).str); 1171 m.find(); 1172 newLineCount++); 1173 1174 return newLineCount; 1175 } 1176 } 1177 1178 1179 // ******************************************************************************************** 1180 // ******************************************************************************************** 1181 // Static Inner-Class: Remove 1182 // ******************************************************************************************** 1183 // ******************************************************************************************** 1184 1185 1186 @Torello.JavaDoc.StaticFunctional 1187 public static class Remove 1188 { 1189 private Remove() { } 1190 1191 1192 // **************************************************************************************** 1193 // **************************************************************************************** 1194 // TextNode Removal Operations 1195 // **************************************************************************************** 1196 // **************************************************************************************** 1197 1198 1199 /** 1200 * Convenience Method. 1201 * <BR />Invokes: {@link #allTextNodes(Vector, int, int)} 1202 */ 1203 public static int allTextNodes(Vector<HTMLNode> page) 1204 { return allTextNodes(page, 0, -1); } 1205 1206 /** 1207 * Convenience Method. 1208 * <BR />Receives: {@code DotPair} 1209 * <BR />Invokes: {@link #allTextNodes(Vector, int, int)} 1210 */ 1211 public static int allTextNodes(Vector<HTMLNode> page, DotPair dp) 1212 { return allTextNodes(page, dp.start, dp.end + 1); } 1213 1214 /** 1215 * Takes a sub-section of an HTML {@code Vector} and removes all {@code TextNode} present 1216 * 1217 * @param page Any HTML page 1218 * 1219 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1220 * 1221 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1222 * 1223 * @return The number of HTML {@code TextNode's} that were removed 1224 * 1225 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1226 * 1227 * @see TextNode 1228 * @see #nodesOPT(Vector, int[]) 1229 */ 1230 public static int allTextNodes(Vector<HTMLNode> page, int sPos, int ePos) 1231 { 1232 IntStream.Builder b = IntStream.builder(); 1233 LV l = new LV(page, sPos, ePos); 1234 1235 // Use Java-Streams to build the list of nodes that are valid text-nodes. 1236 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTextNode()) b.add(i); 1237 1238 // Build the stream and convert it to an int[] (integer-array) 1239 int[] posArr = b.build().toArray(); 1240 1241 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1242 nodesOPT(page, posArr); 1243 1244 return posArr.length; 1245 } 1246 1247 1248 // **************************************************************************************** 1249 // **************************************************************************************** 1250 // TagNode Removal Operations 1251 // **************************************************************************************** 1252 // **************************************************************************************** 1253 1254 1255 /** 1256 * Convenience Method. 1257 * <BR />Invokes: {@link #allTagNodes(Vector, int, int)} 1258 */ 1259 public static int allTagNodes(Vector<HTMLNode> page) 1260 { return allTagNodes(page, 0, -1); } 1261 1262 /** 1263 * Convenience Method. 1264 * <BR />Receives: {@code DotPair} 1265 * <BR />Invokes: {@link #allTagNodes(Vector, int, int)} 1266 */ 1267 public static int allTagNodes(Vector<HTMLNode> page, DotPair dp) 1268 { return allTagNodes(page, dp.start, dp.end + 1); } 1269 1270 /** 1271 * Takes a sub-section of an HTML {@code Vector} and removes all {@code TagNode} present 1272 * 1273 * @param page Any HTML page 1274 * 1275 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1276 * 1277 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1278 * 1279 * @return The number of HTML {@code TagNode's} that were removed 1280 * 1281 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1282 * 1283 * @see TagNode 1284 * @see #nodesOPT(Vector, int[]) 1285 */ 1286 public static int allTagNodes(Vector<HTMLNode> page, int sPos, int ePos) 1287 { 1288 IntStream.Builder b = IntStream.builder(); 1289 LV l = new LV(page, sPos, ePos); 1290 1291 // Use Java-Streams to build the list of nodes that are valid tag-nodes. 1292 for (int i=l.start; i < l.end; i++) if (page.elementAt(i).isTagNode()) b.add(i); 1293 1294 // Build the stream and convert it to an int[] (integer-array) 1295 int[] posArr = b.build().toArray(); 1296 1297 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1298 nodesOPT(page, posArr); 1299 1300 return posArr.length; 1301 } 1302 1303 1304 // **************************************************************************************** 1305 // **************************************************************************************** 1306 // CommentNode Removal Operations 1307 // **************************************************************************************** 1308 // **************************************************************************************** 1309 1310 1311 /** 1312 * Convenience Method. 1313 * <BR />Invokes: {@link #allCommentNodes(Vector, int, int)} 1314 */ 1315 public static int allCommentNodes(Vector<HTMLNode> page) 1316 { return allCommentNodes(page, 0, -1); } 1317 1318 /** 1319 * Convenience Method. 1320 * <BR />Receives: {@code DotPair} 1321 * <BR />Invokes: {@link #allCommentNodes(Vector, int, int)} 1322 */ 1323 public static int allCommentNodes(Vector<HTMLNode> page, DotPair dp) 1324 { return allCommentNodes(page, dp.start, dp.end + 1); } 1325 1326 /** 1327 * Takes a sub-section of an HTML {@code Vector} and removes all {@code CommentNode} 1328 * present 1329 * 1330 * @param page Any HTML page 1331 * 1332 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1333 * 1334 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1335 * 1336 * @return The number of HTML {@code CommentNode's} that were removed 1337 * 1338 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1339 * 1340 * @see CommentNode 1341 * @see #nodesOPT(Vector, int[]) 1342 */ 1343 public static int allCommentNodes(Vector<HTMLNode> page, int sPos, int ePos) 1344 { 1345 IntStream.Builder b = IntStream.builder(); 1346 LV l = new LV(page, sPos, ePos); 1347 1348 // Use Java-Streams to build the list of nodes that are valid comment-nodes. 1349 for (int i=l.start; i < l.end; i++) 1350 if (page.elementAt(i).isCommentNode()) 1351 b.add(i); 1352 1353 // Build the stream and convert it to an int[] (integer-array) 1354 int[] posArr = b.build().toArray(); 1355 1356 // The integer array is guaranteed to be sorted, and contain valid vector-indices. 1357 nodesOPT(page, posArr); 1358 1359 return posArr.length; 1360 } 1361 1362 1363 // **************************************************************************************** 1364 // **************************************************************************************** 1365 // Remove All Inner Tags 1366 // **************************************************************************************** 1367 // **************************************************************************************** 1368 1369 1370 /** 1371 * Convenience Method. 1372 * <BR />Invokes: {@link #allInnerTags(Vector, int, int)} 1373 */ 1374 public static int allInnerTags(Vector<HTMLNode> html) 1375 { return allInnerTags(html, 0, -1); } 1376 1377 /** 1378 * Convenience Method. 1379 * <BR />Receives: {@code DotPair} 1380 * <BR />Invokes: {@link #allInnerTags(Vector, int, int)} 1381 */ 1382 public static int allInnerTags(Vector<? super TagNode> html, DotPair dp) 1383 { return allInnerTags(html, dp.start, dp.end + 1); } 1384 1385 /** 1386 * This method removes all inner-tags (all attributes) from every {@link TagNode} inside of 1387 * an HTML page. It does this by replacing every {@code TagNode} in the {@code Vector} 1388 * with the pre-instantiated, publicly-available {@code TagNode} which can be obtained by a 1389 * call to the class {@code HTMLTags.hasTag(token, TC)}. 1390 * 1391 * <BR /><BR /><B CLASS=JDDescLabel>Replacing {@code TagNode's:}</B> 1392 * 1393 * <BR />This method determines whether a fresh {@link TagNode} is to be inserted by 1394 * measuring the length of the internal {@link TagNode#str} field (a {@code String} field). 1395 * If the length {@code TagNode.str} is not equal to the HTML token {@link TagNode#tok} 1396 * length <B><I>plus 2</I></B>, then a fresh, pre-instantiated, node is replaced. 1397 * 1398 * <BR /><BR />The {@code '+2'} figure comes from the additional characters {@code '<'} and 1399 * {@code '>'} that start and end every HTML {@code TagNode} 1400 * 1401 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1402 * 1403 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1404 * 1405 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1406 * 1407 * @return The number of {@code TagNode} elements that have were replaced with 1408 * zero-attribute HTML Element Tags. 1409 * 1410 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1411 * 1412 * @throws ClassCastException If {@code 'html'} contains references that do not inherit 1413 * {@code HTMLNode}. 1414 */ 1415 @SuppressWarnings("unchecked") 1416 public static int allInnerTags(Vector<? super TagNode> html, int sPos, int ePos) 1417 { 1418 int ret = 0; 1419 LV l = new LV(sPos, ePos, html); 1420 TagNode tn; 1421 1422 for (int i = (l.end-1); i >= l.start; i--) 1423 1424 if ((tn = ((HTMLNode) html.elementAt(i)).openTagPWA()) != null) 1425 1426 { 1427 ret++; 1428 1429 // HTMLTags.hasTag(tok, TC) gets an empty and pre-instantiated TagNode, 1430 // where TagNode.tok == 'tn.tok' and TagNode.isClosing = false 1431 1432 html.setElementAt(HTMLTags.hasTag(tn.tok, TC.OpeningTags), i); 1433 } 1434 1435 return ret; 1436 } 1437 1438 1439 // **************************************************************************************** 1440 // **************************************************************************************** 1441 // Style-Node & Script-Node Block Removal Operations 1442 // **************************************************************************************** 1443 // **************************************************************************************** 1444 1445 1446 /** 1447 * Removes all HTML {@code 'style'} Node blocks. 1448 * 1449 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1450 * 1451 * @return The number of {@code <STYLE>}-Node Blocks that were removed 1452 */ 1453 public static int styleNodeBlocks(Vector<? extends HTMLNode> html) 1454 { 1455 int removeCount = 0; 1456 1457 while (TagNodeRemoveInclusive.first(html, "style") > 0) removeCount++; 1458 1459 return removeCount; 1460 } 1461 1462 /** 1463 * Removes all {@code 'script'} Node blocks. 1464 * 1465 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1466 * 1467 * @return The number of {@code SCRIPT}-Node Blocks that were removed 1468 */ 1469 public static int scriptNodeBlocks(Vector<? extends HTMLNode> html) 1470 { 1471 int removeCount = 0; 1472 1473 while (TagNodeRemoveInclusive.first(html, "script") > 0) removeCount++; 1474 1475 return removeCount; 1476 } 1477 1478 1479 // **************************************************************************************** 1480 // **************************************************************************************** 1481 // Remove a Sub-Range of nodes 1482 // **************************************************************************************** 1483 // **************************************************************************************** 1484 1485 1486 /** 1487 * Java's {@code java.util.Vector} class does not allow public access to the 1488 * {@code removeRange(start, end)} function. It is protected in Java's Documentation about 1489 * the {@code Vector} class. This method does exactly that, nothing else. 1490 * 1491 * @param page Any Java HTML page, constructed of {@code HTMLNode (TagNode & TextNode)} 1492 * 1493 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1494 * 1495 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1496 * 1497 * @return the number of nodes removed. 1498 * 1499 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1500 * 1501 * @see #pollRange(Vector, int, int) 1502 * @see #range(Vector, DotPair) 1503 */ 1504 public static <T extends HTMLNode> int range(Vector<T> page, int sPos, int ePos) 1505 { 1506 // Torello.Java.LV 1507 LV l = new LV(sPos, ePos, page); 1508 1509 // According to the Sun-Oracle Docs, the returned sublist "mirros" the original vector, 1510 // which means that when it is changed, so is the original vector. 1511 1512 page.subList(l.start, l.end).clear(); 1513 1514 return l.size(); 1515 } 1516 1517 /** 1518 * Convenience Method. 1519 * <BR />Receives: {@code DotPair} 1520 * <BR />Invokes: {@link #range(Vector, int, int)} 1521 */ 1522 public static int range(Vector<? extends HTMLNode> html, DotPair dp) 1523 { return range(html, dp.start, dp.end + 1); } 1524 1525 1526 // **************************************************************************************** 1527 // **************************************************************************************** 1528 // Remove Specified Nodes by Vector-Index 1529 // **************************************************************************************** 1530 // **************************************************************************************** 1531 1532 1533 /** 1534 * <SPAN STYLE="color: red;"><B>OPT: Optimized</B></SPAN> 1535 * 1536 * <BR /><BR />This method does the same thing as 1537 * {@link Remove#nodes(boolean, Vector, int[])}, but all error checking is skipped, and the 1538 * input integer array is presumed to have been sorted. There are no guarantees about the 1539 * behavior of this method if the input array {@code 'posArr'} is not sorted, 1540 * <I>least-to-greatest,</I> or if there are duplicate or negative values in this array. 1541 * 1542 * <BR /><BR /><B CLASS=JDDescLabel>Empty Var-Args:</B> 1543 * 1544 * <BR />If the var-args input integer-array parameter is empty, this method shall exit 1545 * gracefully (and immediately). 1546 * 1547 * @param page Any HTML-Page, usually ones generated by {@code HTMLPage.getPageTokens}, but 1548 * these may be obtained or created in any fashion so necessary. 1549 * 1550 * @param posArr An array of integers which list/identify the nodes in the page to be 1551 * removed. Because this implementation has been optimized, no error checking will be 1552 * performed on this input. It is presumed to be sorted, least-to-greatest, and that all 1553 * values in the array are valid-indices into the vectorized-html parameter {@code 'page'} 1554 */ 1555 public static <T extends HTMLNode> void nodesOPT(Vector<T> page, int... posArr) 1556 { 1557 if (posArr.length == 0) return; 1558 1559 int endingInsertPos = page.size() - posArr.length; 1560 int posArrIndex = 0; 1561 int insertPos = posArr[0]; 1562 int retrievePos = posArr[0]; 1563 1564 // There is very little that can be documented about these two loops. Took 3 hours 1565 // to figure out. Read the variables names for "best documentation" 1566 1567 while (insertPos < endingInsertPos) 1568 { 1569 // This inner-loop is necessary for when the posArr has consecutive-elements that 1570 // are *ALSO* consecutive-pointers. 1571 // 1572 // For instance, this invokation: 1573 // Util.removeNodes(page, 4, 5, 6); ... 1574 // where 4, 5, and 6 are consecutive - the inner while-loop is required. 1575 // 1576 // For this invokation: 1577 // Util.removeNodes(page, 2, 4, 6); 1578 // the inner-loop is not entered. 1579 1580 while ((posArrIndex < posArr.length) && (retrievePos == posArr[posArrIndex])) 1581 { retrievePos++; posArrIndex++; } 1582 1583 page.setElementAt(page.elementAt(retrievePos++), insertPos++); 1584 } 1585 1586 // Remove all remaining elements in the tail of the array. 1587 page.setSize(page.size() - posArr.length); 1588 } 1589 1590 1591 /** 1592 * This method remove each HTMLNode from the passed-parameter {@code 'page'} 1593 * listed/identified by the input array {@code 'nodeList'}. 1594 * 1595 * <BR /><BR /><B CLASS=JDDescLabel>Empty Var-Args:</B> 1596 * 1597 * <BR />If the var-args input integer-array parameter is empty, this method shall exit 1598 * gracefully (and immediately). 1599 * 1600 * @param preserveInputArray This is a convenience input parameter that allows a programmer 1601 * to "preserve" the original input-parameter integer-array that is passed to this method. 1602 * It could be argued this parameter is "superfluous" - however, keep in mind that the 1603 * passed parameter {@code 'nodeList'} <B><I>must be sorted</I></B> before this method is 1604 * able function properly. There is a sort that's performed within the body of this method. 1605 * Just in case that the original order of the integer-array input-parameter must be 1606 * preserved, its possible to request for the sort to operate on "a clone" of the 1607 * input-parameter integer-array, instead of the original integer-array {@code 'nodeList'} 1608 * itself. 1609 * 1610 * @param page Any HTML-Page, usually ones generated by 1611 * {@code HTMLPage.getPageTokens(...)}, but these may be obtained or created in any fashion 1612 * so necessary. 1613 * 1614 * @param nodeList An array of integers which list/identify the nodes in the page to be 1615 * removed. 1616 * 1617 * @throws IllegalArgumentException If the {@code 'nodeList'} contains duplicate entries. 1618 * Obviously, no {@code HTMLNode} may be removed from the {@code Vector<HTMLNode>} more 1619 * than once. 1620 * 1621 * @throws IndexOutOfBoundsException If the nodeList contains index-pointers / items that 1622 * are not within the bounds of the passed HTML-Page {@code Vector}. 1623 */ 1624 public static <T extends HTMLNode> void nodes 1625 (boolean preserveInputArray, Vector<T> page, int... nodeList) 1626 { 1627 if (nodeList.length == 0) return; 1628 1629 // @Safe Var Args 1630 int[] posArr = preserveInputArray ? nodeList.clone() : nodeList; 1631 int len = posArr.length; 1632 1633 Arrays.sort(posArr); 1634 1635 // Check for duplicates in the nodeList, no HTMLNode may be removed twice! 1636 for (int i=0; i < (len - 1); i++) 1637 1638 if (posArr[i] == posArr[i+1]) throw new IllegalArgumentException( 1639 "The input array contains duplicate items, this is not allowed.\n" + 1640 "This is since each array-entry is intended to be a pointer/index for items " + 1641 "to be removed.\nNo item can possibly be removed twice.!" 1642 ); 1643 1644 // Make sure all nodes are within the bounds of the original Vector. (no negative 1645 // indexes, no indexes greater than the size of the Vector) 1646 1647 if ((posArr[0] < 0) || (posArr[len - 1] >= page.size())) 1648 1649 throw new IndexOutOfBoundsException ( 1650 "The input array contains entries which are not within the bounds of the " + 1651 "original-passed Vector.\nHTMLPage Vector has: " + page.size() + 1652 " elements.\n" + 1653 "Maximum element in the nodeList is [" + posArr[len - 1] + "], and the " + 1654 "minimum element is: [" + posArr[0] + "]" 1655 ); 1656 1657 int endingInsertPos = page.size() - posArr.length; 1658 int posArrIndex = 0; 1659 int insertPos = posArr[0]; 1660 int retrievePos = posArr[0]; 1661 1662 // There is very little that can be documented about these two loops. Took 3 hours 1663 // to figure out. Read the variables names for "best documentation" 1664 1665 while (insertPos < endingInsertPos) 1666 { 1667 // This inner-loop is necessary for when the posArr has consecutive-elements that 1668 // are *ALSO* consecutive-pointers. 1669 // 1670 // For instance, this invocation: 1671 // Util.removeNodes(page, 4, 5, 6); 1672 // where 4, 5, and 6 are consecutive - the inner while-loop is required. 1673 // 1674 // For this invocation: 1675 // Util.removeNodes(page, 2, 4, 6); 1676 // the inner-loop is not entered. 1677 1678 while ((posArrIndex < posArr.length) && (retrievePos == posArr[posArrIndex])) 1679 { retrievePos++; posArrIndex++; } 1680 1681 page.setElementAt(page.elementAt(retrievePos++), insertPos++); 1682 } 1683 1684 // Remove all remaining elements in the tail of the array. 1685 page.setSize(page.size() - posArr.length); 1686 } 1687 1688 1689 // **************************************************************************************** 1690 // **************************************************************************************** 1691 // Inclusive-Empty Removal Operations 1692 // **************************************************************************************** 1693 // **************************************************************************************** 1694 1695 1696 /** 1697 * Convenience Method. 1698 * <BR />Invokes: {@link #inclusiveEmpty(Vector, int, int, String[])} 1699 */ 1700 public static int inclusiveEmpty(Vector<HTMLNode> page, String... htmlTags) 1701 { return inclusiveEmpty(page, 0, -1, htmlTags); } 1702 1703 /** 1704 * Convenience Method. 1705 * <BR />Receives: {@code DotPair} 1706 * <BR />Invokes: {@link #inclusiveEmpty(Vector, int, int, String[])} 1707 */ 1708 public static int inclusiveEmpty(Vector<HTMLNode> page, DotPair dp, String... htmlTags) 1709 { return inclusiveEmpty(page, dp.start, dp.end + 1, htmlTags); } 1710 1711 /** 1712 * This will do an "Inclusive Search" using the standard class 1713 * {@link TagNodeInclusiveIterator} in the {@code package NodeSearch}. Then it will 1714 * inspect the contents of the subsections. Any subsections that do not contain any 1715 * instances of {@code HTMLNode} in between them, or any subsections that only contain 1716 * "blank-text" (white-space) between them shall be removed. 1717 * 1718 * <BR /><BR /><B CLASS=JDDescLabel>Recursive Method:</B> 1719 * 1720 * <BR />The search logic shall perform multiple <I><B>recursive iterations</B></I> of 1721 * itself, such that if, for instance, the user requested that all empty HTML divider 1722 * ({@code <DIV>}) elements be removed, if after removing a set a dividers resulted in more 1723 * empty ones (nested {@code <DIV>} elements), then an additional removal shall be called. 1724 * <I>This recursion shall continue until there are no empty HTML elements of the types 1725 * listed by</I> {@code 'htmlTags'} 1726 * 1727 * @param page Any vectorized-html page or sub-page. 1728 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 1729 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 1730 * 1731 * @param htmlTags The list of <I>inclusive</I> (non-singleton) html elements to search for 1732 * possibly being empty container tags. 1733 * 1734 * @return The number of {@code HTMLNode's} that were removed. 1735 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 1736 */ 1737 public static int inclusiveEmpty 1738 (Vector<HTMLNode> page, int sPos, int ePos, String... htmlTags) 1739 { 1740 DotPair subList; 1741 1742 int removed = 0; 1743 HNLIInclusive iter = TagNodeInclusiveIterator.iter(page, htmlTags); 1744 LV l = new LV(page, sPos, ePos); 1745 1746 iter.restrictCursor(l); 1747 1748 TOP: 1749 while (iter.hasNext()) 1750 1751 // If there is only the opening & closing pair, with nothing in between, 1752 // then the pair must be removed because it is "Empty" (Inclusive Empty) 1753 1754 if ((subList = iter.nextDotPair()).size() == 2) 1755 { 1756 iter.remove(); 1757 ePos -= subList.size(); 1758 removed += subList.size(); 1759 } 1760 1761 else 1762 { 1763 // If there is any TagNode in between the start-end pair, then this is NOT 1764 // EMPTY. In this case, skip to the next start-end opening-closing pair. 1765 1766 for (int i=(subList.start + 1); i < subList.end; i++) 1767 if (! page.elementAt(i).isTextNode()) 1768 continue TOP; 1769 1770 // If there were only TextNode's between an opening-closing TagNode Pair.... 1771 // **AND** those TextNode's are only white-space, then this also considered 1772 // Inclusively Empty. (Get all TextNode's, and if .trim() reduces the length() 1773 // to zero, then it was only white-space. 1774 1775 if (Util.textNodesString(page, subList).trim().length() == 0) 1776 { 1777 iter.remove(); 1778 ePos -= subList.size(); 1779 removed += subList.size(); 1780 } 1781 } 1782 1783 // This process must be continued recursively, because if any inner, for instance, 1784 // <DIV> ... </DIV> was removed, then the outer list must be re-checked... 1785 1786 if (removed > 0) 1787 return removed + Remove.inclusiveEmpty(page, sPos, ePos, htmlTags); 1788 else 1789 return 0; 1790 } 1791 1792 1793 // **************************************************************************************** 1794 // **************************************************************************************** 1795 // Miscellaneous Removal Operations 1796 // **************************************************************************************** 1797 // **************************************************************************************** 1798 1799 1800 /** 1801 * Removes the first and last element of a vectorized-HTML web-page, or sub-page. 1802 * Generally, this could be used to remove the surrounding tag's {@code '<DIV>'} ... 1803 * {@code '</DIV>'}, or something similar. 1804 * 1805 * <BR /><BR />This method <B STYLE="color: red;">WILL NOT CHECK</B> whether there are 1806 * matching HTML open-and-close tags at the end beginning and end of this sub-section. 1807 * Generally, though, that is how this method is intended to be used. 1808 * 1809 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1810 * 1811 * @throws IllegalArgumentException If the {@code Vector} has fewer than two elements. 1812 */ 1813 public static void firstLast(Vector<? extends HTMLNode> html) 1814 { 1815 int size = html.size(); 1816 1817 if (size < 2) throw new IllegalArgumentException( 1818 "You have requested that the first and last elements the input 'page' parameter " + 1819 "(a vector) be removed. However, the vector size is only [" + size + "], so " + 1820 "this cannot be performed." 1821 ); 1822 1823 // NOTE: *** This removes elementAt(0) and elementAt(size-1) 1824 // *** NOT ALL ELEMENTS BETWEEN 0 and (size-1) 1825 1826 Util.Remove.nodesOPT(html, 0, size-1); 1827 } 1828 1829 } 1830 1831 1832 // ******************************************************************************************** 1833 // ******************************************************************************************** 1834 // Static Inner-Class: Inclusive 1835 // ******************************************************************************************** 1836 // ******************************************************************************************** 1837 1838 1839 /** 1840 * Tools for finding the matching-closing tag of any open {@link TagNode}. 1841 * 1842 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=UTILINCL> 1843 */ 1844 @Torello.JavaDoc.StaticFunctional 1845 public static class Inclusive 1846 { 1847 private Inclusive() { } 1848 1849 1850 // **************************************************************************************** 1851 // **************************************************************************************** 1852 // Inclusive Find/Get 1853 // **************************************************************************************** 1854 // **************************************************************************************** 1855 1856 /** 1857 * This finds the closing HTML {@code 'TagNode'} match for a given opening 1858 * {@code 'TagNode'} in a given-input html page or sub-section. 1859 * 1860 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 1861 * 1862 * @param nodeIndex An index into that {@code Vector}. This index must point to an 1863 * {@code HTMLNode} element that is: 1864 * 1865 * <BR /><BR /><OL CLASS=JDOL> 1866 * <LI>An instance of {@code TagNode}</LI> 1867 * <LI>A {@code TagNode} whose {@code 'isClosing'} field is {@code FALSE}</LI> 1868 * <LI>Is not a {@code 'singleton'} HTML element-token 1869 * (i.e. {@code <IMG>, <BR>, <H1>, etc...}) 1870 * </LI> 1871 * </OL> 1872 * 1873 * @return An "inclusive search" finds {@code OpeningTag} and {@code ClosingTag} pairs - 1874 * <I>and returns all the elements between them in the contents of a 1875 * return-{@code Vector}, or {@code Vector DotPair}-end-point value</I>. This method 1876 * will take a particular node of a {@code Vector}, and (as long it has a match) 1877 * find it's <I><B>closing {@code HTMLNode} match.</B></I> The integer returned will 1878 * be the index into this page of the closing, matching {@code TagNode.} 1879 * 1880 * @throws TagNodeExpectedException If the node in the {@code Vector}-parameter 1881 * {@code 'html'} contained at index {@code 'nodeIndex'} is not an instance of 1882 * {@code TagNode}, then this exception is thrown. 1883 * 1884 * @throws OpeningTagNodeExpectedException If the node in the {@code Vector}-parameter 1885 * {@code 'html'} at index {@code 'nodeIndex'} is a closing version of the HTML element, 1886 * then this exception shall throw. 1887 * 1888 * @throws InclusiveException If the node in {@code Vector}-parameter {@code 'html'}, 1889 * pointed-to by index {@code 'nodeIndex'} is an HTML {@code 'Singleton'} / Self-Closing 1890 * Tag, then this exception will be thrown. 1891 * 1892 * @see TagNode 1893 * @see TagNode#tok 1894 * @see TagNode#isClosing 1895 * @see HTMLNode 1896 */ 1897 public static int find(Vector<? extends HTMLNode> html, int nodeIndex) 1898 { 1899 TagNode tn = null; 1900 HTMLNode n = null; 1901 String tok = null; 1902 1903 if (! html.elementAt(nodeIndex).isTagNode()) 1904 1905 throw new TagNodeExpectedException ( 1906 "You have attempted to find a closing tag to match an opening one, " + 1907 "but the 'nodeIndex' (" + nodeIndex + ") you have passed doesn't contain " + 1908 "an instance of TagNode." 1909 ); 1910 1911 else tn = (TagNode) html.elementAt(nodeIndex); 1912 1913 if (tn.isClosing) throw new OpeningTagNodeExpectedException( 1914 "The TagNode indicated by 'nodeIndex' = " + nodeIndex + " has its 'isClosing' " + 1915 "boolean as TRUE - this is not an opening TagNode, but it must be to continue." 1916 ); 1917 1918 // Checks to ensure this token is not a 'self-closing' or 'singleton' tag. 1919 // If it is an exception shall throw. 1920 InclusiveException.check(tok = tn.tok); 1921 1922 int end = html.size(); 1923 int openCount = 1; 1924 1925 for (int pos = (nodeIndex+1); pos < end; pos++) 1926 1927 if ((n = html.elementAt(pos)).isTagNode()) 1928 if ((tn = ((TagNode) n)).tok.equals(tok)) 1929 { 1930 // This keeps a "Depth Count" - where "depth" is just the number of 1931 // opened tags, for which a matching, closing tag hasn't been found yet. 1932 1933 openCount += (tn.isClosing ? -1 : 1); 1934 1935 // When all open-tags of the specified HTML Element 'tok' have been 1936 // found, search has finished. 1937 1938 if (openCount == 0) return pos; 1939 } 1940 1941 // The closing-matching tag was not found 1942 return -1; 1943 } 1944 1945 /** 1946 * Convenience Method. 1947 * <BR />Invokes: {@link #find(Vector, int)} 1948 * <BR />Converts: output to <B><CODE>'GET'</CODE></B> format ({@code Vector}-sublist) 1949 * <BR />Using: {@link Util#cloneRange(Vector, int, int)} 1950 */ 1951 public static Vector<HTMLNode> get(Vector<? extends HTMLNode> html, int nodeIndex) 1952 { 1953 int endPos = find(html, nodeIndex); 1954 1955 return (endPos == -1) ? null : cloneRange(html, nodeIndex, endPos + 1); 1956 } 1957 1958 /** 1959 * Convenience Method. 1960 * <BR />Invokes: {@link #find(Vector, int)} 1961 * <BR />Converts: output to <B><CODE>'PEEK'</CODE></B> format ({@code SubSection}) 1962 * <BR />Using: {@link Util#cloneRange(Vector, int, int)} 1963 */ 1964 public static SubSection peek(Vector<? extends HTMLNode> html, int nodeIndex) 1965 { 1966 int endPos = find(html, nodeIndex); 1967 1968 return (endPos == -1) ? null : new SubSection( 1969 new DotPair(nodeIndex, endPos), 1970 cloneRange(html, nodeIndex, endPos + 1) 1971 ); 1972 } 1973 1974 /** 1975 * Convenience Method. 1976 * <BR />Invokes: {@link #find(Vector, int)} 1977 * <BR />Converts: output to <B><CODE>'POLL'</CODE></B> format ({@code Vector}-sublist), 1978 * <BR />Using: {@link Util#pollRange(Vector, int, int)} 1979 * <BR />Removes: The requested Sub-List 1980 */ 1981 public static Vector<HTMLNode> poll(Vector<? extends HTMLNode> html, int nodeIndex) 1982 { 1983 int endPos = find(html, nodeIndex); 1984 1985 return (endPos == -1) ? null : pollRange(html, nodeIndex, endPos + 1); 1986 } 1987 1988 /** 1989 * Convenience Method. 1990 * <BR />Invokes: {@link #find(Vector, int)} 1991 * <BR />Converts: output to <B><CODE>'REMOVE'</CODE></B> format ({@code int} - number 1992 * of nodes removed) 1993 * <BR />Using: {@link Remove#range(Vector, int, int)} 1994 * <BR />Removes: The requested Sub-List 1995 */ 1996 public static int remove(Vector<? extends HTMLNode> html, int nodeIndex) 1997 { 1998 int endPos = find(html, nodeIndex); 1999 2000 return (endPos == -1) ? 0 : Util.Remove.range(html, nodeIndex, endPos + 1); 2001 } 2002 2003 2004 // **************************************************************************************** 2005 // **************************************************************************************** 2006 // Optimized Methods, Inclusive Find/Get/Subsection 2007 // **************************************************************************************** 2008 // **************************************************************************************** 2009 2010 /** 2011 * Convenience Method. 2012 * <BR />Invokes: {@link #dotPairOPT(Vector, int)} 2013 * <BR />Converts: output to {@code Vector<HTMLNode>} 2014 */ 2015 public static Vector<HTMLNode> vectorOPT(Vector<? extends HTMLNode> html, int tagPos) 2016 { 2017 DotPair dp = dotPairOPT(html, tagPos); 2018 2019 if (dp == null) return null; 2020 else return Util.cloneRange(html, dp.start, dp.end + 1); 2021 } 2022 2023 /** 2024 * Convenience Method. 2025 * <BR />Invokes: {@link #dotPairOPT(Vector, int)} 2026 * <BR />Converts: output to {@code SubSection} 2027 */ 2028 public static SubSection subSectionOPT(Vector<? extends HTMLNode> html, int tagPos) 2029 { 2030 DotPair dp = dotPairOPT(html, tagPos); 2031 2032 if (dp == null) return null; 2033 else return new SubSection(dp, Util.cloneRange(html, dp.start, dp.end + 1)); 2034 } 2035 2036 /** 2037 * 2038 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILIOPT> 2039 * <!-- Inclusive Opt Description --> 2040 * 2041 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 2042 * @param tagPos <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTTP> 2043 * @return A <B>'DotPair'</B> version of an inclusive, end-to-end HTML tag-element. 2044 * 2045 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTJSN> 2046 * <!-- Note on JS-DOM Tree innerHTML --> 2047 * 2048 * @see TagNode 2049 * @see TagNode#isClosing 2050 * @see TagNode#tok 2051 * @see DotPair 2052 */ 2053 public static DotPair dotPairOPT(Vector<? extends HTMLNode> html, int tagPos) 2054 { 2055 // Temp Variables 2056 HTMLNode n; TagNode tn; int openCount = 1; 2057 2058 int len = html.size(); 2059 2060 // This is the name (token) of the "Opening HTML Element", we are searching for 2061 // the matching, closing element 2062 2063 String tok = ((TagNode) html.elementAt(tagPos)).tok; 2064 2065 for (int i = (tagPos+1); i < len; i++) 2066 2067 if ((n = html.elementAt(i)).isTagNode()) 2068 if ((tn = (TagNode) n).tok.equals(tok)) 2069 { 2070 // This keeps a "Depth Count" - where "depth" is just the number of 2071 // opened tags, for which a matching, closing tag hasn't been found yet. 2072 2073 openCount += (tn.isClosing ? -1 : 1); 2074 2075 // When all open-tags of the specified HTML Element 'tok' have been 2076 // found, search has finished. 2077 2078 if (openCount == 0) return new DotPair(tagPos, i); 2079 } 2080 2081 // Was not found 2082 return null; 2083 } 2084 2085 /** 2086 * Convenience Method. 2087 * <BR />Invokes: {@link #dotPairOPT(Vector, int, int)} 2088 * <BR />Converts: output to {@code Vector<HTMLNode>} 2089 */ 2090 public static Vector<HTMLNode> vectorOPT 2091 (Vector<? extends HTMLNode> html, int tagPos, int end) 2092 { 2093 DotPair dp = dotPairOPT(html, tagPos, end); 2094 2095 if (dp == null) return null; 2096 else return Util.cloneRange(html, dp.start, dp.end + 1); 2097 } 2098 2099 /** 2100 * Convenience Method. 2101 * <BR />Invokes: {@link #dotPairOPT(Vector, int, int)} 2102 * <BR />Converts: output to {@code SubSection} 2103 */ 2104 public static SubSection subSectionOPT 2105 (Vector<? extends HTMLNode> html, int tagPos, int end) 2106 { 2107 DotPair dp = dotPairOPT(html, tagPos, end); 2108 2109 if (dp == null) return null; 2110 else return new SubSection(dp, Util.cloneRange(html, dp.start, dp.end + 1)); 2111 } 2112 2113 /** 2114 * 2115 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILIOPT> 2116 * <!-- Inclusive Opt Description --> 2117 * 2118 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 2119 * @param tagPos <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTTP> 2120 * @param end <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTEND> 2121 * 2122 * @return A <B>'DotPair'</B> version of an inclusive, end-to-end HTML tag-element. 2123 * 2124 * <EMBED CLASS='external-html' DATA-FILE-ID=UTILOPTJSN> 2125 * <!-- Note on JS-DOM Tree innerHTML --> 2126 * 2127 * @see TagNode 2128 * @see TagNode#isClosing 2129 * @see TagNode#tok 2130 * @see DotPair 2131 */ 2132 public static DotPair dotPairOPT(Vector<? extends HTMLNode> html, int tagPos, int end) 2133 { 2134 // Temp Variables 2135 HTMLNode n; TagNode tn; int openCount = 1; int endPos; 2136 2137 // This is the name (token) of the "Opening HTML Element", we are searching for 2138 // the matching, closing element 2139 String tok = ((TagNode) html.elementAt(tagPos)).tok; 2140 2141 for (endPos = (tagPos+1); endPos < end; endPos++) 2142 2143 if ((n = html.elementAt(endPos)).isTagNode()) 2144 if ((tn = (TagNode) n).tok.equals(tok)) 2145 { 2146 // This keeps a "Depth Count" - where "depth" is just the number of 2147 // opened tags, for which a matching, closing tag hasn't been found yet. 2148 openCount += (tn.isClosing ? -1 : 1); 2149 2150 // When all open-tags of the specified HTML Element 'tok' have been 2151 // found, search has finished. 2152 if (openCount == 0) return new DotPair(tagPos, endPos); 2153 } 2154 2155 // The end of the vectorized-html page (or subsection) was reached, but the 2156 // matching-closing element was not found. 2157 return null; // assert(endPos == html.size()); 2158 } 2159 } 2160}