001package Torello.HTML; 002 003import java.util.stream.*; 004import java.util.*; 005 006import Torello.Java.LV; 007import Torello.Java.StrCSV; 008import Torello.Java.ExceptionCheckError; 009 010/** 011 * A simple utility class that, used ubiquitously throughout Java HTML, which maintains two integer 012 * fields - <CODE><B><A HREF='#start'>DotPai.start</A></B></CODE> and 013 * <CODE><B><A HREF='#end'>DotPai.end</A></B></CODE> , for demarcating the begining and ending 014 * of a sub-list within an HTML web-page. 015 * 016 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=DOT_PAIR> 017 * 018 * @see NodeIndex 019 * @see SubSection 020 */ 021public final class DotPair 022implements java.io.Serializable, Comparable<DotPair>, Cloneable, Iterable<Integer> 023{ 024 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 025 public static final long serialVersionUID = 1; 026 027 /** 028 * This is intended to be the "starting index" into an sub-array of an HTML {@code Vector} of 029 * {@code HTMLNode} elements. 030 */ 031 public final int start; 032 033 /** 034 * This is intended to be the "ending index" into a sub-array of an HTML {@code Vector} of 035 * {@code HTMLNode} elements. 036 */ 037 public final int end; 038 039 040 // ******************************************************************************************** 041 // ******************************************************************************************** 042 // Constructor 043 // ******************************************************************************************** 044 // ******************************************************************************************** 045 046 047 /** 048 * This constructor takes two integers and saves them into the {@code public} member fields. 049 * 050 * @param start This is intended to store the starting position of a vectorized-webpage 051 * sub-list or subpage. 052 * 053 * @param end This will store the ending position of a vectorized-html webpage or subpage. 054 * 055 * @throws IndexOutOfBoundsException A negative {@code 'start'} or {@code 'end'} 056 * parameter-value will cause this exception throw. 057 * 058 * @throws IllegalArgumentException A {@code 'start'} parameter-value that is larger than the 059 * {@code 'end'} parameter will cause this exception throw. 060 * 061 * @see NodeIndex 062 * @see SubSection 063 */ 064 public DotPair(int start, int end) 065 { 066 if (start < 0) throw new IndexOutOfBoundsException 067 ("Negative start value passed to DotPair constructor: start = " + start); 068 069 if (end < 0) throw new IndexOutOfBoundsException 070 ("Negative ending value passed to DotPair constructor: end = " + end); 071 072 if (end < start) throw new IllegalArgumentException( 073 "Start-parameter value passed to constructor is greater than ending-parameter: " + 074 "start: [" + start + "], end: [" + end + ']' 075 ); 076 077 this.start = start; 078 this.end = end; 079 } 080 081 /** 082 * Creates a new instance that has been shifted by {@code 'delta'}. 083 * 084 * @param delta The number of array indices to shift {@code 'this'} intance. This parameter 085 * may be negative, and if so, {@code 'this'} will be shifted left, instead of right. 086 * 087 * @return A new, shifted, instance of {@code 'this'} 088 */ 089 public DotPair shift(int delta) 090 { return new DotPair(this.start + delta, this.end + delta); } 091 092 093 // ******************************************************************************************** 094 // ******************************************************************************************** 095 // Standard Java Methods 096 // ******************************************************************************************** 097 // ******************************************************************************************** 098 099 100 /** 101 * Implements the standard java {@code 'hashCode()'} method. This will provide a hash-code 102 * that is likely to avoid crashes. 103 * 104 * @return A hash-code that may be used for inserting {@code 'this'} instance into a hashed 105 * table, map or list. 106 */ 107 public int hashCode() 108 { return this.start + (1000 * this.end); } 109 110 /** 111 * The purpose of this is to remind the user that the array bounds are inclusive at <B>BOTH</B> 112 * ends of the sub-list. 113 * 114 * <BR /><BR /><B CLASS=JDDescLabel>Inclusive & Exclusive:</B> 115 * 116 * <BR />For an instance of {@code 'DotPair'}, the intention is to include both the 117 * characters located at the {@code Vector}-index positions {@link #start} and the one at 118 * {@link #end}. Specifically, (and unlike many of the {@code Node-Search} package methods) 119 * both of the internal fields to this class are <B STYLE='color: red'><I>inclusive</I></B>, 120 * rather than exclusive. 121 * 122 * <BR /><BR />For many of the search methods in package {@link Torello.HTML.NodeSearch}, the 123 * {@code 'ePos'} parameters are always <B STYLE='color: red'><I>exclusive</I></B> - meaning 124 * the character at {@code Vector}=index {@code 'ePos'} is not included in the search. 125 * 126 * @return The length of a sub-array that would be indicated by this dotted pair. 127 */ 128 public int size() { return this.end - this.start + 1; } 129 130 /** 131 * Java's {@code toString()} requirement. 132 * 133 * @return A string representing 'this' instance of DotPair. 134 */ 135 public String toString() { return "[" + start + ", " + end + "]"; } 136 137 /** 138 * Java's {@code public boolean equals(Object o)} requirements. 139 * 140 * @param o This may be any Java {@code Object}, but only ones of {@code 'this'} type whose 141 * internal-values are identical will force this method to return {@code TRUE}. 142 * 143 * @return {@code TRUE} if (and only if) parameter {@code 'o'} is an {@code instanceof DotPair} 144 * and, also, both have equal start and ending field values. 145 */ 146 public boolean equals(Object o) 147 { 148 if (o instanceof DotPair) 149 { 150 DotPair dp = (DotPair) o; 151 return (this.start == dp.start) && (this.end == dp.end); 152 } 153 154 else return false; 155 } 156 157 /** 158 * Java's {@code interface Cloneable} requirements. This instantiates a new {@code DotPair} 159 * with identical {@code 'start', 'end'} fields. 160 * 161 * @return A new {@code DotPair} whose internal fields are identical to this one. 162 */ 163 public DotPair clone() { return new DotPair(this.start, this.end); } 164 165 /** 166 * Java's {@code interface Comparable<T>} requirements. <I>This is not the only comparison4 167 * operation possible,</I> but it does satisfy one reasonable requirement - 168 * <I>SPECIFICALLY:</I> which of two separate instances of {@code DotPair} start first. 169 * 170 * <BR /><BR /><B CLASS=JDDescLabel>Comparator Heuristic:</B> 171 * 172 * <BR />If two {@code DotPair} instances begin at the same {@code Vector}-index, then the 173 * shorter of the two shall come first. 174 * 175 * @param other Any other {@code DotPair} to be compared to {@code 'this' DotPair} 176 * 177 * @return An integer that fulfils Java's 178 * {@code interface Comparable<T> public boolean compareTo(T t)} method requirements. 179 */ 180 public int compareTo(DotPair other) 181 { 182 int ret = this.start - other.start; 183 184 return (ret != 0) ? ret : (this.size() - other.size()); 185 } 186 187 /** 188 * This is an "alternative Comparitor" that can be used for sorting instances of this class. 189 * It should work with the {@code Collections.sort(List, Comparator)} method in the standard 190 * JDK package {@code java.util.*;} 191 * 192 * <BR /><BR /><B CLASS=JDDescLabel>Comparator Heuristic:</B> 193 * 194 * <BR />This "extra <CODE>Comparitor</CODE>" simply compares the size of one {@code DotPair} 195 * to a second. The smaller shall be sorted first, and the larger (longer-in-length) 196 * {@code DotPair} shall be sorted later. If they are of equal size, whichever of the two has 197 * an earlier {@link #start} position in the {@code Vector} is considered first. 198 * 199 * @see CommentNode#body 200 */ 201 public static Comparator<DotPair> comp2 = (DotPair dp1, DotPair dp2) -> 202 { 203 int ret = dp1.size() - dp2.size(); 204 205 return (ret != 0) ? ret : (dp1.start - dp2.start); 206 }; 207 208 /** 209 * This shall return an {@code int Iterator} (which is properly named 210 * {@code class java.util.PrimitiveIterator.OfInt}) that iterates integers beginning with the 211 * value in {@code this.start} and ending with the value in {@code this.end}. 212 * 213 * @return An {@code Iterator} that iterates {@code 'this'} instance of {@code DotPair} from 214 * the beginning of the range, to the end of the range. The {@code Iterator} returned will 215 * produce Java's primitive type {@code int}. 216 * 217 * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> The elements returned by the {@code Iterator} 218 * are integers, and this is, in effect, nothing more than one which counts from {@link #start} 219 * to {@link #end}. 220 */ 221 public PrimitiveIterator.OfInt iterator() 222 { 223 return new PrimitiveIterator.OfInt() 224 { 225 private int cursor = start; 226 227 public boolean hasNext() { return this.cursor <= end; } 228 229 public int nextInt() 230 { 231 if (cursor == end) throw new NoSuchElementException 232 ("Cursor has reached the value stored in 'end' [" + end + "]"); 233 234 return cursor++; 235 } 236 }; 237 } 238 239 /** 240 * A simple {@code Iterator} that will iterate elements on an input page, using {@code 'this'} 241 * intance of {@code DotPair's} indices, {@link #start}, and {@link #end}. 242 * 243 * @param page This may be any HTML page or sub-page. This page should correspond to 244 * {@code 'this'} instance of {@code DotPair}. 245 * 246 * @return An {@code Iterator} that will iterate each node in the page, beginning with the 247 * node at {@code page.elementAt(this.start)}, and ending with {@code page.elementAt(this.end)} 248 * 249 * @throws IndexOutOfBoundsException This throws if {@code 'this'} instance does not have a 250 * range that adheres to the size of the input {@code 'page'} parameter. 251 */ 252 public <T extends HTMLNode> Iterator<T> iterator(Vector<T> page) 253 { 254 if (this.start >= page.size()) throw new IndexOutOfBoundsException( 255 "This instance of DotPair points to elements that are outside of the range of the" + 256 "input 'page' Vector.\n" + 257 "'page' parameter size: " + page.size() + ", this.start: [" + this.start + "]" 258 ); 259 260 if (this.end >= page.size()) throw new IndexOutOfBoundsException( 261 "This instance of DotPair points to elements that are outside of the range of the" + 262 "input 'page' Vector.\n" + 263 "'page' parameter size: " + page.size() + ", this.end: [" + this.end + "]" 264 ); 265 266 return new Iterator<T>() 267 { 268 private int cursor = start; // a.k.a. 'this.start' 269 private int expectedSize = page.size(); 270 private int last = end; // a.k.a. 'this.end' 271 272 public boolean hasNext() { return cursor < last; } 273 274 public T next() 275 { 276 if (++cursor > last) throw new NoSuchElementException( 277 "This iterator's cursor has run past the end of the DotPaiar instance that " + 278 "formed this Iterator. No more elements to iterate. Did you call hasNext() ?" 279 ); 280 281 if (page.size() != expectedSize) throw new ConcurrentModificationException( 282 "The expected size of the underlying vector has changed." + 283 "\nCurrent-Size " + 284 "[" + page.size() + "], Expected-Size [" + expectedSize + "]\n" + 285 "\nCursor location: [" + cursor + "]" 286 ); 287 288 return page.elementAt(cursor); 289 } 290 291 // Removes the node from the underlying {@code Vector at the cursor's location. 292 public void remove() 293 { page.removeElementAt(cursor); expectedSize--; cursor--; last--; } 294 }; 295 } 296 297 298 // ******************************************************************************************** 299 // ******************************************************************************************** 300 // Simple Boolean tests 301 // ******************************************************************************************** 302 // ******************************************************************************************** 303 304 305 /** 306 * This will test whether a specific index is contained (between {@code this.start} and 307 * {@code this.end}, inclusively. 308 * 309 * @param index This is any integer index value. It must be greater than zero. 310 * 311 * @return {@code TRUE} If the value of index is greater-than-or-equal-to the value stored in 312 * field {@code 'start'} and furthermore is less-than-or-equal-to the value of field 313 * {@code 'end'} 314 * 315 * @throws IndexOutOfBoundsException If the value is negative, this exception will throw. 316 */ 317 public boolean isInside(int index) 318 { 319 if (index < 0) throw new IndexOutOfBoundsException 320 ("You have passed a negative index [" + index + "] here, but this is not allowed."); 321 322 return (index >= start) && (index <= end); 323 } 324 325 /** 326 * Tests whether {@code 'this' DotPair} is fully enclosed by {@code DotPair} parameter 327 * {@code 'other'} 328 * 329 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 330 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 331 * comparison is likely meaningless. 332 * 333 * @return {@code TRUE} If (and only if) parameter {@code 'other'} encloses {@code 'this'}. 334 */ 335 public boolean enclosedBy(DotPair other) 336 { return (other.start <= this.start) && (other.end >= this.end); } 337 338 /** 339 * Tests whether {@code 'this' DotPair} is enclosed, completely, by parameter {@code DotPair} 340 * parameter {@code 'other'} 341 * 342 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 343 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 344 * comparison is likely meaningless. 345 * 346 * @return {@code TRUE} If (and only if) parameter {@code 'other'} is enclosed completely by 347 * {@code 'this'}. 348 */ 349 public boolean encloses(DotPair other) 350 { return (this.start <= other.start) && (this.end >= other.end); } 351 352 /** 353 * Tests whether parameter {@code 'other'} has any overlapping {@code Vector}-indices with 354 * {@code 'this' DotPair} 355 * 356 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 357 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 358 * comparison is likely meaningless. 359 * 360 * @return {@code TRUE} If (and only if) parameter {@code 'other'} and {@code 'this'} have any 361 * overlap. 362 */ 363 public boolean overlaps(DotPair other) 364 { 365 return 366 ((this.start >= other.start) && (this.start <= other.end)) || 367 ((this.end >= other.start) && (this.end <= other.end)); 368 } 369 370 /** 371 * Tests whether {@code 'this'} lays, <I>completely</I>, before {@code DotPair} parameter 372 * {@code 'other'}. 373 * 374 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 375 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 376 * comparison is likely meaningless. 377 * 378 * @return {@code TRUE} if <I>every index</I> of {@code 'this'} has a value that is less than 379 * every index of {@code 'other'} 380 */ 381 public boolean isBefore(DotPair other) 382 { return this.end < other.start; } 383 384 /** 385 * Tests whether {@code 'this'} begins before {@code DotPair} parameter {@code 'other'}. 386 * 387 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 388 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 389 * comparison is likely meaningless. 390 * 391 * @return {@code TRUE} if {@code this.start} is less than {@code other.start}, and 392 * {@code FALSE} otherwise. 393 */ 394 public boolean startsBefore(DotPair other) 395 { return this.start < other.start; } 396 397 /** 398 * Tests whether {@code 'this'} lays, <I>completely</I>, after {@code DotPair} parameter 399 * {@code 'other'}. 400 * 401 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 402 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 403 * comparison is likely meaningless. 404 * 405 * @return {@code TRUE} if <I>every index</I> of {@code 'this'} has a value that is greater 406 * than every index of {@code 'other'} 407 */ 408 public boolean isAfter(DotPair other) 409 { return this.start > other.end; } 410 411 /** 412 * Tests whether {@code 'this'} ends after {@code DotPair} parameter {@code 'other'}. 413 * 414 * @param other Another {@code DotPair}. This parameter is expected to be a descriptor of the 415 * same vectorized-webpage as {@code 'this' DotPair} is. It is not mandatory, but if not, the 416 * comparison is likely meaningless. 417 * 418 * @return {@code TRUE} if {@code this.end} is greater than {@code other.end}, and 419 * {@code FALSE} otherwise. 420 */ 421 public boolean endsAfter(DotPair other) 422 { return this.end > other.end; } 423 424 425 426 427 // ******************************************************************************************** 428 // ******************************************************************************************** 429 // Exception Check 430 // ******************************************************************************************** 431 // ******************************************************************************************** 432 433 434 /** 435 * A method that will do a fast check that {@code 'this'} intance holds index-pointers to 436 * an opening and closing HTML-Tag pair. Note, though these mistakes may seem trivial, when 437 * parsing Internet Web-Pages, these are exactly the type of basic mistakes that users will 438 * make when their level of 'concentration' is low. This is no different that checking an 439 * array-index or {@code String}-index for an {@code IndexOutOfBoundsException}. 440 * 441 * <BR /><BR />This type of detailed exception message can make analyzing web-pages more 442 * direct and less error-prone. The 'cost' incurred includes only a few {@code if}-statement 443 * comparisons, and <I>this check should be performed immediatley <B>before a loop is 444 * entered.</B></I> 445 * 446 * @param page Any web-page, or sub-page. It needs to be the page from whence {@code 'this'} 447 * instance of {@code DotPair} was retrieved. 448 * 449 * @throws TagNodeExpectedException If {@code 'this'} instance' {@link #start} or {@link #end} 450 * fields do not point to {@code TagNode} elements on the {@code 'page'}. 451 * 452 * @throws HTMLTokException If {@link #start} or {@link #end} do not point to a {@code TagNode} 453 * whose {@link TagNode#tok} field equals the {@code String} contained by parameter 454 * {@code 'token'}. 455 * 456 * @throws OpeningTagNodeExpectedException If {@link #start} does not point to an opening 457 * {@code TagNode}. 458 * 459 * @throws ClosingTagNodeExpectedException If {@link #end} does not point to a closing 460 * {@code TagNode}. 461 * 462 * @throws NullPointerException If the {@code 'page'} parameter is null. 463 * 464 * @throws ExceptionCheckError <B STYLE='color:red;'>IMPORTANT</B> Since this method is, 465 * indubuitably, a method for performing error checking, the presumption is that the programmer 466 * is trying to check for <I>his users input</I>. If in the processes of checking for user 467 * error, another mistake is made that would generate an exception, this must thought of as a 468 * more serious error. 469 * 470 * <BR /><BR />The purpose of the {@code 'possibleTokens'} array is to check that those tokens 471 * match the tokens that are contained by the {@code TagNode's} on the page at index 472 * {@code this.start}, and {@code this.end}. If invalid HTML tokens, null tokens, or even 473 * HTML Singleton tokens are passed <B>this exception-check, itself, is flawed!</B> If there 474 * are problems with this var-args array, this error is thrown. 475 * 476 * <BR /><BR />It is more serious because it indicates that the programmer has made a mistake 477 * in attempting to check for user-errors. 478 */ 479 public void exceptionCheck(Vector<HTMLNode> page, String... possibleTokens) 480 { 481 if (page == null) throw new NullPointerException 482 ("HTML-Vector parameter was passed a null reference."); 483 484 if (possibleTokens == null) throw new ExceptionCheckError 485 ("HTML tags string-list was passed a null reference."); 486 487 for (String token : possibleTokens) 488 { 489 if (token == null) throw new ExceptionCheckError 490 ("One of the HTML Tag's in the tag-list String-array was null."); 491 492 if (! HTMLTags.isTag(token)) throw new ExceptionCheckError 493 ("One of the passed tokens [" + token +"] is not a valid HTML token."); 494 495 if (HTMLTags.isSingleton(token)) throw new ExceptionCheckError 496 ("One of the passed tokens [" + token +"] is an HTML Singleton."); 497 } 498 499 500 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 501 // Check the DotPair.start 502 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 503 504 if (this.start >= page.size()) throw new IndexOutOfBoundsException( 505 "DotPair's 'start' field [" + this.start + "], is greater than or equal to the " + 506 "size of the HTML-Vector [" + page.size() + "]." 507 ); 508 509 if (! (page.elementAt(this.start) instanceof TagNode)) 510 throw new TagNodeExpectedException(this.start); 511 512 TagNode t1 = (TagNode) page.elementAt(this.start); 513 514 if (t1.isClosing) throw new OpeningTagNodeExpectedException( 515 "The TagNode at index [" + this.start + "] was a closing " + 516 "</" + t1.tok.toUpperCase() + ">, but an opening tag was expected here." 517 ); 518 519 520 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 521 // Now Check the DotPair.end 522 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 523 524 if (this.end >= page.size()) throw new IndexOutOfBoundsException( 525 "DotPair's 'end' field [" + this.end + "], is greater than or equal to the " + 526 "size of the HTML-Vector [" + page.size() + "]." 527 ); 528 529 if (! (page.elementAt(this.end) instanceof TagNode)) 530 throw new TagNodeExpectedException(this.end); 531 532 TagNode t2 = (TagNode) page.elementAt(this.end); 533 534 if (! t2.isClosing) throw new ClosingTagNodeExpectedException( 535 "The TagNode at index [" + this.start + "] was an opening " + 536 "<" + t2.tok.toUpperCase() + ">, but a closing tag was expected here." 537 ); 538 539 540 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 541 // Token Check 542 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 543 544 if (! t1.tok.equalsIgnoreCase(t2.tok)) throw new HTMLTokException( 545 "The opening TagNode was the [" + t1.tok.toLowerCase() + "] HTML Tag, while the " + 546 "closing Tag was the [" + t2.tok.toLowerCase() + "]. These two tag's must be an " + 547 "opening and closing pair, and therefore must match each-other." 548 ); 549 550 for (String possibleToken : possibleTokens) 551 if (possibleToken.equalsIgnoreCase(t1.tok)) 552 return; 553 554 String t = t1.tok.toUpperCase(); 555 556 throw new HTMLTokException( 557 "The opening and closing tags were: <" + t + ">, and </" + t + ">, but " + 558 "unfortunately this Tag is not included among the list of expected tags:\n" + 559 " [" + StrCSV.toCSV(possibleTokens, false, false, 60) + "]." 560 ); 561 } 562 563 /** 564 * Performs an exception check, using {@code 'this'} instance of {@code DotPair}, and throws 565 * an {@code IndexOutOfBoundsException} if {@code 'this'} contains end-points that do not fit 566 * inside the {@code 'page'} Vector Parameter. 567 * 568 * @param page Any HTML Page, or subpage. {@code page.size()} must return a value that is 569 * larger than <B STYLE='color: red;'>BOTH</B> {@link #start} 570 * <B STYLE='color:red;'>AND</B> {@link #end}. 571 * 572 * @throws IndexOutOfBoundsException A value for {@link #start} or {@link #end} which 573 * are larger than the size of the {@code Vector} parameter {@code 'page'} will cause this 574 * exception throw. 575 */ 576 public void exceptionCheck(Vector<HTMLNode> page) 577 { 578 if (this.end >= page.size()) throw new IndexOutOfBoundsException( 579 "The value of this.end [" + this.end + "] is greater than the size of Vector " + 580 "parameter 'page' [" + page.size() + "]" 581 ); 582 583 // This is actually unnecessary. If 'end' is fine, then 'start' must be fine. If 'end' is 584 // out of bounds, then it is irrelevant whether 'start' is out of bounds. "They" play with 585 // your brain when you are coding. 586 587 /* 588 if (this.start >= page.size()) throw new IndexOutOfBoundsException( 589 "The value of this.start [" + this.start + "] is greater than the size of Vector " + 590 "parameter 'page' [" + page.size() + "]" 591 ); 592 */ 593 } 594}