001package Torello.HTML; 002 003import Torello.HTML.NodeSearch.*; 004import Torello.Java.FileRW; // used in @see comments 005import Torello.Java.StringParse; 006import Torello.Java.Additional.Ret2; 007 008import java.util.*; 009import java.util.stream.IntStream; 010 011/** 012 * Utilities for checking that opening and closing {@link TagNode} elements match up (that the HTML 013 * is balanced). 014 * 015 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE> 016 */ 017@Torello.JavaDoc.StaticFunctional 018public class Balance 019{ 020 private Balance() { } 021 022 /** 023 * Invokes: 024 * 025 * <BR /><BR /><UL CLASS=JDUL> 026 * <LI>{@link #check(Vector)}</LI> 027 * <LI>{@link #checkNonZero(Hashtable)}</LI> 028 * <LI>{@link #toStringBalance(Hashtable)}</LI> 029 * </UL> 030 * 031 * <DIV CLASS=EXAMPLE>{@code 032 * String b = Balance.CB(a.articleBody); 033 * System.out.println((b == null) ? "Page has Balanced HTML" : b); 034 * 035 * // If Page has equal number of open and close tags prints: 036 * // Page Has Balanced HTML 037 * // OTHERWISE PRINTS REPORT 038 * }</DIV> 039 * 040 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 041 * 042 * @return Will return null if the snippet or page has 'balanced' HTML, otherwise returns the 043 * trimmed balance-report as a {@code String}. 044 */ 045 public static String CB(Vector<HTMLNode> html) 046 { 047 String ret = toStringBalance(checkNonZero(check(html))); 048 049 return (ret.length() == 0) ? null : ret; 050 } 051 052 /** 053 * Creates a {@code Hashtable} that has a count of all open and closed HTML tags found on the 054 * page. 055 * 056 * <BR /><BR />This {@code Hashtable} may be regarded as maintaining "counts" on each-and-every 057 * HTML tag to identify whether there is <I><B>a one-to-one balance mapping between opening and 058 * closing tags</I></B> for each element. When the {@code Hashtable} generated by 059 * this method is non-zero (for a particular HTML-Tag) it means that there are an unequal 060 * number of opening and closing elements for that tag. 061 * 062 * <BR /><BR />Suppose this method were to produce a {@code Hashtable}, and that 063 * {@code Hashtable} queried for a count on the HTML <B CLASS=JDHTags>{@code <DIV>}</B> tag 064 * (dividers). If that count turned out to be a non-zero positive number it would mean that 065 * the Vectorized-HTML had more opening <B CLASS=JDHTags>{@code <DIV>}</B> tags than the 066 * number of closing <B CLASS=JDHTags>{@code </DIV>}</B> tags on that page. 067 * 068 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 069 * 070 * <BR /><BR />The following example will help explain the use of this method. If an HTML page 071 * needs to be checked to see that all elements are properly opened and closed, this method can 072 * be used to return a list of any HTML element tag that does not have an equal number of 073 * opening and closing tags. 074 * 075 * <BR /><BR />In this example, the generated Java-Doc HTML-Page for class {@code TagNode} is 076 * checked. 077 * 078 * <DIV CLASS="EXAMPLE">{@code 079 * String html = FileRW.loadFileToString(htmlFileName); 080 * Vector<HTMLNode> v = HTMLPage.getPageTokens(html, false); 081 * Hashtable<String, Integer> b = Balance.check(v); 082 * StringBuffer sb = new StringBuffer(); 083 * 084 * // This part just prints a text-output to a string buffer, which is printed to the screen. 085 * for (String key : b.keySet()) 086 * { 087 * Integer i = b.get(key); 088 * 089 * // Only print keys that had a "non-zero count" 090 * // A Non-Zero-Count implies Opening-Tag-Count and Closing-Tag-Count are not equal! 091 * 092 * if (i.intValue() != 0) sb.append(key + "\t" + i.intValue() + "\n"); 093 * } 094 * 095 * // This example output was: "i -1", because of an unclosed italics element. 096 * // NOTE: To find where this unclosed element is, use method: nonNestedCheck(Vector, String) 097 * }</DIV> 098 * 099 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 100 * 101 * @return A {@code Hashtable} map of the count of each HTML-Tag present in the 102 * input {@code Vector}. 103 * 104 * <BR /><BR />For instance, if this {@code Vector} had five 105 * <B CLASS=JDHTags>{@code <A HREF=...>}</B> (Anchor-Link) tags, and six 106 * <B CLASS=JDHTags>{@code </A>}</B> tags, then the returned {@code Hashtable} would have a 107 * {@code String}-key equal to {@code "A"} with an integer value of {@code -1}. 108 * 109 * @see FileRW#loadFileToString(String) 110 * @see HTMLPage#getPageTokens(CharSequence, boolean) 111 */ 112 public static Hashtable<String, Integer> check(Vector<? super TagNode> html) 113 { 114 Hashtable<String, Integer> ht = new Hashtable<>(); 115 116 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 117 // not HTML Comments 118 119 for (Object o : html) if (o instanceof TagNode) 120 { 121 TagNode tn = (TagNode) o; 122 123 // Singleton tags are also known as 'self-closing' tags. BR, HR, IMG, etc... 124 if (HTMLTags.isSingleton(tn.tok)) continue; 125 126 Integer I = ht.get(tn.tok); 127 int i = (I != null) ? I.intValue() : 0; 128 129 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 130 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 131 132 i += tn.isClosing ? -1 : 1; 133 134 // Update the return result Hashtable for this particular HTML-Element (tn.tok) 135 ht.put(tn.tok, Integer.valueOf(i)); 136 } 137 138 return ht; 139 } 140 141 /** 142 * Creates an array that includes an open-and-close {@code 'count'} for each HTML-Tag / 143 * that was requested via the passed input {@code String[]}-Array parameter {@code 'htmlTags'}. 144 * 145 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 146 * 147 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 148 * 149 * <BR /><BR />The HTML-Element Open-Close-Counts are computed from this page. 150 * 151 * @param htmlTags This may be one, or many, HTML-Tags whose open-close count needs to be 152 * computed. Any HTML Element that is not present in this list - <I>will not have a count 153 * computed.</I> 154 * 155 * <BR /><BR />The {@code count} results which are stored in an {@code int[]}-Array that should 156 * be considered "parallel" to this input Var-Args-Array. 157 * 158 * @return An array of the count of each html-element present in the input vectorized-html 159 * parameter {@code 'html'}. 160 * For instance, If the following values were passed to this method: 161 * 162 * <BR /><BR /><UL CLASS=JDUL> 163 * <LI> A Vectorized-HTML page that had 5 {@code '<SPAN ...>'} open-elements, and 6 164 * {@code '</SPAN>'} closing {@code SPAN}-Tags. 165 * </LI> 166 * 167 * <LI> And at least one of the {@code String's} in the Var-Args parameter {@code 'htmlTags'} 168 * was equal to the {@code String} {@code "SPAN"} (case insensitive). 169 * </LI> 170 * 171 * <LI> <B>==></B> Then the array-position corresponding to the position in array 172 * {@code 'htmlTags'} that had the {@code "SPAN"} would have a value of {@code '-1'}. 173 * </LI> 174 * </UL> 175 * 176 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 177 * 178 * @throws SingletonException If and of the {@code String}-Tags passed to parameter 179 * {@code 'htmlTags'} are {@code 'singleton'} (Self-Closing) Tags, then this exception throws 180 */ 181 public static int[] check(Vector<? super TagNode> html, String... htmlTags) 182 { 183 // Check that these are all valid HTML Tags, throw an exception if not. 184 htmlTags = ARGCHECK.htmlTags(htmlTags); 185 186 // Temporary Hash-table, used to store the count of each htmlTag 187 Hashtable<String, Integer> ht = new Hashtable<>(); 188 189 // Initialize the temporary hash-table. This will be discarded at the end of the method, 190 // and converted into a parallel array. (Parallel to the input String... htmlTags array). 191 // Also, check to make sure the user hasn't requested a count of Singleton HTML Elements. 192 193 for (String htmlTag : htmlTags) 194 { 195 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 196 "One of the tags you have passed: [" + htmlTag + "] is a singleton-tag, " + 197 "and is only allowed opening versions of the tag." 198 ); 199 200 ht.put(htmlTag, Integer.valueOf(0)); 201 } 202 203 Integer I; 204 205 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 206 // not HTML Comments 207 for (Object o : html) if (o instanceof TagNode) 208 { 209 TagNode tn = (TagNode) o; 210 211 // Get the current count from the hash-table 212 I = ht.get(tn.tok); 213 214 // The hash-table only holds elements we are counting, if null, then skip. 215 if (I == null) continue; 216 217 // Save the new, computed count, in the hash-table 218 // 219 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 220 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 221 222 ht.put(tn.tok, Integer.valueOf(I.intValue() + (tn.isClosing ? -1 : 1))); 223 } 224 225 // Convert the hash-table to an integer-array, and return this to the user 226 int[] ret = new int[htmlTags.length]; 227 228 for (int i=0; i < ret.length; i++) 229 ret[i] = 0; 230 231 for (int i=0; i < htmlTags.length; i++) 232 if ((I = ht.get(htmlTags[i])) != null) 233 ret[i] = I.intValue(); 234 235 return ret; 236 } 237 238 /** 239 * Creates a {@code Hashtable} that has a count of all open and closed HTML-Tags found on 240 * the page - whose count-value is not equal to zero. 241 * 242 * <BR /><BR />This method will report when there are unbalanced HTML-Tags on a page, <I><B>and 243 * strictly ignore any & all tags with a count of zero</B></I>. Specifically, if a tag has 244 * a {@code 1-to-1} open-close count, then it will not have any keys avialable in the returned 245 * {@code Hashtable}. 246 * 247 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 248 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_CLONE> <!-- Clone Note --> 249 * 250 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 251 * available {@code check(...)} methods. 252 * 253 * @return A {@code Hashtable} map of the count of each html-element present in this 254 * {@code Vector}. For instance, if this {@code Vector} had 5 {@code '<A ...>'} (Anchor-Link) 255 * elements, and six {@code '</A>'} then this {@code Hashtable} would have a {@code String}-key 256 * {@code 'a'} with an integer value of {@code '-1'}. 257 */ 258 public static Hashtable<String, Integer> checkNonZero(Hashtable<String, Integer> ht) 259 { 260 @SuppressWarnings("unchecked") 261 Hashtable<String, Integer> ret = (Hashtable<String, Integer>) ht.clone(); 262 Enumeration<String> keys = ret.keys(); 263 264 while (keys.hasMoreElements()) 265 { 266 String key = keys.nextElement(); 267 268 // Remove any keys (HTML element-names) that have a normal ('0') count. 269 if (ret.get(key).intValue() == 0) ret.remove(key); 270 } 271 272 return ret; 273 } 274 275 276 /** 277 * This will compute a {@code count} for just one, particular, HTML Element of whether that 278 * Element has been properly opened and closed. An open and close {@code count} (integer 279 * value) will be returned by this method. 280 * 281 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 282 * 283 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 284 * 285 * @param htmlTag This the html element whose open-close count needs to be kept. 286 * 287 * @return The count of each html-element present in this {@code Vector}. For instance, if the 288 * user had requested that HTML Anchor Links be counted, and if the input {@code Vector} had 5 289 * {@code '<A ...>'} (Anchor-Link) elements, and six {@code '</A>'} then this method would 290 * return {@code -1}. 291 * 292 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 293 * 294 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 295 * Tag, this exception will throw. 296 */ 297 public static int checkTag(Vector<? super TagNode> html, String htmlTag) 298 { 299 // Check that this is a valid HTML Tag, throw an exception if invalid 300 htmlTag = ARGCHECK.htmlTag(htmlTag); 301 302 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 303 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 304 "allowed opening versions of the tag." 305 ); 306 307 TagNode tn; int i = 0; 308 309 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 310 // not HTML Comments 311 312 for (Object o : html) if (o instanceof TagNode) 313 314 // If we encounter an HTML Element whose tag is the tag whose count we are 315 // computing, then.... 316 317 if ((tn = (TagNode) o).tok.equals(htmlTag)) 318 319 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 320 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 321 322 i += tn.isClosing ? -1 : 1; 323 324 return i; 325 } 326 327 328 /** 329 * This method will calculate the "Maximum" and "Minimum" depth for every HTML 5.0 Tag found on 330 * a page. The Max-Depth is the "Maximum-Number" of Opening HTML Element Opening Tags were 331 * found for a particular element, before a matching closing version of the same Element is 332 * encountered. In the example below, the maximum "open-count" for the HTML 'divider' Element 333 * ({@code <DIV>}) is {@code '2'}. This is because a second {@code <DIV>} element is opened 334 * before the first is closed. 335 * 336 * <DIV CLASS="HTML">{@code 337 * <DIV class="MySection"><H1>These are my ideas:</H1> 338 * <!-- Above is an outer divider, below is an inner divider --> 339 * <DIV class="MyNumbers">Here are the points: 340 * <!-- HTML Content Here --> 341 * </DIV></DIV> 342 * }</DIV> 343 * 344 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 345 * 346 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 347 * 348 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 349 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 350 * for-loop which calculaties the {@code count} shall hopefully explain this computation 351 * clearly enough. This may be viewed in this method's hilited source-code, below. 352 * 353 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 354 * 355 * @return The returned {@code Hashtable} will contain an integer-array for each HTML Element 356 * that was found on the page. Each of these arrays shall be of length {@code 3}. 357 * 358 * <BR /><BR /><OL CLASS=JDUL> 359 * <LI>Minimum Depth: {@code return_array[0]}</LI> 360 * <LI>Maximum Depth: {@code return_array[1]}</LI> 361 * <LI>Total Count: {@code return_array[2]}</LI> 362 * </OL> 363 * 364 * <BR /><BR /><B><SPAN STYLE="color: red;">REDUNDANCY NOTE:</SPAN></B> The third element of 365 * the returned array should be identical to the result produced by an invocation of method: 366 * {@code Balance.checkTag(html, htmlTag);} 367 * 368 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 369 * 370 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 371 * Tag, this exception will throw. 372 */ 373 public static Hashtable<String, int[]> depth(Vector<? super TagNode> html) 374 { 375 Hashtable<String, int[]> ht = new Hashtable<>(); 376 377 // Iterate through the HTML List, we are only counting HTML Elements, not text, and not HTML Comments 378 for (Object o : html) if (o instanceof TagNode) 379 { 380 TagNode tn = (TagNode) o; 381 382 // Don't keep a count on singleton tags. 383 if (HTMLTags.isSingleton(tn.tok)) continue; 384 385 int[] curMaxAndMinArr = ht.get(tn.tok); 386 387 // If this is the first encounter of a particular HTML Element, create a MAX/MIN 388 // integer array, and initialize it's values to zero. 389 390 if (curMaxAndMinArr == null) 391 { 392 curMaxAndMinArr = new int[3]; 393 394 curMaxAndMinArr[0] = 0; // Current Min Depth Count for Element "tn.tok" is zero 395 curMaxAndMinArr[1] = 0; // Current Max Depth Count for Element "tn.tok" is zero 396 curMaxAndMinArr[2] = 0; // Current Computed Depth Count for "tn.tok" is zero 397 398 ht.put(tn.tok, curMaxAndMinArr); 399 } 400 401 // curCount += tn.isClosing ? -1 : 1; 402 // 403 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 404 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 405 406 curMaxAndMinArr[2] += tn.isClosing ? -1 : 1; 407 408 // If the current depth-count is a "New Minimum" (a new low! :), then save it in the 409 // minimum pos of the output-array. 410 411 if (curMaxAndMinArr[2] < curMaxAndMinArr[0]) curMaxAndMinArr[0] = curMaxAndMinArr[2]; 412 413 // If the current depth-count (for this tag) is a "New Maximum" (a new high), save it 414 // to the max-pos of the output-array. 415 416 if (curMaxAndMinArr[2] > curMaxAndMinArr[1]) curMaxAndMinArr[1] = curMaxAndMinArr[2]; 417 } 418 419 return ht; 420 } 421 422 423 424 /** 425 * This method will calculate the "Maximum" and "Minimum" depth for every HTML Tag listed in 426 * the {@code var-args String[] htmlTags} parameter. The Max-Depth is the "Maximum-Number" of 427 * Opening HTML Element Opening Tags were found for a particular element, before a matching 428 * closing version of the same Element is encountered. In the example below, the maximum 429 * {@code 'open-count'} for the HTML 'divider' Element ({@code <DIV>}) is {@code '2'}. This is 430 * because a second {@code <DIV>} element is opened before the first is closed. 431 * 432 * <DIV CLASS="HTML">{@code 433 * <DIV class="MySection"><H1>These are my ideas:</H1> 434 * <!-- Above is an outer divider, below is an inner divider --> 435 * <DIV class="MyNumbers">Here are the points: 436 * <!-- HTML Content Here --> 437 * </DIV></DIV> 438 * }</DIV> 439 * 440 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 441 * 442 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 443 * 444 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 445 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 446 * for-loop which calculaties the {@code count} shall hopefully explain this computation 447 * clearly enough. This may be viewed in this method's hilited source-code, below. 448 * 449 * <BR /><BR /><B CLASS=JDDescLabel>Var-Args Addition:</B> 450 * 451 * <BR />This method differs from the method with an identical name (defined above) in that it 452 * adds a <I>{@code String}-VarArgs parameter</I> that allows a user to decide which tags he 453 * would like counted and returned in this {@code Hashtable}, and which he would like to ignore. 454 * 455 * <BR /><BR />If one of the requested HTML-Tags from this{@code String}-VarArgs parameter is not 456 * actually an HTML Element present on the page, the returned {@code Hashtable} will still 457 * contain an {@code int[]}-Array for that tag. The values in that array will be equal to 458 * zero. 459 * 460 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 461 * 462 * @return The returned {@code Hashtable} will contain an integer-array for each HTML Element 463 * that was found on the page. Each of these arrays shall be of length {@code 3}. 464 * 465 * <BR /><BR /><OL CLASS=JDUL> 466 * <LI>Minimum Depth: {@code return_array[0]}</LI> 467 * <LI>Maximum Depth: {@code return_array[1]}</LI> 468 * <LI>Total Count: {@code return_array[2]}</LI> 469 * </OL> 470 * 471 * <BR /><BR /><B><SPAN STYLE="color: red;">REDUNDANCY NOTE:</SPAN></B> The third element of 472 * the returned array should be identical to the result produced by an invocation of method: 473 * {@code Balance.checkTag(html, htmlTag);} 474 * 475 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 476 * 477 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} 478 * (Self-Closing) Tag, this exception will throw. 479 */ 480 public static Hashtable<String, int[]> depth(Vector<? super TagNode> html, String... htmlTags) 481 { 482 // Check that these are all valid HTML Tags, throw an exception if not. 483 htmlTags = ARGCHECK.htmlTags(htmlTags); 484 485 Hashtable<String, int[]> ht = new Hashtable<>(); 486 487 // Initialize the temporary hash-table. This will be discarded at the end of the method, 488 // and converted into a parallel array. (Parallel to the input String... htmlTags array). 489 // Also, check to make sure the user hasn't requested a count of Singleton HTML Elements. 490 491 for (String htmlTag : htmlTags) 492 { 493 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 494 "One of the tags you have passed: [" + htmlTag + "] is a singleton-tag, " + 495 "and is only allowed opening versions of the tag." 496 ); 497 498 // Insert an initialized array (init to zero) for this HTML Tag/Token 499 int[] arr = new int[3]; 500 501 arr[0] = 0; // Current Minimum Depth Count for HTML Element "tn.tok" is zero 502 arr[1] = 0; // Current Maximum Depth Count for HTML Element "tn.tok" is zero 503 arr[2] = 0; // Current Computed Depth Count is HTML Element "tn.tok" is zero 504 505 ht.put(htmlTag, arr); 506 } 507 508 // Iterate through the HTML List, we are only counting HTML Elements, not text, 509 // and not HTML Comments 510 511 for (Object o: html) if (o instanceof TagNode) 512 { 513 TagNode tn = (TagNode) o; 514 515 int[] curMaxAndMinArr = ht.get(tn.tok); 516 517 // If this is null, we are attempting to perform the count on an HTML Element that 518 // wasn't requested by the user with the var-args 'String... htmlTags' parameter. 519 // The Hashtable was initialized to only have those tags. (see about 5 lines above 520 // where the Hashtable is initialized) 521 522 if (curMaxAndMinArr == null) continue; 523 524 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 525 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 526 527 curMaxAndMinArr[2] += tn.isClosing ? -1 : 1; 528 529 // If the current depth-count is a "New Minimum" (a new low! :), then save it in the 530 // minimum pos of the output-array. 531 532 if (curMaxAndMinArr[2] < curMaxAndMinArr[0]) curMaxAndMinArr[0] = curMaxAndMinArr[2]; 533 534 // If the current depth-count (for this tag) is a "New Maximum" (a new high), save it 535 // to the max-pos of the output-array. 536 537 if (curMaxAndMinArr[2] > curMaxAndMinArr[1]) curMaxAndMinArr[1] = curMaxAndMinArr[2]; 538 539 // NOTE: No need to update the hash-table, since this is an array - changing its 540 // values is already "reflected" into the Hashtable. 541 } 542 543 return ht; 544 } 545 546 547 /** 548 * Creates a {@code Hashtable} that has a maximum and minimum depth for all HTML tags found on 549 * the page. Any HTML Tags that meet ALL of these criteria shall be removed from the 550 * result-set {@code Hashtable} ... 551 * 552 * <BR /><BR /><UL CLASS=JDUL> 553 * <LI>Minimum Depth Is {@code '0'} - i.e. <I>closing tag never precedes opening.</I></LI> 554 * <LI>Count is {@code '0'} - i.ei. <I>there is a {@code 1-to-1} ratio of opening and closing 555 * tags</I> for the particular HTML Element.</LI> 556 * </UL> 557 * 558 * <BR /><BR /><B>NOTE:</B> This means that there is a {@code 1:1} ratio of opening and closing 559 * versions of the tag, <B><I>and also</I></B> that there are no positions in the vector where 560 * a closing tag to come before an tag to open it. 561 * 562 * <BR /><BR /><B CLASS=JDDescLabel>Cloned Input:</B> 563 * 564 * <BR />This method clones the original input {@code Hashtable}, and removes the tags whose 565 * depth-calculations are invalid - as described above. This allows the user to perform other 566 * operations with the original table, while this class is processing. 567 * 568 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 569 * available {@code depth(...)} methods. 570 * 571 * @return This shall a return a list of HTML Tags that are <I>potentially (but not guaranteed 572 * to be)</I> invalid. 573 */ 574 public static Hashtable<String, int[]> depthInvalid(Hashtable<String, int[]> ht) 575 { 576 @SuppressWarnings("unchecked") 577 Hashtable<String, int[]> ret = (Hashtable<String, int[]>) ht.clone(); 578 Enumeration<String> keys = ret.keys(); 579 580 // Using the "Enumeration" class allows the situation where elements can be removed from 581 // the underlying data-structure - while iterating through that data-structure. This is 582 // not possible using a keySet Iterator. 583 584 while (keys.hasMoreElements()) 585 { 586 String key = keys.nextElement(); 587 int[] arr = ret.get(key); 588 589 if ((arr[1] >= 0) && (arr[2] == 0)) ret.remove(key); 590 } 591 592 return ret; 593 } 594 595 /** 596 * Creates a {@code Hashtable} that has a maximum and minimum depth for all HTML tags found on 597 * the page. Any HTML Tags that meet ALL of these criteria, below, shall be removed from the 598 * result-set {@code Hashtable} ... 599 * 600 * <BR /><BR /><UL CLASS=JDUL> 601 * <LI> Maximum Depth is precisely {@code '1'} - i.e. <I>Each element of this tag is closed 602 * before a second is open.</I> 603 * </LI> 604 * </UL> 605 * 606 * <BR /><BR /><B CLASS=JDDescLabel>Cloned Input:</B> 607 * 608 * <BR />This method clones the original input {@code Hashtable}, and removes the tags whose 609 * maximum-depth is not greater than one. This allows the user to perform other operations 610 * with the original table, while this class is processing. 611 * 612 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 613 * available {@code depth(...)} methods. 614 * 615 * @return This shall a return a list of HTML Tags that are <I>potentially (but not guaranteed 616 * to be)</I> 617 * invalid. 618 */ 619 public static Hashtable<String, int[]> depthGreaterThanOne(Hashtable<String, int[]> ht) 620 { 621 @SuppressWarnings("unchecked") 622 Hashtable<String, int[]> ret = (Hashtable<String, int[]>) ht.clone(); 623 Enumeration<String> keys = ret.keys(); 624 625 // Using the "Enumeration" class allows the situation where elements can be removed from 626 // the underlying data-structure - while iterating through that data-structure. This is not 627 // possible using a keySet Iterator. 628 629 while (keys.hasMoreElements()) 630 { 631 String key = keys.nextElement(); 632 int[] arr = ret.get(key); 633 634 if (arr[1] == 1) ret.remove(key); 635 } 636 637 return ret; 638 } 639 640 641 /** 642 * This method will calculate the "Maximum" and "Minimum" depth for a particular HTML Tag. 643 * The Max-Depth just means the number of Maximum-Number of Opening HTML Element Opening Tags 644 * were found, before a matching closing version of the same Element is encountered. For 645 * instance: {@code <DIV ...><DIV ..> Some Page</DIV></DIV>} has a maximum depth of 646 * {@code '2'}. This means there is a point in the vectorized-html where there are 2 647 * successive divider elements that are opened, before even one has been closed. 648 * 649 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 650 * 651 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 652 * 653 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 654 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 655 * for-loop which calculaties the {@code count} shall hopefully explain this computation 656 * clearly enough. This may be viewed in this method's hilited source-code, below. 657 * 658 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 659 * 660 * @param htmlTag This the html element whose maximum and minimum depth-count needs to be 661 * computed. 662 * 663 * @return The returned integer-array, shall be of length 3. 664 * 665 * <BR /><BR /><OL CLASS=JDUL> 666 * <LI>Minimum Depth: {@code return_array[0]}</LI> 667 * <LI>Maximum Depth: {@code return_array[1]}</LI> 668 * <LI>Total Count: {@code return_array[2]}</LI> 669 * </OL> 670 * 671 * <BR /><BR /><B><SPAN STYLE="color: red;">REDUNDANCY NOTE:</SPAN></B> The third element of 672 * the returned array should be identical to the result produced by an invocation of method: 673 * {@code Balance.checkTag(html, htmlTag);} 674 * 675 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 676 * 677 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 678 * Tag, this exception will throw. 679 */ 680 public static int[] depthTag(Vector<? super TagNode> html, String htmlTag) 681 { 682 // Check that this is a valid HTML Tag, throw an exception if invalid 683 htmlTag = ARGCHECK.htmlTag(htmlTag); 684 685 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 686 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only allowed " + 687 "opening versions of the tag." 688 ); 689 690 TagNode tn; int i = 0; int max = 0; int min = 0; 691 692 // Iterate through the HTML List, we are only counting HTML Elements, not text, and not HTML Comments 693 for (Object o : html) if (o instanceof TagNode) 694 695 if ((tn = (TagNode) o).tok.equals(htmlTag)) 696 { 697 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 698 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 699 700 i += tn.isClosing ? -1 : 1; 701 702 if (i > max) max = i; 703 if (i < min) min = i; 704 } 705 706 // Generate the output array, and return 707 int[] ret = new int[2]; 708 709 ret[0] = min; 710 ret[1] = max; 711 ret[2] = i; 712 713 return ret; 714 } 715 716 /** 717 * This will find the (likely) places where the "non-nested HTML Elements" have become nested. 718 * For the purposes of finding mismatched elements - such as an unclosed "Italics" Element, or 719 * an "Extra" Italics Element - this method will find places where a new HTML Tag has opened 720 * before a previous one has been closed - <I>or vice-versa (where there is an 'extra' 721 * closed-tag).</I> 722 * 723 * <BR /><BR />Certainly, if "nesting" is usually acceptable (for instance the HTML divider 724 * {@code '<DIV>...</DIV>'} construct) <I><B>then the results of this method would not have any 725 * meaning.</I></B> Fortunately, for the vast majority of HTML Elements {@code <I>, <B>, <A>, 726 * etc...} nesting the tags is not allowed or encouraged. 727 * 728 * <BR /><BR />The following example use of this method should make clear the application. If 729 * a user has identified that there is an unclosed HTML italics element ({@code <I>...</I>}) 730 * somewhere on a page, for-example, and that page has numerous italics elements, this method 731 * can pinpoint the failure instantly, using this example. Note that the file-name is a 732 * Java-Doc generated output HTML file. The documentation for this package received a copious 733 * amount of attention due to the sheer number of method-names and class-names used throughout. 734 * 735 * <DIV CLASS="EXAMPLE">{@code 736 * String fStr = FileRW.loadFileToString("javadoc/Torello/HTML/TagNode.html"); 737 * Vector<HTMLNode> v = HTMLPage.getPageTokens(fStr, false); 738 * int[] posArr = Balance.nonNestedCheck(v, "i"); 739 * 740 * // Below, the class 'Debug' is used to pretty-print the vectorized-html page. Here, the 741 * // output will find the lone, non-closed, HTML italics <I> ... </I> tag-element, and output 742 * // it to the terminal-window. The parameter '5' means the nearest 5 elements (in either 743 * // direction) are printed, in addition to the elements at the indices in the posArr. 744 * // Parameter 'true' implies that two curly braces are printed surrounding the matched node. 745 * 746 * System.out.println(Debug.print(v, posArr, 5, " Skip a few ", true, Debug::K)); 747 * }</DIV> 748 * 749 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 750 * 751 * @param htmlTag This the html element whose maximum and minimum depth-count was not {@code 1} 752 * and {@code 0}, respectively. The precise location where the depth achieved either a 753 * negative depth, or depth greater than {@code 1} will be returned in the integer array. In 754 * English: When two opening-tags or two closing-tags are identified, successively, then the 755 * index where the second tag was found is recorded into the output array. 756 * 757 * @return This will return an array of vectorized-html index-locations / index-pointers where 758 * the first instance of an extra opening, or an extra-closing tag, occurs. This will 759 * facilitate finding tags that are not intended to be nested. If "tag-nesting" (for example 760 * HTML divider, {@code 'DIV'}, elements), then the results returned by this method will not be 761 * useful. 762 * 763 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 764 * 765 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 766 * Tag, this exception will throw. 767 * 768 * @see FileRW#loadFileToString(String) 769 * @see HTMLPage#getPageTokens(CharSequence, boolean) 770 * @see Debug#print(Vector, int[], int, String, boolean, BiConsumer) 771 */ 772 public static int[] nonNestedCheck(Vector<? super TagNode> html, String htmlTag) 773 { 774 // Java Streams are an easier way to keep variable-length lists. They use 775 // "builders" - and this one is for an "IntStream" 776 777 IntStream.Builder b = IntStream.builder(); 778 779 // Check that this is a valid HTML Tag, throw an exception if invalid 780 htmlTag = ARGCHECK.htmlTag(htmlTag); 781 782 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 783 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 784 "allowed opening versions of the tag." 785 ); 786 787 Object o; TagNode tn; int len = html.size(); TC last = null; 788 789 // Iterate through the HTML List, we are only counting HTML Elements, not text, 790 // and not HTML Comments 791 792 for (int i=0; i < len; i++) 793 794 if ((o = html.elementAt(i)) instanceof TagNode) 795 if ((tn = (TagNode) o).tok.equals(htmlTag)) 796 { 797 if ((tn.isClosing) && (last == TC.ClosingTags)) b.add(i); 798 if ((! tn.isClosing) && (last == TC.OpeningTags)) b.add(i); 799 800 last = tn.isClosing ? TC.ClosingTags : TC.OpeningTags; 801 } 802 803 return b.build().toArray(); 804 } 805 806 /** 807 * For likely greater than 95% of HTML tags - finding situations where that tag has 808 * <I><B>'nested tags'</I></B> is highly unlikely. Unfortunately, two or three of the most 809 * common tags in use, for instance {@code <DIV>, <SPAN>}, finding where a mis-match has 810 * occurred (tracking down an "Unclosed divider") is an order of magnitude more difficult than 811 * finding an unclosed anchor {@code '<A HREF...>'}. This method shall return two parallel 812 * arrays. The first array will contain vector indices. The second array contains the depth 813 * (nesting level) of that tag at that position. In this way, finding an unclosed divider is 814 * tantamount to finding where all closing-dividers seem to evaluate to a depth of '1' (one) 815 * rather than '0' (zero). 816 * 817 * <BR /><BR /><B>NOTE:</B> This method can highly useful for SPAN and DIV, while the 818 * "non-standard depth locations" method can be extremely useful for simple, non-nested tags 819 * such as Anchor, Paragraph, Section, etc... - HTML Elements that are mostly never nested. 820 * 821 * <DIV CLASS="EXAMPLE">{@code 822 * // Load an HTML File to a String 823 * String file = LFEC.loadFile("~/HTML/MyHTMLFile.html"); 824 * 825 * // Parse, and convert to vectorized-html 826 * Vector<HTMLNode> v = HTMLPage.getPageTokens(file, false); 827 * 828 * // Run this method 829 * Ret2<int[], int[]> r = Balance.locationsAndDepth(v, "div"); 830 * 831 * // This array has vector-indices 832 * int[] posArr = (int[]) r.a; 833 * 834 * // This (parallel) array has the depth at that index. 835 * int[] depthArr = (int[]) r.b; 836 * 837 * for (int i=0; i < posArr.length; i++) System.out.println( 838 * "(" + posArr[i] + ", " + depthArr[i] + "):\t" + // Prints the Vector-Index, and Depth 839 * C.BRED + v.elementAt(posArr[i]).str + C.RESET // Prints the actual HTML divider. 840 * ); 841 * }</DIV> 842 * 843 * <BR />The above code would produce a list of HTML Divider elements, along with their index 844 * in the {@code Vector}, and the exact depth (number of nested, open {@code 'DIV'} elements) 845 * at that location. This is usually helpful when trying to find unclosed HTML Tags. 846 * 847 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 848 * 849 * @param htmlTag This the html element that has an imbalanced OPEN-CLOSE ratio in the tree. 850 * 851 * @return Two parallel arrays, as follows: 852 * 853 * <BR /><BR /><OL CLASS=JDOL> 854 * <LI> {@code Ret2.a (int[])} 855 * <BR /><BR /> 856 * This shall be an integer array of {@code Vector}-indices where the HTML Element has 857 * been found. 858 * <BR /><BR /> 859 * </LI> 860 * <LI> {@code Ret2.b (int[])} 861 * <BR /><BR /> 862 * This shall contain an array of the value of the depth for the {@code 'htmlTag'} 863 * at the particular {@code Vector}-index identified in the first-array. 864 * </LI> 865 * </OL> 866 * 867 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 868 * 869 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 870 * Tag, this exception will throw. 871 */ 872 public static Ret2<int[], int[]> locationsAndDepth(Vector<? super TagNode> html, String htmlTag) 873 { 874 // Java Streams are an easier way to keep variable-length lists. They use 875 // "builders" - and this one is for an "IntStream" 876 877 IntStream.Builder locations = IntStream.builder(); 878 IntStream.Builder depthAtLocation = IntStream.builder(); 879 880 // Check that this is a valid HTML Tag, throw an exception if invalid 881 htmlTag = ARGCHECK.htmlTag(htmlTag); 882 883 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 884 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 885 "allowed opening versions of the tag." 886 ); 887 888 Object o; TagNode tn; int len = html.size(); int depth = 0; 889 890 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 891 // not HTML Comments 892 893 for (int i=0; i < len; i++) if ((o = html.elementAt(i)) instanceof TagNode) 894 895 if ((tn = (TagNode) o).tok.equals(htmlTag)) 896 { 897 depth += tn.isClosing ? -1 : 1; 898 899 locations.add(i); 900 901 depthAtLocation.add(depth); 902 } 903 904 return new Ret2<int[], int[]> 905 (locations.build().toArray(), depthAtLocation.build().toArray()); 906 } 907 908 /** 909 * Converts a depth report to a {@code String}, for printing. 910 * 911 * @param depthReport This should be a {@code Hashtable} returned by any of the depth-methods. 912 * 913 * @return This shall return the report as a {@code String}. 914 */ 915 public static String toStringDepth(Hashtable<String, int[]> depthReport) 916 { 917 StringBuilder sb = new StringBuilder(); 918 919 for (String htmlTag : depthReport.keySet()) 920 { 921 int[] arr = depthReport.get(htmlTag); 922 923 sb.append( 924 "HTML Element: [" + htmlTag + "]:\t" + 925 "Min-Depth: " + arr[0] + ",\tMax-Depth: " + arr[1] + ",\tCount: " + arr[2] + "\n" 926 ); 927 } 928 929 return sb.toString(); 930 } 931 932 933 /** 934 * Converts a balance report to a {@code String}, for printing. 935 * 936 * @param balanceCheckReport This should be a {@code Hashtable} returned by any of the 937 * balance-check methods. 938 * 939 * @return This shall return the report as a {@code String}. 940 */ 941 public static String toStringBalance(Hashtable<String, Integer> balanceCheckReport) 942 { 943 StringBuilder sb = new StringBuilder(); 944 int maxTagLen = 0; 945 int maxValStrLen = 0; 946 int maxAbsValStrLen = 0; 947 int val; 948 String valAsStr; 949 950 // For good spacing purposes, we need the length of the longest of the tags. 951 for (String htmlTag : balanceCheckReport.keySet()) 952 if (htmlTag.length() > maxTagLen) 953 maxTagLen = htmlTag.length(); 954 955 // 17 is the length of the string below, 2 is the amount of extra-space needed 956 maxTagLen += 17 + 2; 957 958 for (int v : balanceCheckReport.values()) 959 if ((valAsStr = ("" + v)).length() > maxValStrLen) 960 maxValStrLen = valAsStr.length(); 961 962 for (int v : balanceCheckReport.values()) 963 if ((valAsStr = ("" + Math.abs(v))).length() > maxAbsValStrLen) 964 maxAbsValStrLen = valAsStr.length(); 965 966 for (String htmlTag : balanceCheckReport.keySet()) 967 968 sb.append( 969 StringParse.rightSpacePad("HTML Element: [" + htmlTag + "]:", maxTagLen) + 970 StringParse.rightSpacePad( 971 ("" + (val = balanceCheckReport.get(htmlTag).intValue())), 972 maxValStrLen 973 ) + 974 NOTE(val, htmlTag, maxAbsValStrLen) + 975 "\n" 976 ); 977 978 return sb.toString(); 979 } 980 981 private static String NOTE(int val, String htmlTag, int padding) 982 { 983 if (val == 0) return ""; 984 985 else if (val > 0) return 986 ", which implies " + StringParse.rightSpacePad("" + Math.abs(val), padding) + 987 " unclosed <" + htmlTag + "> element(s)"; 988 989 else return 990 ", which implies " + StringParse.rightSpacePad("" + Math.abs(val), padding) + 991 " extra </" + htmlTag + "> element(s)"; 992 } 993 994 /** 995 * Converts a balance report to a {@code String}, for printing. 996 * 997 * @param balanceCheckReport This should be a {@code Hashtable} returned by any of the 998 * balance-check methods. 999 * 1000 * @return This shall return the report as a {@code String}. 1001 * 1002 * @throws IllegalArgumentException This exception throws if the length of the two input arrays 1003 * are not equal. It is imperative that the balance report being printed was created by the 1004 * html-tags that are listed in the HTML Token var-args parameter. If the two arrays are the 1005 * same length, but the tags used to create the report Hashtable are not the same ones being 1006 * passed to the var-args parameter {@code 'htmlTags'} - <I>the logic will not know the 1007 * difference, and no exception is thrown.</I> 1008 */ 1009 public static String toStringBalance(int[] balanceCheckReport, String... htmlTags) 1010 { 1011 if (balanceCheckReport.length != htmlTags.length) throw new IllegalArgumentException( 1012 "The balance report that you are checking was not generated using the html token " + 1013 "list provided, they are different lengths. balanceCheckReport.length: " + 1014 "[" + balanceCheckReport.length + "]\t htmlTags.length: [" + htmlTags.length + "]" 1015 ); 1016 1017 StringBuilder sb = new StringBuilder(); 1018 1019 for (int i=0; i < balanceCheckReport.length; i++) 1020 sb.append("HTML Element: [" + htmlTags[i] + "]:\t" + balanceCheckReport[i] + "\n"); 1021 1022 return sb.toString(); 1023 } 1024 1025}