001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006import java.io.Serializable; 007import java.lang.reflect.*; 008 009import Torello.HTML.NodeSearch.*; 010import Torello.Java.*; 011 012/** 013 * Computes miscellaneous statistics for a web-page, or sub-page. 014 * 015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PAGE_STATS> 016 * 017 * <STYLE TYPE="text/css"> 018 * .PAGESTATS 019 * { color: red; font-weight: bold; font-family: "Courier New", Courier, monospace; } 020 * </STYLE> 021 */ 022public class PageStats implements Serializable, Comparable<PageStats>, Cloneable 023{ 024 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 025 public static final long serialVersionUID = 1; 026 027 /** 028 * If a Vectorized HTML page were converted to a String, this would be the length of that 029 * string. 030 * 031 * @see Util#strLength(Vector) 032 * @see Util#strLength(Vector, int, int) 033 */ 034 public final int strLength; 035 036 /** 037 * The string hash-code of the vectorized-HTML webpage, as if it were being represented as one 038 * single {@code java.lang.String.} 039 * 040 * @see Util#hashCode(Vector) 041 * @see Util#hashCode(Vector, int, int) 042 */ 043 public final int hash; 044 045 046 047 048 049 /** 050 * The number of HTML {@code TagNode} elements that have a {@code 'class'} attribute, when 051 * queried using {@code TagNode.AV("class")} 052 */ 053 public final short hasAVclass; 054 055 /** 056 * The number of HTML {@code TagNode} elements that have a {@code 'style'} attribute, when 057 * queried using {@code TagNode.AV("style")} 058 */ 059 public final short hasAVstyle; 060 061 /** 062 * The number of HTML {@code TagNode} elements that have an {@code 'id'} attribute, when 063 * queried using {@code TagNode.AV("id")} 064 */ 065 public final short hasAVid; 066 067 /** 068 * The number of HTML {@code TagNode} elements that have a {@code 'title'} attribute, when 069 * queried using {@code TagNode.AV("title")} 070 */ 071 public final short hasAVtitle; 072 073 /** 074 * The number of HTML {@code TagNode} elements that have an {@code 'href'} attribute, when 075 * queried using {@code TagNode.AV("href")} 076 */ 077 public final short hasAVhref; 078 079 /** 080 * The number of HTML {@code TagNode} elements that have a {@code 'hreflang'} attribute, when 081 * queried using {@code TagNode.AV("hreflang")} 082 */ 083 public final short hasAVhreflang; 084 085 /** 086 * The number of HTML {@code TagNode} elements that have a {@code 'src'} attribute, when 087 * queried using {@code TagNode.AV("src")} 088 */ 089 public final short hasAVsrc; 090 091 /** 092 * The number of HTML {@code TagNode} elements that have a {@code 'srcset'} attribute, when 093 * queried using {@code TagNode.AV("srcset")} 094 */ 095 public final short hasAVsrcset; 096 097 /** 098 * The number of HTML {@code TagNode} elements that have a {@code 'srclang'} attribute, when 099 * queried using {@code TagNode.AV("srclang")} 100 */ 101 public final short hasAVsrclang; 102 103 /** 104 * The number of HTML {@code TagNode} elements that have a {@code 'srcdoc'} attribute, when 105 * queried using {@code TagNode.AV("srcdoc")} 106 */ 107 public final short hasAVsrcdoc; 108 109 /** 110 * The number of HTML {@code TagNode} elements that have an {@code 'alt'} attribute, when 111 * queried using {@code TagNode.AV("alt")} 112 */ 113 public final short hasAValt; 114 115 /** 116 * The number of HTML {@code TagNode} elements that have a {@code 'target'} attribute, when 117 * queried using {@code TagNode.AV("target")} 118 */ 119 public final short hasAVtarget; 120 121 /** 122 * The number of HTML {@code TagNode} elements that have a {@code 'width'} attribute, when 123 * queried using {@code TagNode.AV("width")} 124 */ 125 public final short hasAVwidth; 126 127 /** 128 * The number of HTML {@code TagNode} elements that have a {@code 'height'} attribute, when 129 * queried using {@code TagNode.AV("height")} 130 */ 131 public final short hasAVheight; 132 133 /** 134 * The number of HTML {@code TagNode} elements that have a {@code 'size'} attribute, when 135 * queried using {@code TagNode.AV("size")} 136 */ 137 public final short hasAVsize; 138 139 /** 140 * The number of HTML {@code TagNode} elements that have a {@code 'sizes'} attribute, when 141 * queried using {@code TagNode.AV("sizes")} 142 */ 143 public final short hasAVsizes; 144 145 /** 146 * The number of HTML {@code TagNode} elements that have a {@code 'cols'} attribute, when 147 * queried using {@code TagNode.AV("cols")} 148 */ 149 public final short hasAVcols; 150 151 /** 152 * The number of HTML {@code TagNode} elements that have a {@code 'colspan'} attribute, when 153 * queried using {@code TagNode.AV("colspan")} 154 */ 155 public final short hasAVcolspan; 156 157 /** 158 * The number of HTML {@code TagNode} elements that have a {@code 'rows'} attribute, when 159 * queried using {@code TagNode.AV("rows")} 160 */ 161 public final short hasAVrows; 162 163 /** 164 * The number of HTML {@code TagNode} elements that have a {@code 'rowspan'} attribute, when 165 * queried using {@code TagNode.AV("rowspan")} 166 */ 167 public final short hasAVrowspan; 168 169 /** 170 * The number of HTML {@code TagNode} elements that have a {@code 'wrap'} attribute, when 171 * queried using {@code TagNode.AV("wrap")} 172 */ 173 public final short hasAVwrap; 174 175 /** 176 * The number of HTML {@code TagNode} elements that have a {@code 'value'} attribute, when 177 * queried using {@code TagNode.AV("value")} 178 */ 179 public final short hasAVvalue; 180 181 /** 182 * The number of HTML {@code TagNode} elements that have a {@code 'type'} attribute, when 183 * queried using {@code TagNode.AV("type")} 184 */ 185 public final short hasAVtype; 186 187 /** 188 * The number of HTML {@code TagNode} elements that have a {@code 'name'} attribute, when 189 * queried using {@code TagNode.AV("name")} 190 */ 191 public final short hasAVname; 192 193 /** 194 * The number of HTML {@code TagNode} elements that have a {@code 'min'} attribute, when 195 * queried using {@code TagNode.AV("min")} 196 */ 197 public final short hasAVmin; 198 199 /** 200 * The number of HTML {@code TagNode} elements that have a {@code 'max'} attribute, when 201 * queried using {@code TagNode.AV("max")} 202 */ 203 public final short hasAVmax; 204 205 /** 206 * The number of HTML {@code TagNode} elements that have a {@code 'minlength'} attribute, when 207 * queried using {@code TagNode.AV("minlength")} 208 */ 209 public final short hasAVminlength; 210 211 /** 212 * The number of HTML {@code TagNode} elements that have a {@code 'maxlength'} attribute, when 213 * queried using {@code TagNode.AV("maxlength")} 214 */ 215 public final short hasAVmaxlength; 216 217 /** 218 * The number of HTML {@code TagNode} elements that have a {@code 'accept'} attribute, when 219 * queried using {@code TagNode.AV("accept")} 220 */ 221 public final short hasAVaccept; 222 223 224 225 226 227 228 /** This is identical to the value returned by: {@code pageVector.size()} */ 229 public final int numNodes; 230 231 /** 232 * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an 233 * {@code "instanceof" TagNode}. 234 * 235 * @see Util.Count#tagNodes(Vector) 236 * @see Util.Count#tagNodes(Vector, int, int) 237 */ 238 public final int numTagNodes; 239 240 /** 241 * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an 242 * {@code "instanceof" TextNode}. 243 * 244 * @see Util.Count#textNodes(Vector) 245 * @see Util.Count#textNodes(Vector, int, int) 246 */ 247 public final int numTextNodes; 248 249 /** 250 * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an 251 * {@code "instanceof" CommentNode}. 252 * 253 * @see Util.Count#commentNodes(Vector) 254 * @see Util.Count#commentNodes(Vector, int, int) 255 */ 256 public final int numCommentNodes; 257 258 /** 259 * This is the total number of new-line {@code '\n'} characters found inside any 260 * {@code TextNode} present in the page-vector. 261 * 262 * @see Util.Count#newLines(Vector) 263 * @see Util.Count#newLines(Vector, int, int) 264 */ 265 public final int numNewLines; 266 267 /** 268 * The total number of HTML <SPAN CLASS="PAGESTATS"><IMG ...></SPAN> {@code 'TagNode'} 269 * elements found on the page. 270 * 271 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 272 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 273 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 274 * by {@code class PageStats} 275 */ 276 public final short numImages; 277 278 /** 279 * The total number of HTML <SPAN CLASS="PAGESTATS"><META ...></SPAN> {@code 'TagNode'} 280 * elements found on the page. 281 * 282 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 283 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 284 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 285 * by {@code class PageStats} 286 */ 287 public final short numMeta; 288 289 /** 290 * The total number of HTML <SPAN CLASS="PAGESTATS"><LINK ...></SPAN> {@code 'TagNode'} 291 * elements found on the page. 292 * 293 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 294 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 295 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 296 * by {@code class PageStats} 297 */ 298 public final short numLink; 299 300 /** 301 * The total number of HTML <SPAN CLASS="PAGESTATS"><INPUT ...></SPAN> {@code 'TagNode'} 302 * elements found on the page. 303 * 304 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 305 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 306 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 307 * by {@code class PageStats} 308 */ 309 public final short numInput; 310 311 /** 312 * The total number of HTML <SPAN CLASS="PAGESTATS"><EMBED ...></SPAN> {@code 'TagNode'} 313 * elements found on this page. 314 * 315 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 316 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 317 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 318 * by {@code class PageStats} 319 */ 320 public final short numEmbed; 321 322 /** 323 * The total number of HTML <SPAN CLASS="PAGESTATS"><HR></SPAN> {@code 'TagNode'} 324 * elements found on the page. 325 * 326 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 327 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 328 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 329 * by {@code class PageStats} 330 */ 331 public final short numHR; 332 333 /** 334 * The total number of HTML <SPAN CLASS="PAGESTATS"><BR></SPAN> {@code 'TagNode'} 335 * elements found on the page. 336 * 337 * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</B></SPAN> This is considered an HTML-5 338 * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the 339 * Element. If there are (accidentally) closing-versions of this tag, they will not be counted 340 * by {@code class PageStats} 341 */ 342 public final short numBR; 343 344 345 /** 346 * The total number of <SPAN CLASS="PAGESTATS"><TABLE>, TC.OpeningTags, TagNode</SPAN> 347 * elements found on the page. Any element-internal attributes / inner-tags actually found 348 * inside the HTML element will just be ignored for the purposes of this count. 349 */ 350 public final short numOpenTables; 351 352 /** 353 * The total number of <SPAN CLASS="PAGESTATS"></TABLE>, TC.ClosingTags, TagNode</SPAN> 354 * elements found on the page. 355 */ 356 public final short numClosedTables; 357 358 /** 359 * The total number of <SPAN CLASS="PAGESTATS"><A> (Anchor), TC.OpeningTags, 360 * TagNode</SPAN> elements found on the page. Any element-internal attributes / inner-tags 361 * actually found inside the HTML element will just be ignored for the purposes of this count. 362 */ 363 public final short numOpenAnchors; 364 365 /** 366 * The total number of <SPAN CLASS="PAGESTATS"></A>, TC.ClosingTags, TagNode</SPAN> 367 * elements found on the page. 368 */ 369 public final short numClosedAnchors; 370 371 /** 372 * The total number of <SPAN CLASS="PAGESTATS"><P> (Paragraph), TC.OpeningTags, 373 * TagNode</SPAN> elements found on the page. Any element-internal attributes / inner-tags 374 * actually found inside the HTML element will just be ignored for the purposes of this count. 375 */ 376 public final short numOpenParagraphs; 377 378 /** 379 * The total number of <SPAN CLASS="PAGESTATS"></P>, TC.ClosingTags, TagNode</SPAN> 380 * elements found on the page. 381 */ 382 public final short numClosedParagraphs; 383 384 /** 385 * The total number of <SPAN CLASS="PAGESTATS"><DIV> (Divider), TC.OpeningTags, 386 * TagNode</SPAN> elements found on the page. Any element-internal attributes / inner-tags 387 * actually found inside the HTML element will just be ignored for the purposes of this count. 388 */ 389 public final short numOpenDivs; 390 391 /** 392 * The total number of <SPAN CLASS="PAGESTATS"></DIV>, TC.ClosingTags, TagNode</SPAN> 393 * elements found on the page. 394 */ 395 public final short numClosedDivs; 396 397 /** 398 * The total number of <SPAN CLASS="PAGESTATS"><SPAN>, TC.OpeningTags, TagNode</SPAN> 399 * elements found on the page. Any element-internal attributes / inner-tags actually found 400 * inside the HTML element will just be ignored for the purposes of this count. 401 */ 402 public final short numOpenSpans; 403 404 /** 405 * The total number of <SPAN CLASS="PAGESTATS"></SPAN>, TC.ClosingTags, TagNode</SPAN> 406 * elements found on the page. 407 */ 408 public final short numClosedSpans; 409 410 /** 411 * The total number of <SPAN CLASS="PAGESTATS"><SCRIPT>, TC.OpeningTags, TagNode</SPAN> 412 * elements found on the page. Any element-internal attributes / inner-tags actually found 413 * inside the HTML element will just be ignored for the purposes of this count. 414 */ 415 public final short numOpenScripts; 416 417 /** 418 * The total number of <SPAN CLASS="PAGESTATS"></SCRIPT>, TC.ClosingTags, TagNode</SPAN> 419 * elements found on the page. 420 */ 421 public final short numClosedScripts; 422 423 /** 424 * The total number of <SPAN CLASS="PAGESTATS"><STYLE>, TC.OpeningTags, TagNode</SPAN> 425 * elements found on the page. Any element-internal attributes / inner-tags actually found 426 * inside the HTML element will just be ignored for the purposes of this count. 427 */ 428 public final short numOpenStyles; 429 430 /** 431 * The total number of <SPAN CLASS="PAGESTATS"></STYLE>, TC.ClosingTags, TagNode</SPAN> 432 * elements found on the page. 433 */ 434 public final short numClosedStyles; 435 436 /** 437 * The total number of <SPAN CLASS="PAGESTATS"><FRAME>, TC.OpeningTags, TagNode</SPAN> 438 * elements found on the page. Any element-internal attributes / inner-tags actually found 439 * inside the HTML element will just be ignored for the purposes of this count. 440 */ 441 public final short numOpenFrames; 442 443 /** 444 * The total number of <SPAN CLASS="PAGESTATS"></FRAME>, TC.ClosingTags, TagNode</SPAN> 445 * elements found on the page. 446 */ 447 public final short numClosedFrames; 448 449 /** 450 * The total number of <SPAN CLASS="PAGESTATS"><IFRAME>, TC.OpeningTags, TagNode</SPAN> 451 * elements found on the page. Any element-internal attributes / inner-tags actually found 452 * inside the HTML element will just be ignored for the purposes of this count. 453 */ 454 public final short numOpenIFrames; 455 456 /** 457 * The total number of <SPAN CLASS="PAGESTATS"></IFRAME>, TC.ClosingTags, TagNode</SPAN> 458 * elements found on the page. 459 */ 460 public final short numClosedIFrames; 461 462 /** 463 * The total number of <SPAN CLASS="PAGESTATS"><FORM>, TC.OpeningTags, TagNode</SPAN> 464 * elements found on the page. Any element-internal attributes / inner-tags actually found 465 * inside the HTML element will just be ignored for the purposes of this count. 466 */ 467 public final short numOpenForms; 468 469 /** 470 * The total number of <SPAN CLASS="PAGESTATS"></FORM>, TC.ClosingTags, TagNode</SPAN> 471 * elements found on the page. 472 */ 473 public final short numClosedForms; 474 475 /** 476 * Internally used by the 'clone' method. 477 * 478 * @param otherPageStats This is the instance of PageStats to be copied 479 */ 480 protected PageStats(PageStats otherPageStats) 481 { 482 this.hasAVclass = otherPageStats.hasAVclass; 483 this.hasAVstyle = otherPageStats.hasAVstyle; 484 this.hasAVid = otherPageStats.hasAVid; 485 this.hasAVtitle = otherPageStats.hasAVtitle; 486 this.hasAVhref = otherPageStats.hasAVhref; 487 this.hasAVhreflang = otherPageStats.hasAVhreflang; 488 this.hasAVsrc = otherPageStats.hasAVsrc; 489 this.hasAVsrcset = otherPageStats.hasAVsrcset; 490 this.hasAVsrclang = otherPageStats.hasAVsrclang; 491 this.hasAVsrcdoc = otherPageStats.hasAVsrcdoc; 492 this.hasAValt = otherPageStats.hasAValt; 493 this.hasAVtarget = otherPageStats.hasAVtarget; 494 this.hasAVwidth = otherPageStats.hasAVwidth; 495 this.hasAVheight = otherPageStats.hasAVheight; 496 this.hasAVsize = otherPageStats.hasAVsize; 497 this.hasAVsizes = otherPageStats.hasAVsizes; 498 this.hasAVcols = otherPageStats.hasAVcols; 499 this.hasAVcolspan = otherPageStats.hasAVcolspan; 500 this.hasAVrows = otherPageStats.hasAVrows; 501 this.hasAVrowspan = otherPageStats.hasAVrowspan; 502 this.hasAVwrap = otherPageStats.hasAVwrap; 503 this.hasAVvalue = otherPageStats.hasAVvalue; 504 this.hasAVtype = otherPageStats.hasAVtype; 505 this.hasAVname = otherPageStats.hasAVname; 506 this.hasAVmin = otherPageStats.hasAVmin; 507 this.hasAVmax = otherPageStats.hasAVmax; 508 this.hasAVminlength = otherPageStats.hasAVminlength; 509 this.hasAVmaxlength = otherPageStats.hasAVmaxlength; 510 this.hasAVaccept = otherPageStats.hasAVaccept; 511 512 this.numImages = otherPageStats.numImages; 513 this.numMeta = otherPageStats.numMeta; 514 this.numLink = otherPageStats.numLink; 515 this.numInput = otherPageStats.numInput; 516 this.numEmbed = otherPageStats.numEmbed ; 517 this.numHR = otherPageStats.numHR; 518 this.numBR = otherPageStats.numBR; 519 520 this.numOpenTables = otherPageStats.numOpenTables; 521 this.numClosedTables = otherPageStats.numClosedTables; 522 this.numOpenAnchors = otherPageStats.numOpenAnchors; 523 this.numClosedAnchors = otherPageStats.numClosedAnchors; 524 this.numOpenParagraphs = otherPageStats.numOpenParagraphs; 525 this.numClosedParagraphs = otherPageStats.numClosedParagraphs; 526 this.numOpenDivs = otherPageStats.numOpenDivs; 527 this.numClosedDivs = otherPageStats.numClosedDivs; 528 this.numOpenSpans = otherPageStats.numOpenSpans; 529 this.numClosedSpans = otherPageStats.numClosedSpans; 530 this.numOpenScripts = otherPageStats.numOpenScripts; 531 this.numClosedScripts = otherPageStats.numClosedScripts; 532 this.numOpenStyles = otherPageStats.numOpenStyles; 533 this.numClosedStyles = otherPageStats.numClosedStyles; 534 this.numOpenFrames = otherPageStats.numOpenFrames; 535 this.numClosedFrames = otherPageStats.numClosedFrames; 536 this.numOpenIFrames = otherPageStats.numOpenIFrames; 537 this.numClosedIFrames = otherPageStats.numClosedIFrames; 538 this.numOpenForms = otherPageStats.numOpenForms; 539 this.numClosedForms = otherPageStats.numClosedForms; 540 541 this.strLength = otherPageStats.strLength; 542 this.hash = otherPageStats.hash; 543 this.numNodes = otherPageStats.numNodes; 544 this.numTagNodes = otherPageStats.numTagNodes; 545 this.numTextNodes = otherPageStats.numTextNodes; 546 this.numCommentNodes = otherPageStats.numCommentNodes; 547 this.numNewLines = otherPageStats.numNewLines; 548 } 549 550 /** 551 * Constructs a new instance of PageStats. Assigns values based on a vectorized-webpage to the 552 * individual fields in this class. 553 * 554 * @param page Any HTML page or sub-page. 555 * 556 * @see Util#strLength(Vector) 557 * @see Util#hashCode(Vector) 558 * @see Util.Count#tagNodes(Vector) 559 * @see Util.Count#textNodes(Vector) 560 * @see Util.Count#commentNodes(Vector) 561 * @see Util.Count#newLines(Vector) 562 * @see TagNodeFindInclusive 563 * @see TagNodeCount 564 * @see TagNode#AV(String) 565 */ 566 public PageStats(Vector<HTMLNode> page) 567 { 568 this.strLength = Util.strLength(page); 569 this.hash = Util.hashCode(page); 570 this.numNodes = page.size(); 571 this.numTagNodes = Util.Count.tagNodes(page); 572 this.numTextNodes = Util.Count.textNodes(page); 573 this.numCommentNodes = Util.Count.commentNodes(page); 574 this.numNewLines = Util.Count.newLines(page); 575 576 short numImages, numMeta, numLink, numInput, numEmbed, numHR, numBR; 577 numImages = numMeta = numLink = numInput = numEmbed = numHR = numBR = 0; 578 579 short hasAVclass, hasAVstyle, hasAVid, hasAVtitle, hasAVhref, hasAVhreflang, hasAVsrc; 580 short hasAVsrcset, hasAVsrclang, hasAVsrcdoc, hasAValt, hasAVtarget, hasAVwidth; 581 short hasAVheight, hasAVsize, hasAVsizes, hasAVcols, hasAVcolspan, hasAVrows, hasAVrowspan; 582 short hasAVwrap, hasAVvalue, hasAVtype, hasAVname, hasAVmin, hasAVmax, hasAVminlength; 583 short hasAVmaxlength, hasAVaccept; 584 585 hasAVclass = hasAVstyle = hasAVid = hasAVtitle = hasAVhref = hasAVhreflang = hasAVsrc = 586 hasAVsrcset = hasAVsrclang = hasAVsrcdoc = hasAValt = hasAVtarget = hasAVwidth = 587 hasAVheight = hasAVsize = hasAVsizes = hasAVcols = hasAVcolspan = hasAVrows = 588 hasAVrowspan = hasAVwrap = hasAVvalue = hasAVtype = hasAVname = hasAVmin = hasAVmax = 589 hasAVminlength = hasAVmaxlength = hasAVaccept = 0; 590 591 short numOpenTables, numClosedTables, numOpenAnchors, numClosedAnchors, numOpenParagraphs; 592 short numClosedParagraphs, numOpenDivs, numClosedDivs, numOpenSpans, numClosedSpans; 593 short numOpenScripts, numClosedScripts, numOpenStyles, numClosedStyles, numOpenFrames; 594 short numClosedFrames, numOpenIFrames, numClosedIFrames, numOpenForms, numClosedForms; 595 596 numOpenTables = numClosedTables = numOpenAnchors = numClosedAnchors = numOpenParagraphs = 597 numClosedParagraphs = numOpenDivs = numClosedDivs = numOpenSpans = numClosedSpans = 598 numOpenScripts = numClosedScripts = numOpenStyles = numClosedStyles = numOpenFrames = 599 numClosedFrames = numOpenIFrames = numClosedIFrames = numOpenForms = numClosedForms = 0; 600 601 for (HTMLNode n : page) if (n.isTagNode()) 602 { 603 TagNode tn = (TagNode) n; 604 List<String> l = tn.allAN().collect(Collectors.toList()); 605 606 if (l.size() > 0) 607 { 608 if (l.contains("class")) { l.remove("class"); hasAVclass++; } 609 if (l.contains("style")) { l.remove("style"); hasAVstyle++; } 610 if (l.contains("id")) { l.remove("id"); hasAVid++; } 611 if (l.contains("title")) { l.remove("title"); hasAVtitle++; } 612 if (l.contains("href")) { l.remove("href"); hasAVhref++; } 613 if (l.contains("hreflang")) { l.remove("hreflang"); hasAVhreflang++; } 614 if (l.contains("src")) { l.remove("src"); hasAVsrc++; } 615 if (l.contains("srcset")) { l.remove("srcset"); hasAVsrcset++; } 616 if (l.contains("srclang")) { l.remove("srclang"); hasAVsrclang++; } 617 if (l.contains("srcdoc")) { l.remove("srcdoc"); hasAVsrcdoc++; } 618 if (l.contains("alt")) { l.remove("alt"); hasAValt++; } 619 if (l.contains("target")) { l.remove("target"); hasAVtarget++; } 620 if (l.contains("width")) { l.remove("width"); hasAVwidth++; } 621 if (l.contains("height")) { l.remove("height"); hasAVheight++; } 622 if (l.contains("size")) { l.remove("size"); hasAVsize++; } 623 if (l.contains("sizes")) { l.remove("sizes"); hasAVsizes++; } 624 if (l.contains("cols")) { l.remove("cols"); hasAVcols++; } 625 if (l.contains("colspan")) { l.remove("colspan"); hasAVcolspan++; } 626 if (l.contains("rows")) { l.remove("rows"); hasAVrows++; } 627 if (l.contains("rowspan")) { l.remove("rowspan"); hasAVrowspan++; } 628 if (l.contains("wrap")) { l.remove("wrap"); hasAVwrap++; } 629 if (l.contains("value")) { l.remove("value"); hasAVvalue++; } 630 if (l.contains("type")) { l.remove("type"); hasAVtype++; } 631 if (l.contains("name")) { l.remove("name"); hasAVname++; } 632 if (l.contains("min")) { l.remove("min"); hasAVmin++; } 633 if (l.contains("max")) { l.remove("max"); hasAVmax++; } 634 if (l.contains("minlength")) { l.remove("minlength"); hasAVminlength++; } 635 if (l.contains("maxlength")) { l.remove("maxlength"); hasAVmaxlength++; } 636 if (l.contains("accept")) { l.remove("accept"); hasAVaccept++; } 637 } 638 639 if (tn.tok.equals("img")) numImages++; 640 else if (tn.tok.equals("meta")) numMeta++; 641 else if (tn.tok.equals("link")) numLink++; 642 else if (tn.tok.equals("input")) numInput++; 643 else if (tn.tok.equals("embed")) numEmbed++; 644 else if (tn.tok.equals("hr")) numHR++; 645 else if (tn.tok.equals("br")) numBR++; 646 647 648 if (tn.tok.equals("table")) 649 { if (tn.isClosing) numClosedTables++; else numOpenTables++; } 650 651 else if (tn.tok.equals("a")) 652 { if (tn.isClosing) numClosedAnchors++; else numOpenAnchors++; } 653 654 else if (tn.tok.equals("p")) 655 { if (tn.isClosing) numClosedParagraphs++; else numOpenParagraphs++; } 656 657 else if (tn.tok.equals("div")) 658 { if (tn.isClosing) numClosedDivs++; else numOpenDivs++; } 659 660 else if (tn.tok.equals("span")) 661 { if (tn.isClosing) numClosedSpans++; else numOpenSpans++; } 662 663 else if (tn.tok.equals("script")) 664 { if (tn.isClosing) numClosedScripts++; else numOpenScripts++; } 665 666 else if (tn.tok.equals("style")) 667 { if (tn.isClosing) numClosedStyles++; else numOpenStyles++; } 668 669 else if (tn.tok.equals("frame")) 670 { if (tn.isClosing) numClosedFrames++; else numOpenFrames++; } 671 672 else if (tn.tok.equals("iframe")) 673 { if (tn.isClosing) numClosedIFrames++; else numOpenIFrames++; } 674 675 else if (tn.tok.equals("form")) 676 { if (tn.isClosing) numClosedForms++; else numOpenForms++; } 677 } 678 679 this.numImages = numImages; 680 this.numMeta = numMeta; 681 this.numLink = numLink; 682 this.numInput = numInput; 683 this.numEmbed = numEmbed; 684 this.numHR = numHR; 685 this.numBR = numBR; 686 687 688 this.hasAVclass = hasAVclass; 689 this.hasAVstyle = hasAVstyle; 690 this.hasAVid = hasAVid; 691 this.hasAVtitle = hasAVtitle; 692 this.hasAVhref = hasAVhref; 693 this.hasAVhreflang = hasAVhreflang; 694 this.hasAVsrc = hasAVsrc; 695 this.hasAVsrcset = hasAVsrcset; 696 this.hasAVsrclang = hasAVsrclang; 697 this.hasAVsrcdoc = hasAVsrcdoc; 698 this.hasAValt = hasAValt; 699 this.hasAVtarget = hasAVtarget; 700 this.hasAVwidth = hasAVwidth; 701 this.hasAVheight = hasAVheight; 702 this.hasAVsize = hasAVsize; 703 this.hasAVsizes = hasAVsizes; 704 this.hasAVcols = hasAVcols; 705 this.hasAVcolspan = hasAVcolspan; 706 this.hasAVrows = hasAVrows; 707 this.hasAVrowspan = hasAVrowspan; 708 this.hasAVwrap = hasAVwrap; 709 this.hasAVvalue = hasAVvalue; 710 this.hasAVtype = hasAVtype; 711 this.hasAVname = hasAVname; 712 this.hasAVmin = hasAVmin; 713 this.hasAVmax = hasAVmax; 714 this.hasAVminlength = hasAVminlength; 715 this.hasAVmaxlength = hasAVmaxlength; 716 this.hasAVaccept = hasAVaccept; 717 718 719 this.numOpenTables = numOpenTables; 720 this.numClosedTables = numClosedTables; 721 this.numOpenAnchors = numOpenAnchors; 722 this.numClosedAnchors = numClosedAnchors; 723 this.numOpenParagraphs = numOpenParagraphs; 724 this.numClosedParagraphs = numClosedParagraphs; 725 this.numOpenDivs = numOpenDivs; 726 this.numClosedDivs = numClosedDivs; 727 this.numOpenSpans = numOpenSpans; 728 this.numClosedSpans = numClosedSpans; 729 this.numOpenScripts = numOpenScripts; 730 this.numClosedScripts = numClosedScripts; 731 this.numOpenStyles = numOpenStyles; 732 this.numClosedStyles = numClosedStyles; 733 this.numOpenFrames = numOpenFrames; 734 this.numClosedFrames = numClosedFrames; 735 this.numOpenIFrames = numOpenIFrames; 736 this.numClosedIFrames = numClosedIFrames; 737 this.numOpenForms = numOpenForms; 738 this.numClosedForms = numClosedForms; 739 } 740 741 /** 742 * Java's {@code public boolean equals(Object o)} requirements. 743 * 744 * @param o This may be any Java Object, but only ones of {@code 'this'} type whose 745 * internal-values are identical will cause this method to return {@code TRUE}. 746 * 747 * @return {@code TRUE} if {@code 'this'} instance of {@code PageStats} is identical to 748 * parameter {@code 'o'.} 749 */ 750 public boolean equals(Object o) 751 { 752 if (! (o instanceof PageStats)) return false; 753 754 PageStats otherPageStats = (PageStats) o; 755 756 return 757 (this.strLength == otherPageStats.strLength) && 758 (this.hash == otherPageStats.hash) && 759 (this.numNodes == otherPageStats.numNodes) && 760 (this.numTagNodes == otherPageStats.numTagNodes) && 761 (this.numTextNodes == otherPageStats.numTextNodes) && 762 (this.numCommentNodes == otherPageStats.numCommentNodes) && 763 (this.numNewLines == otherPageStats.numNewLines) && 764 765 766 (this.hasAVclass == otherPageStats.hasAVclass) && 767 (this.hasAVstyle == otherPageStats.hasAVstyle) && 768 (this.hasAVid == otherPageStats.hasAVid) && 769 (this.hasAVtitle == otherPageStats.hasAVtitle) && 770 (this.hasAVhref == otherPageStats.hasAVhref) && 771 (this.hasAVhreflang == otherPageStats.hasAVhreflang) && 772 (this.hasAVsrc == otherPageStats.hasAVsrc) && 773 (this.hasAVsrcset == otherPageStats.hasAVsrcset) && 774 (this.hasAVsrclang == otherPageStats.hasAVsrclang) && 775 (this.hasAVsrcdoc == otherPageStats.hasAVsrcdoc) && 776 (this.hasAValt == otherPageStats.hasAValt) && 777 (this.hasAVtarget == otherPageStats.hasAVtarget) && 778 (this.hasAVwidth == otherPageStats.hasAVwidth) && 779 (this.hasAVheight == otherPageStats.hasAVheight) && 780 (this.hasAVsize == otherPageStats.hasAVsize) && 781 (this.hasAVsizes == otherPageStats.hasAVsizes) && 782 (this.hasAVcols == otherPageStats.hasAVcols) && 783 (this.hasAVcolspan == otherPageStats.hasAVcolspan) && 784 (this.hasAVrows == otherPageStats.hasAVrows) && 785 (this.hasAVrowspan == otherPageStats.hasAVrowspan) && 786 (this.hasAVwrap == otherPageStats.hasAVwrap) && 787 (this.hasAVvalue == otherPageStats.hasAVvalue) && 788 (this.hasAVtype == otherPageStats.hasAVtype) && 789 (this.hasAVname == otherPageStats.hasAVname) && 790 (this.hasAVmin == otherPageStats.hasAVmin) && 791 (this.hasAVmax == otherPageStats.hasAVmax) && 792 (this.hasAVminlength == otherPageStats.hasAVminlength) && 793 (this.hasAVmaxlength == otherPageStats.hasAVmaxlength) && 794 (this.hasAVaccept == otherPageStats.hasAVaccept) && 795 796 797 (this.numImages == otherPageStats.numImages) && 798 (this.numMeta == otherPageStats.numMeta) && 799 (this.numLink == otherPageStats.numLink) && 800 (this.numInput == otherPageStats.numInput) && 801 (this.numEmbed == otherPageStats.numEmbed) && 802 (this.numHR == otherPageStats.numHR) && 803 (this.numBR == otherPageStats.numBR) && 804 805 806 (this.numOpenTables == otherPageStats.numOpenTables) && 807 (this.numClosedTables == otherPageStats.numClosedTables) && 808 809 (this.numOpenAnchors == otherPageStats.numOpenAnchors) && 810 (this.numClosedAnchors == otherPageStats.numClosedAnchors) && 811 812 (this.numOpenParagraphs == otherPageStats.numOpenParagraphs) && 813 (this.numClosedParagraphs == otherPageStats.numClosedParagraphs) && 814 815 (this.numOpenDivs == otherPageStats.numOpenDivs) && 816 (this.numClosedDivs == otherPageStats.numClosedDivs) && 817 818 (this.numOpenSpans == otherPageStats.numOpenSpans) && 819 (this.numClosedSpans == otherPageStats.numClosedSpans) && 820 821 (this.numOpenScripts == otherPageStats.numOpenScripts) && 822 (this.numClosedScripts == otherPageStats.numClosedScripts) && 823 824 (this.numOpenStyles == otherPageStats.numOpenStyles) && 825 (this.numClosedStyles == otherPageStats.numClosedStyles) && 826 827 (this.numOpenFrames == otherPageStats.numOpenFrames) && 828 (this.numClosedFrames == otherPageStats.numClosedFrames) && 829 830 (this.numOpenIFrames == otherPageStats.numOpenIFrames) && 831 (this.numClosedIFrames == otherPageStats.numClosedIFrames) && 832 833 (this.numOpenForms == otherPageStats.numOpenForms) && 834 (this.numClosedForms == otherPageStats.numClosedForms); 835 } 836 837 /** 838 * Java's hash-code requirement. Notice that this method is not static, and provides the 839 * hashCode that was computed for the vectorized-webpage when this instance of 840 * {@code PageStats} was created. Perhaps the subtlety is noticeable - <I>there is also a 841 * {@code public static} version of method {@code int hashCode(); }</I> Both of them will 842 * return the same number, but only one of them actually computes a hash-code (the 843 * static-method). This non-static merely retrieves the hash that was created when this 844 * instance was built by the constructor. 845 * 846 * @return A hash-code that may be used when storing this node in a java sorted-collection. 847 */ 848 public int hashCode() { return this.hash; } 849 850 /** 851 * Java's {@code interface Comparable<T>} requirements. This does a very simple comparison 852 * using the underlying field {@code final String str} that all HTMLNode's contain. 853 * 854 * @param other Any other {@code PageStats} to be compared to {@code 'this' PageStats} 855 * 856 * @return An integer that fulfills Java's {@code interface Comparable<T> public boolean 857 * compareTo(T t)} method requirements. 858 */ 859 public int compareTo(PageStats other) 860 { 861 int compare1 = this.numNodes - other.numNodes; 862 if (compare1 != 0) return compare1; 863 return this.strLength - other.strLength; 864 } 865 866 867 /** 868 * This converts a {@code PageStats} object to a simple-string (Base64 Encoded) object that may 869 * be passed and transmitted as a String. 870 * 871 * @return Zipped, Serialized, Base-64 Encoded String version of this object. 872 * 873 * @see StringParse#objToB64Str(Object) 874 */ 875 public String toB64String() 876 { try { return StringParse.objToB64Str(this); } catch (Exception e) { return null; } } 877 878 /** 879 * Convets a Base65 Encoded {@code String} into an instance of {@code PageStats} 880 * 881 * @param minimized A previously minimized, compressed, Serialized version of this object 882 * ({@code PageStats}). 883 * 884 * @return An instance of this class. 885 * 886 * @see StringParse#b64StrToObj(String) 887 */ 888 public static PageStats fromB64String(String minimized) 889 { 890 try 891 { return (PageStats) StringParse.b64StrToObj(minimized); } 892 893 catch (Exception e) { return null; } 894 } 895 896 /** 897 * Generates a carbon copy of passed reference instance {@code 'PageStats'} 898 * 899 * @return Returns a 'clone' of this vector. Utilizes {@code 'this' class, protected 900 * constructor}. 901 */ 902 public PageStats clone() { return new PageStats(this); } 903 904 905 /** 906 * Generates a java string representation of {@code 'this' instance} of {@code class PageStats} 907 * 908 * @return a java string of all the details encapsulated by a {@code PageStats} object 909 * reference. 910 */ 911 public String toString() 912 { 913 return 914 "strLength = " + strLength + '\n' + 915 "hash = " + hash + '\n' + 916 "numNodes = " + numNodes + '\n' + 917 "numTagNodes = " + numTagNodes + '\n' + 918 "numTextNodes = " + numTextNodes + '\n' + 919 "numCommentNodes = " + numCommentNodes + '\n' + 920 "numNewLines = " + numNewLines + '\n' + 921 922 "\n" + 923 924 "hasAVclass = " + hasAVclass + "\n" + 925 "hasAVstyle = " + hasAVstyle + "\n" + 926 "hasAVid = " + hasAVid + "\n" + 927 "hasAVtitle = " + hasAVtitle + "\n" + 928 "hasAVhref = " + hasAVhref + "\n" + 929 "hasAVhreflang = " + hasAVhreflang + "\n" + 930 "hasAVsrc = " + hasAVsrc + "\n" + 931 "hasAVsrcset = " + hasAVsrcset + "\n" + 932 "hasAVsrclang = " + hasAVsrclang + "\n" + 933 "hasAVsrcdoc = " + hasAVsrcdoc + "\n" + 934 "hasAValt = " + hasAValt + "\n" + 935 "hasAVtarget = " + hasAVtarget + "\n" + 936 "hasAVwidth = " + hasAVwidth + "\n" + 937 "hasAVheight = " + hasAVheight + "\n" + 938 "hasAVsize = " + hasAVsize + "\n" + 939 "hasAVsizes = " + hasAVsizes + "\n" + 940 "hasAVcols = " + hasAVcols + "\n" + 941 "hasAVcolspan = " + hasAVcolspan + "\n" + 942 "hasAVrows = " + hasAVrows + "\n" + 943 "hasAVrowspan = " + hasAVrowspan + "\n" + 944 "hasAVwrap = " + hasAVwrap + "\n" + 945 "hasAVvalue = " + hasAVvalue + "\n" + 946 "hasAVtype = " + hasAVtype + "\n" + 947 "hasAVname = " + hasAVname + "\n" + 948 "hasAVmin = " + hasAVmin + "\n" + 949 "hasAVmax = " + hasAVmax + "\n" + 950 "hasAVminlength = " + hasAVminlength + "\n" + 951 "hasAVmaxlength = " + hasAVmaxlength + "\n" + 952 "hasAVaccept = " + hasAVaccept + "\n" + 953 954 "\n" + 955 956 "numImages = " + numImages + '\n' + 957 "numMeta = " + numMeta + '\n' + 958 "numLink = " + numLink + '\n' + 959 "numInput = " + numInput + '\n' + 960 "numEmbed = " + numEmbed + '\n' + 961 "numHR = " + numHR + '\n' + 962 "numBR = " + numBR + '\n' + 963 964 "\n" + 965 966 "numOpenTables = " + numOpenTables + '\n' + 967 "numClosedTables = " + numClosedTables + '\n' + 968 969 "numOpenAnchors = " + numOpenAnchors + '\n' + 970 "numClosedAnchors = " + numClosedAnchors + '\n' + 971 972 "numOpenParagraphs = " + numOpenParagraphs + '\n' + 973 "numClosedParagraphs = " + numClosedParagraphs + '\n' + 974 975 "numOpenDivs = " + numOpenDivs + '\n' + 976 "numClosedDivs = " + numClosedDivs + '\n' + 977 978 "numOpenSpans = " + numOpenSpans + '\n' + 979 "numClosedSpans = " + numClosedSpans + '\n' + 980 981 "numOpenScripts = " + numOpenScripts + '\n' + 982 "numClosedScripts = " + numClosedScripts + '\n' + 983 984 "numOpenStyles = " + numOpenStyles + '\n' + 985 "numClosedStyles = " + numClosedStyles + '\n' + 986 987 "numOpenFrames = " + numOpenFrames + '\n' + 988 "numClosedFrames = " + numClosedFrames + '\n' + 989 990 "numOpenIFrames = " + numOpenIFrames + '\n' + 991 "numClosedIFrames = " + numClosedIFrames + '\n' + 992 993 "numOpenForms = " + numOpenForms + '\n' + 994 "numClosedForms = " + numClosedForms + '\n'; 995 } 996}