001package Torello.HTML;
002
003import java.util.*;
004import java.util.regex.*;
005import java.util.stream.*;
006import java.io.Serializable;
007import java.lang.reflect.*;
008
009import Torello.HTML.NodeSearch.*;
010import Torello.Java.*;
011
012/**
013 * Computes miscellaneous statistics for a web-page, or sub-page.
014 * 
015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PAGE_STATS>
016 *
017 * <STYLE TYPE="text/css">
018 * .PAGESTATS
019 * { color: red; font-weight: bold; font-family: "Courier New", Courier, monospace; }
020 * </STYLE>
021 */
022public class PageStats implements Serializable, Comparable<PageStats>, Cloneable
023{
024    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
025    public static final long serialVersionUID = 1;
026
027    /**
028     * If a Vectorized HTML page were converted to a String, this would be the length of that
029     * string.
030     * 
031     * @see Util#strLength(Vector)
032     * @see Util#strLength(Vector, int, int)
033     */
034    public final int strLength;
035
036    /**
037     * The string hash-code of the vectorized-HTML webpage, as if it were being represented as one
038     * single {@code java.lang.String}.
039     * 
040     * @see Util#hashCode(Vector)
041     * @see Util#hashCode(Vector, int, int)
042     */
043    public final int hash;
044
045
046
047
048
049    /**
050     * The number of HTML {@code TagNode} elements that have a {@code 'class'} attribute, when
051     * queried using {@code TagNode.AV("class")}
052     */
053    public final short hasAVclass;
054
055    /**
056     * The number of HTML {@code TagNode} elements that have a {@code 'style'} attribute, when
057     * queried using {@code TagNode.AV("style")}
058     */
059    public final short hasAVstyle;
060
061    /**
062     * The number of HTML {@code TagNode} elements that have an {@code 'id'} attribute, when
063     * queried using {@code TagNode.AV("id")}
064     */
065    public final short hasAVid;
066
067    /**
068     * The number of HTML {@code TagNode} elements that have a {@code 'title'} attribute, when
069     * queried using {@code TagNode.AV("title")}
070     */
071    public final short hasAVtitle;
072
073    /**
074     * The number of HTML {@code TagNode} elements that have an {@code 'href'} attribute, when
075     * queried using {@code TagNode.AV("href")}
076     */
077    public final short hasAVhref;
078
079    /**
080     * The number of HTML {@code TagNode} elements that have a {@code 'hreflang'} attribute, when
081     * queried using {@code TagNode.AV("hreflang")}
082     */
083    public final short hasAVhreflang;
084
085    /**
086     * The number of HTML {@code TagNode} elements that have a {@code 'src'} attribute, when
087     * queried using {@code TagNode.AV("src")}
088     */
089    public final short hasAVsrc;
090
091    /**
092     * The number of HTML {@code TagNode} elements that have a {@code 'srcset'} attribute, when
093     * queried using {@code TagNode.AV("srcset")}
094     */
095    public final short hasAVsrcset;
096
097    /**
098     * The number of HTML {@code TagNode} elements that have a {@code 'srclang'} attribute, when
099     * queried using {@code TagNode.AV("srclang")}
100     */
101    public final short hasAVsrclang;
102
103    /**
104     * The number of HTML {@code TagNode} elements that have a {@code 'srcdoc'} attribute, when
105     * queried using {@code TagNode.AV("srcdoc")}
106     */
107    public final short hasAVsrcdoc;
108
109    /**
110     * The number of HTML {@code TagNode} elements that have an {@code 'alt'} attribute, when
111     * queried using {@code TagNode.AV("alt")}
112     */
113    public final short hasAValt;
114
115    /**
116     * The number of HTML {@code TagNode} elements that have a {@code 'target'} attribute, when
117     * queried using {@code TagNode.AV("target")}
118     */
119    public final short hasAVtarget;
120
121    /**
122     * The number of HTML {@code TagNode} elements that have a {@code 'width'} attribute, when
123     * queried using {@code TagNode.AV("width")}
124     */
125    public final short hasAVwidth;
126
127    /**
128     * The number of HTML {@code TagNode} elements that have a {@code 'height'} attribute, when
129     * queried using {@code TagNode.AV("height")}
130     */
131    public final short hasAVheight;
132
133    /**
134     * The number of HTML {@code TagNode} elements that have a {@code 'size'} attribute, when
135     * queried using {@code TagNode.AV("size")}
136     */
137    public final short hasAVsize;
138
139    /**
140     * The number of HTML {@code TagNode} elements that have a {@code 'sizes'} attribute, when
141     * queried using {@code TagNode.AV("sizes")}
142     */
143    public final short hasAVsizes;
144
145    /**
146     * The number of HTML {@code TagNode} elements that have a {@code 'cols'} attribute, when
147     * queried using {@code TagNode.AV("cols")}
148     */
149    public final short hasAVcols;
150
151    /**
152     * The number of HTML {@code TagNode} elements that have a {@code 'colspan'} attribute, when
153     * queried using {@code TagNode.AV("colspan")}
154     */
155    public final short hasAVcolspan;
156
157    /**
158     * The number of HTML {@code TagNode} elements that have a {@code 'rows'} attribute, when
159     * queried using {@code TagNode.AV("rows")}
160     */
161    public final short hasAVrows;
162
163    /**
164     * The number of HTML {@code TagNode} elements that have a {@code 'rowspan'} attribute, when
165     * queried using {@code TagNode.AV("rowspan")}
166     */
167    public final short hasAVrowspan;
168
169    /**
170     * The number of HTML {@code TagNode} elements that have a {@code 'wrap'} attribute, when
171     * queried using {@code TagNode.AV("wrap")}
172     */
173    public final short hasAVwrap;
174
175    /**
176     * The number of HTML {@code TagNode} elements that have a {@code 'value'} attribute, when
177     * queried using {@code TagNode.AV("value")}
178     */
179    public final short hasAVvalue;
180
181    /**
182     * The number of HTML {@code TagNode} elements that have a {@code 'type'} attribute, when
183     * queried using {@code TagNode.AV("type")}
184     */
185    public final short hasAVtype;
186
187    /**
188     * The number of HTML {@code TagNode} elements that have a {@code 'name'} attribute, when
189     * queried using {@code TagNode.AV("name")}
190     */
191    public final short hasAVname;
192
193    /**
194     * The number of HTML {@code TagNode} elements that have a {@code 'min'} attribute, when
195     * queried using {@code TagNode.AV("min")}
196     */
197    public final short hasAVmin;
198
199    /**
200     * The number of HTML {@code TagNode} elements that have a {@code 'max'} attribute, when
201     * queried using {@code TagNode.AV("max")}
202     */
203    public final short hasAVmax;
204
205    /**
206     * The number of HTML {@code TagNode} elements that have a {@code 'minlength'} attribute, when
207     * queried using {@code TagNode.AV("minlength")}
208     */
209    public final short hasAVminlength;
210
211    /**
212     * The number of HTML {@code TagNode} elements that have a {@code 'maxlength'} attribute, when
213     * queried using {@code TagNode.AV("maxlength")}
214     */
215    public final short hasAVmaxlength;
216
217    /**
218     * The number of HTML {@code TagNode} elements that have a {@code 'accept'} attribute, when
219     * queried using {@code TagNode.AV("accept")}
220     */
221    public final short hasAVaccept;
222
223
224
225
226
227
228    /** This is identical to the value returned by: {@code pageVector.size()} */
229    public final int numNodes;
230
231    /**
232     * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an
233     * {@code "instanceof" TagNode}.
234     * 
235     * @see Util.Count#tagNodes(Vector)
236     * @see Util.Count#tagNodes(Vector, int, int)
237     */
238    public final int numTagNodes;
239
240    /**
241     * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an
242     * {@code "instanceof" TextNode}.
243     * 
244     * @see Util.Count#textNodes(Vector)
245     * @see Util.Count#textNodes(Vector, int, int)
246     */
247    public final int numTextNodes;
248
249    /**
250     * The number of {@code HTMLNode's} in the {@code Vector<HTMLNode>} that qualify as an
251     * {@code "instanceof" CommentNode}.
252     * 
253     * @see Util.Count#commentNodes(Vector)
254     * @see Util.Count#commentNodes(Vector, int, int)
255     */
256    public final int numCommentNodes;
257
258    /**
259     * This is the total number of new-line {@code '\n'} characters found inside any
260     * {@code TextNode} present in the page-vector.
261     * 
262     * @see Util.Count#newLines(Vector)
263     * @see Util.Count#newLines(Vector, int, int)
264     */
265    public final int numNewLines;
266
267    /**
268     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;IMG ...&gt;</SPAN> {@code 'TagNode'}
269     * elements found on the page.
270     * 
271     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
272     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
273     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
274     * by {@code class PageStats}
275     */
276    public final short numImages;
277
278    /**
279     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;META ...&gt;</SPAN> {@code 'TagNode'}
280     * elements found on the page.
281     * 
282     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
283     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
284     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
285     * by {@code class PageStats}
286     */
287    public final short numMeta;
288
289    /** 
290     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;LINK ...&gt;</SPAN> {@code 'TagNode'}
291     * elements found on the page.
292     * 
293     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
294     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
295     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
296     * by {@code class PageStats}
297     */
298    public final short numLink;
299
300    /**
301     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;INPUT ...&gt;</SPAN> {@code 'TagNode'}
302     * elements found on the page.
303     * 
304     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
305     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
306     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
307     * by {@code class PageStats}
308     */
309    public final short numInput;
310
311    /**
312     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;EMBED ...&gt;</SPAN> {@code 'TagNode'}
313     * elements found on this page.
314     * 
315     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
316     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
317     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
318     * by {@code class PageStats}
319     */
320    public final short numEmbed;
321
322    /**
323     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;HR&gt;</SPAN> {@code 'TagNode'}
324     * elements found on the page.
325     * 
326     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
327     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
328     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
329     * by {@code class PageStats}
330     */
331    public final short numHR;
332
333    /**
334     * The total number of HTML <SPAN CLASS="PAGESTATS">&lt;BR&gt;</SPAN> {@code 'TagNode'}
335     * elements found on the page.
336     * 
337     * <BR /><BR /><B><SPAN STYLE='color: red;'>NOTE:</SPAN></B> This is considered an HTML-5
338     * {@code 'Singleton'} element, and thusly should only have an "Opening-Tag Version" of the
339     * Element.  If there are (accidentally) closing-versions of this tag, they will not be counted
340     * by {@code class PageStats}
341     */
342    public final short numBR;
343
344
345    /**
346     * The total number of <SPAN CLASS="PAGESTATS">&lt;TABLE&gt;, TC.OpeningTags, TagNode</SPAN>
347     * elements found on the page.  Any element-internal attributes / inner-tags actually found
348     * inside the HTML element will just be ignored for the purposes of this count.
349     */
350    public final short numOpenTables;
351
352    /**
353     * The total number of <SPAN CLASS="PAGESTATS">&lt;/TABLE&gt;, TC.ClosingTags, TagNode</SPAN>
354     * elements found on the page.
355     */
356    public final short numClosedTables;
357
358    /**
359     * The total number of <SPAN CLASS="PAGESTATS">&lt;A&gt; (Anchor), TC.OpeningTags,
360     * TagNode</SPAN> elements found on the page.  Any element-internal attributes / inner-tags
361     * actually found inside the HTML element will just be ignored for the purposes of this count.
362     */
363    public final short numOpenAnchors;
364
365    /**
366     * The total number of <SPAN CLASS="PAGESTATS">&lt;/A&gt;, TC.ClosingTags, TagNode</SPAN>
367     * elements found on the page.
368     */
369    public final short numClosedAnchors;
370
371    /**
372     * The total number of <SPAN CLASS="PAGESTATS">&lt;P&gt; (Paragraph), TC.OpeningTags,
373     * TagNode</SPAN> elements found on the page.  Any element-internal attributes / inner-tags
374     * actually found inside the HTML element will just be ignored for the purposes of this count.
375     */
376    public final short numOpenParagraphs;
377
378    /**
379     * The total number of <SPAN CLASS="PAGESTATS">&lt;/P&gt;, TC.ClosingTags, TagNode</SPAN>
380     * elements found on the page.
381     */
382    public final short numClosedParagraphs;
383
384    /**
385     * The total number of <SPAN CLASS="PAGESTATS">&lt;DIV&gt; (Divider), TC.OpeningTags,
386     * TagNode</SPAN> elements found on the page.  Any element-internal attributes / inner-tags
387     * actually found inside the HTML element will just be ignored for the purposes of this count.
388     */
389    public final short numOpenDivs;
390
391    /**
392     * The total number of <SPAN CLASS="PAGESTATS">&lt;/DIV&gt;, TC.ClosingTags, TagNode</SPAN>
393     * elements found on the page.
394     */
395    public final short numClosedDivs;
396
397    /**
398     * The total number of <SPAN CLASS="PAGESTATS">&lt;SPAN&gt;, TC.OpeningTags, TagNode</SPAN>
399     * elements found on the page.  Any element-internal attributes / inner-tags actually found
400     * inside the HTML element will just be ignored for the purposes of this count.
401     */
402    public final short numOpenSpans;
403
404    /**
405     * The total number of <SPAN CLASS="PAGESTATS">&lt;/SPAN&gt;, TC.ClosingTags, TagNode</SPAN>
406     * elements found on the page.
407     */
408    public final short numClosedSpans;
409
410    /**
411     * The total number of <SPAN CLASS="PAGESTATS">&lt;SCRIPT&gt;, TC.OpeningTags, TagNode</SPAN>
412     * elements found on the page.  Any element-internal attributes / inner-tags actually found
413     * inside the HTML element will just be ignored for the purposes of this count.
414     */
415    public final short numOpenScripts;
416
417    /**
418     * The total number of <SPAN CLASS="PAGESTATS">&lt;/SCRIPT&gt;, TC.ClosingTags, TagNode</SPAN>
419     * elements found on the page.
420     */
421    public final short numClosedScripts;
422
423    /**
424     * The total number of <SPAN CLASS="PAGESTATS">&lt;STYLE&gt;, TC.OpeningTags, TagNode</SPAN>
425     * elements found on the page.  Any element-internal attributes / inner-tags actually found
426     * inside the HTML element will just be ignored for the purposes of this count.
427     */
428    public final short numOpenStyles;
429
430    /**
431     * The total number of <SPAN CLASS="PAGESTATS">&lt;/STYLE&gt;, TC.ClosingTags, TagNode</SPAN>
432     * elements found on the page.
433     */
434    public final short numClosedStyles;
435
436    /**
437     * The total number of <SPAN CLASS="PAGESTATS">&lt;FRAME&gt;, TC.OpeningTags, TagNode</SPAN>
438     * elements found on the page.  Any element-internal attributes / inner-tags actually found
439     * inside the HTML element will just be ignored for the purposes of this count.
440     */
441    public final short numOpenFrames;
442
443    /**
444     * The total number of <SPAN CLASS="PAGESTATS">&lt;/FRAME&gt;, TC.ClosingTags, TagNode</SPAN>
445     * elements found on the page.
446     */
447    public final short numClosedFrames;
448
449    /**
450     * The total number of <SPAN CLASS="PAGESTATS">&lt;IFRAME&gt;, TC.OpeningTags, TagNode</SPAN>
451     * elements found on the page.  Any element-internal attributes / inner-tags actually found
452     * inside the HTML element will just be ignored for the purposes of this count.
453     */
454    public final short numOpenIFrames;
455
456    /**
457     * The total number of <SPAN CLASS="PAGESTATS">&lt;/IFRAME&gt;, TC.ClosingTags, TagNode</SPAN>
458     * elements found on the page.
459     */
460    public final short numClosedIFrames;
461
462    /**
463     * The total number of <SPAN CLASS="PAGESTATS">&lt;FORM&gt;, TC.OpeningTags, TagNode</SPAN>
464     * elements found on the page.  Any element-internal attributes / inner-tags actually found
465     * inside the HTML element will just be ignored for the purposes of this count.
466     */
467    public final short numOpenForms;
468
469    /**
470     * The total number of <SPAN CLASS="PAGESTATS">&lt;/FORM&gt;, TC.ClosingTags, TagNode</SPAN>
471     * elements found on the page.
472     */
473    public final short numClosedForms;
474
475    /**
476     * Internally used by the 'clone' method.
477     * 
478     * @param otherPageStats This is the instance of PageStats to be copied
479     */
480    protected PageStats(PageStats otherPageStats)
481    {
482        this.hasAVclass             = otherPageStats.hasAVclass;
483        this.hasAVstyle             = otherPageStats.hasAVstyle;
484        this.hasAVid                = otherPageStats.hasAVid;
485        this.hasAVtitle             = otherPageStats.hasAVtitle;
486        this.hasAVhref              = otherPageStats.hasAVhref;
487        this.hasAVhreflang          = otherPageStats.hasAVhreflang;
488        this.hasAVsrc               = otherPageStats.hasAVsrc;
489        this.hasAVsrcset            = otherPageStats.hasAVsrcset;
490        this.hasAVsrclang           = otherPageStats.hasAVsrclang;
491        this.hasAVsrcdoc            = otherPageStats.hasAVsrcdoc;
492        this.hasAValt               = otherPageStats.hasAValt;
493        this.hasAVtarget            = otherPageStats.hasAVtarget;
494        this.hasAVwidth             = otherPageStats.hasAVwidth;
495        this.hasAVheight            = otherPageStats.hasAVheight;
496        this.hasAVsize              = otherPageStats.hasAVsize;
497        this.hasAVsizes             = otherPageStats.hasAVsizes;
498        this.hasAVcols              = otherPageStats.hasAVcols;
499        this.hasAVcolspan           = otherPageStats.hasAVcolspan;
500        this.hasAVrows              = otherPageStats.hasAVrows;
501        this.hasAVrowspan           = otherPageStats.hasAVrowspan;
502        this.hasAVwrap              = otherPageStats.hasAVwrap;
503        this.hasAVvalue             = otherPageStats.hasAVvalue;
504        this.hasAVtype              = otherPageStats.hasAVtype;
505        this.hasAVname              = otherPageStats.hasAVname;
506        this.hasAVmin               = otherPageStats.hasAVmin;
507        this.hasAVmax               = otherPageStats.hasAVmax;
508        this.hasAVminlength         = otherPageStats.hasAVminlength;
509        this.hasAVmaxlength         = otherPageStats.hasAVmaxlength;
510        this.hasAVaccept            = otherPageStats.hasAVaccept;
511
512        this.numImages              = otherPageStats.numImages;
513        this.numMeta                = otherPageStats.numMeta;
514        this.numLink                = otherPageStats.numLink;
515        this.numInput               = otherPageStats.numInput;
516        this.numEmbed               = otherPageStats.numEmbed ;
517        this.numHR                  = otherPageStats.numHR;
518        this.numBR                  = otherPageStats.numBR;
519
520        this.numOpenTables          = otherPageStats.numOpenTables;
521        this.numClosedTables        = otherPageStats.numClosedTables;
522        this.numOpenAnchors         = otherPageStats.numOpenAnchors;
523        this.numClosedAnchors       = otherPageStats.numClosedAnchors;
524        this.numOpenParagraphs      = otherPageStats.numOpenParagraphs;
525        this.numClosedParagraphs    = otherPageStats.numClosedParagraphs;
526        this.numOpenDivs            = otherPageStats.numOpenDivs;
527        this.numClosedDivs          = otherPageStats.numClosedDivs;
528        this.numOpenSpans           = otherPageStats.numOpenSpans;
529        this.numClosedSpans         = otherPageStats.numClosedSpans;
530        this.numOpenScripts         = otherPageStats.numOpenScripts;
531        this.numClosedScripts       = otherPageStats.numClosedScripts;
532        this.numOpenStyles          = otherPageStats.numOpenStyles;
533        this.numClosedStyles        = otherPageStats.numClosedStyles;
534        this.numOpenFrames          = otherPageStats.numOpenFrames;
535        this.numClosedFrames        = otherPageStats.numClosedFrames;
536        this.numOpenIFrames         = otherPageStats.numOpenIFrames;
537        this.numClosedIFrames       = otherPageStats.numClosedIFrames;
538        this.numOpenForms           = otherPageStats.numOpenForms;
539        this.numClosedForms         = otherPageStats.numClosedForms;
540
541        this.strLength              = otherPageStats.strLength;
542        this.hash                   = otherPageStats.hash;
543        this.numNodes               = otherPageStats.numNodes;
544        this.numTagNodes            = otherPageStats.numTagNodes;
545        this.numTextNodes           = otherPageStats.numTextNodes;
546        this.numCommentNodes        = otherPageStats.numCommentNodes;
547        this.numNewLines            = otherPageStats.numNewLines;
548    }
549
550    /**
551     * Constructs a new instance of PageStats.  Assigns values based on a vectorized-webpage to the
552     * individual fields in this class.
553     * 
554     * @param page Any HTML page or sub-page.
555     * 
556     * @see Util#strLength(Vector)
557     * @see Util#hashCode(Vector)
558     * @see Util.Count#tagNodes(Vector)
559     * @see Util.Count#textNodes(Vector)
560     * @see Util.Count#commentNodes(Vector)
561     * @see Util.Count#newLines(Vector)
562     * @see TagNodeFindInclusive
563     * @see TagNodeCount
564     * @see TagNode#AV(String)
565     */
566    public PageStats(Vector<HTMLNode> page)
567    {
568        this.strLength              = Util.strLength(page);
569        this.hash                   = Util.hashCode(page);
570        this.numNodes               = page.size();
571        this.numTagNodes            = Util.Count.tagNodes(page);
572        this.numTextNodes           = Util.Count.textNodes(page);
573        this.numCommentNodes        = Util.Count.commentNodes(page);
574        this.numNewLines            = Util.Count.newLines(page);
575
576        short numImages, numMeta, numLink, numInput, numEmbed, numHR, numBR;
577        numImages = numMeta = numLink = numInput = numEmbed = numHR = numBR = 0;
578
579        short hasAVclass, hasAVstyle, hasAVid, hasAVtitle, hasAVhref, hasAVhreflang, hasAVsrc;
580        short hasAVsrcset, hasAVsrclang, hasAVsrcdoc, hasAValt, hasAVtarget, hasAVwidth;
581        short hasAVheight, hasAVsize, hasAVsizes, hasAVcols, hasAVcolspan, hasAVrows, hasAVrowspan;
582        short hasAVwrap, hasAVvalue, hasAVtype, hasAVname, hasAVmin, hasAVmax, hasAVminlength;
583        short hasAVmaxlength, hasAVaccept;
584
585        hasAVclass = hasAVstyle = hasAVid = hasAVtitle = hasAVhref = hasAVhreflang = hasAVsrc = 
586        hasAVsrcset = hasAVsrclang = hasAVsrcdoc = hasAValt = hasAVtarget = hasAVwidth = 
587        hasAVheight = hasAVsize = hasAVsizes = hasAVcols = hasAVcolspan = hasAVrows =
588        hasAVrowspan = hasAVwrap = hasAVvalue = hasAVtype = hasAVname = hasAVmin = hasAVmax = 
589        hasAVminlength = hasAVmaxlength = hasAVaccept = 0;
590
591        short numOpenTables, numClosedTables, numOpenAnchors, numClosedAnchors, numOpenParagraphs;
592        short numClosedParagraphs, numOpenDivs, numClosedDivs, numOpenSpans, numClosedSpans;
593        short numOpenScripts, numClosedScripts, numOpenStyles, numClosedStyles, numOpenFrames;
594        short numClosedFrames, numOpenIFrames, numClosedIFrames, numOpenForms, numClosedForms;
595
596        numOpenTables = numClosedTables = numOpenAnchors = numClosedAnchors = numOpenParagraphs = 
597        numClosedParagraphs = numOpenDivs = numClosedDivs = numOpenSpans = numClosedSpans = 
598        numOpenScripts = numClosedScripts = numOpenStyles = numClosedStyles = numOpenFrames = 
599        numClosedFrames = numOpenIFrames = numClosedIFrames = numOpenForms = numClosedForms = 0;
600
601        for (HTMLNode n : page) if (n.isTagNode())
602        {
603            TagNode tn = (TagNode) n;
604            List<String> l = tn.allAN().collect(Collectors.toList());
605
606            if (l.size() > 0)
607            {
608                if (l.contains("class"))     { l.remove("class");       hasAVclass++;       }
609                if (l.contains("style"))     { l.remove("style");       hasAVstyle++;       }
610                if (l.contains("id"))        { l.remove("id");          hasAVid++;          }
611                if (l.contains("title"))     { l.remove("title");       hasAVtitle++;       }
612                if (l.contains("href"))      { l.remove("href");        hasAVhref++;        }
613                if (l.contains("hreflang"))  { l.remove("hreflang");    hasAVhreflang++;    }
614                if (l.contains("src"))       { l.remove("src");         hasAVsrc++;         }
615                if (l.contains("srcset"))    { l.remove("srcset");      hasAVsrcset++;      }
616                if (l.contains("srclang"))   { l.remove("srclang");     hasAVsrclang++;     }
617                if (l.contains("srcdoc"))    { l.remove("srcdoc");      hasAVsrcdoc++;      }
618                if (l.contains("alt"))       { l.remove("alt");         hasAValt++;         }
619                if (l.contains("target"))    { l.remove("target");      hasAVtarget++;      }
620                if (l.contains("width"))     { l.remove("width");       hasAVwidth++;       }
621                if (l.contains("height"))    { l.remove("height");      hasAVheight++;      }
622                if (l.contains("size"))      { l.remove("size");        hasAVsize++;        }
623                if (l.contains("sizes"))     { l.remove("sizes");       hasAVsizes++;       }
624                if (l.contains("cols"))      { l.remove("cols");        hasAVcols++;        }
625                if (l.contains("colspan"))   { l.remove("colspan");     hasAVcolspan++;     }
626                if (l.contains("rows"))      { l.remove("rows");        hasAVrows++;        }
627                if (l.contains("rowspan"))   { l.remove("rowspan");     hasAVrowspan++;     }
628                if (l.contains("wrap"))      { l.remove("wrap");        hasAVwrap++;        }
629                if (l.contains("value"))     { l.remove("value");       hasAVvalue++;       }
630                if (l.contains("type"))      { l.remove("type");        hasAVtype++;        }
631                if (l.contains("name"))      { l.remove("name");        hasAVname++;        }
632                if (l.contains("min"))       { l.remove("min");         hasAVmin++;         }
633                if (l.contains("max"))       { l.remove("max");         hasAVmax++;         }
634                if (l.contains("minlength")) { l.remove("minlength");   hasAVminlength++;   }
635                if (l.contains("maxlength")) { l.remove("maxlength");   hasAVmaxlength++;   }
636                if (l.contains("accept"))    { l.remove("accept");      hasAVaccept++;      }
637            }
638
639            if      (tn.tok.equals("img"))      numImages++;
640            else if (tn.tok.equals("meta"))     numMeta++;
641            else if (tn.tok.equals("link"))     numLink++;
642            else if (tn.tok.equals("input"))    numInput++;
643            else if (tn.tok.equals("embed"))    numEmbed++;
644            else if (tn.tok.equals("hr"))       numHR++;
645            else if (tn.tok.equals("br"))       numBR++;
646
647
648            if      (tn.tok.equals("table"))
649            { if (tn.isClosing)  numClosedTables++;     else numOpenTables++;       }
650
651            else if (tn.tok.equals("a"))
652            { if (tn.isClosing)  numClosedAnchors++;    else numOpenAnchors++;      }
653
654            else if (tn.tok.equals("p"))
655            { if (tn.isClosing)  numClosedParagraphs++; else numOpenParagraphs++;   }
656
657            else if (tn.tok.equals("div"))
658            { if (tn.isClosing)  numClosedDivs++;       else numOpenDivs++;         }
659
660            else if (tn.tok.equals("span"))
661            { if (tn.isClosing)  numClosedSpans++;      else numOpenSpans++;        }
662
663            else if (tn.tok.equals("script"))
664            { if (tn.isClosing)  numClosedScripts++;    else numOpenScripts++;      }
665
666            else if (tn.tok.equals("style"))
667            { if (tn.isClosing)  numClosedStyles++;     else numOpenStyles++;       }
668
669            else if (tn.tok.equals("frame"))
670            { if (tn.isClosing)  numClosedFrames++;     else numOpenFrames++;       }
671
672            else if (tn.tok.equals("iframe"))
673            { if (tn.isClosing)  numClosedIFrames++;    else numOpenIFrames++;      }
674
675            else if (tn.tok.equals("form"))
676            { if (tn.isClosing)  numClosedForms++;      else numOpenForms++;        }
677        }
678
679        this.numImages              = numImages;
680        this.numMeta                = numMeta;
681        this.numLink                = numLink;
682        this.numInput               = numInput;
683        this.numEmbed               = numEmbed;
684        this.numHR                  = numHR;
685        this.numBR                  = numBR;
686
687
688        this.hasAVclass             = hasAVclass;
689        this.hasAVstyle             = hasAVstyle;
690        this.hasAVid                = hasAVid;
691        this.hasAVtitle             = hasAVtitle;
692        this.hasAVhref              = hasAVhref;
693        this.hasAVhreflang          = hasAVhreflang;
694        this.hasAVsrc               = hasAVsrc;
695        this.hasAVsrcset            = hasAVsrcset;
696        this.hasAVsrclang           = hasAVsrclang;
697        this.hasAVsrcdoc            = hasAVsrcdoc;
698        this.hasAValt               = hasAValt;
699        this.hasAVtarget            = hasAVtarget;
700        this.hasAVwidth             = hasAVwidth;
701        this.hasAVheight            = hasAVheight;
702        this.hasAVsize              = hasAVsize;
703        this.hasAVsizes             = hasAVsizes;
704        this.hasAVcols              = hasAVcols;
705        this.hasAVcolspan           = hasAVcolspan;
706        this.hasAVrows              = hasAVrows;
707        this.hasAVrowspan           = hasAVrowspan;
708        this.hasAVwrap              = hasAVwrap;
709        this.hasAVvalue             = hasAVvalue;
710        this.hasAVtype              = hasAVtype;
711        this.hasAVname              = hasAVname;
712        this.hasAVmin               = hasAVmin;
713        this.hasAVmax               = hasAVmax;
714        this.hasAVminlength         = hasAVminlength;
715        this.hasAVmaxlength         = hasAVmaxlength;
716        this.hasAVaccept            = hasAVaccept;
717
718
719        this.numOpenTables          = numOpenTables;
720        this.numClosedTables        = numClosedTables;
721        this.numOpenAnchors         = numOpenAnchors;
722        this.numClosedAnchors       = numClosedAnchors;
723        this.numOpenParagraphs      = numOpenParagraphs;
724        this.numClosedParagraphs    = numClosedParagraphs;
725        this.numOpenDivs            = numOpenDivs;
726        this.numClosedDivs          = numClosedDivs;
727        this.numOpenSpans           = numOpenSpans;
728        this.numClosedSpans         = numClosedSpans;
729        this.numOpenScripts         = numOpenScripts;
730        this.numClosedScripts       = numClosedScripts;
731        this.numOpenStyles          = numOpenStyles;
732        this.numClosedStyles        = numClosedStyles;
733        this.numOpenFrames          = numOpenFrames;
734        this.numClosedFrames        = numClosedFrames;
735        this.numOpenIFrames         = numOpenIFrames;
736        this.numClosedIFrames       = numClosedIFrames;
737        this.numOpenForms           = numOpenForms;
738        this.numClosedForms         = numClosedForms;
739    }
740
741    /**
742     * Java's {@code public boolean equals(Object o)} requirements.
743     * 
744     * @param o This may be any Java Object, but only ones of {@code 'this'} type whose
745     * internal-values are identical will cause this method to return {@code TRUE}.
746     * 
747     * @return {@code TRUE} if {@code 'this'} instance of {@code PageStats} is identical to
748     * parameter {@code 'o'.}
749     */
750    public boolean equals(Object o)
751    {
752        if (! (o instanceof PageStats)) return false;
753
754        PageStats otherPageStats = (PageStats) o;
755
756        return
757            (this.strLength             == otherPageStats.strLength)            &&
758            (this.hash                  == otherPageStats.hash)                 &&
759            (this.numNodes              == otherPageStats.numNodes)             &&
760            (this.numTagNodes           == otherPageStats.numTagNodes)          &&
761            (this.numTextNodes          == otherPageStats.numTextNodes)         &&
762            (this.numCommentNodes       == otherPageStats.numCommentNodes)      &&
763            (this.numNewLines           == otherPageStats.numNewLines)          &&
764
765
766            (this.hasAVclass            == otherPageStats.hasAVclass)           &&
767            (this.hasAVstyle            == otherPageStats.hasAVstyle)           &&
768            (this.hasAVid               == otherPageStats.hasAVid)              &&
769            (this.hasAVtitle            == otherPageStats.hasAVtitle)           &&
770            (this.hasAVhref             == otherPageStats.hasAVhref)            &&
771            (this.hasAVhreflang         == otherPageStats.hasAVhreflang)        &&
772            (this.hasAVsrc              == otherPageStats.hasAVsrc)             &&
773            (this.hasAVsrcset           == otherPageStats.hasAVsrcset)          &&
774            (this.hasAVsrclang          == otherPageStats.hasAVsrclang)         &&
775            (this.hasAVsrcdoc           == otherPageStats.hasAVsrcdoc)          &&
776            (this.hasAValt              == otherPageStats.hasAValt)             &&
777            (this.hasAVtarget           == otherPageStats.hasAVtarget)          &&
778            (this.hasAVwidth            == otherPageStats.hasAVwidth)           &&
779            (this.hasAVheight           == otherPageStats.hasAVheight)          &&
780            (this.hasAVsize             == otherPageStats.hasAVsize)            &&
781            (this.hasAVsizes            == otherPageStats.hasAVsizes)           &&
782            (this.hasAVcols             == otherPageStats.hasAVcols)            &&
783            (this.hasAVcolspan          == otherPageStats.hasAVcolspan)         &&
784            (this.hasAVrows             == otherPageStats.hasAVrows)            &&
785            (this.hasAVrowspan          == otherPageStats.hasAVrowspan)         &&
786            (this.hasAVwrap             == otherPageStats.hasAVwrap)            &&
787            (this.hasAVvalue            == otherPageStats.hasAVvalue)           &&
788            (this.hasAVtype             == otherPageStats.hasAVtype)            &&
789            (this.hasAVname             == otherPageStats.hasAVname)            &&
790            (this.hasAVmin              == otherPageStats.hasAVmin)             &&
791            (this.hasAVmax              == otherPageStats.hasAVmax)             &&
792            (this.hasAVminlength        == otherPageStats.hasAVminlength)       &&
793            (this.hasAVmaxlength        == otherPageStats.hasAVmaxlength)       &&
794            (this.hasAVaccept           == otherPageStats.hasAVaccept)          &&
795
796
797            (this.numImages             == otherPageStats.numImages)            &&
798            (this.numMeta               == otherPageStats.numMeta)              &&
799            (this.numLink               == otherPageStats.numLink)              &&
800            (this.numInput              == otherPageStats.numInput)             &&
801            (this.numEmbed              == otherPageStats.numEmbed)             &&
802            (this.numHR                 == otherPageStats.numHR)                &&
803            (this.numBR                 == otherPageStats.numBR)                &&
804
805
806            (this.numOpenTables         == otherPageStats.numOpenTables)        &&
807            (this.numClosedTables       == otherPageStats.numClosedTables)      &&
808
809            (this.numOpenAnchors        == otherPageStats.numOpenAnchors)       &&
810            (this.numClosedAnchors      == otherPageStats.numClosedAnchors)     &&
811
812            (this.numOpenParagraphs     == otherPageStats.numOpenParagraphs)    &&
813            (this.numClosedParagraphs   == otherPageStats.numClosedParagraphs)  &&
814
815            (this.numOpenDivs           == otherPageStats.numOpenDivs)          &&
816            (this.numClosedDivs         == otherPageStats.numClosedDivs)        &&
817
818            (this.numOpenSpans          == otherPageStats.numOpenSpans)         &&
819            (this.numClosedSpans        == otherPageStats.numClosedSpans)       &&
820
821            (this.numOpenScripts        == otherPageStats.numOpenScripts)       &&
822            (this.numClosedScripts      == otherPageStats.numClosedScripts)     &&
823
824            (this.numOpenStyles         == otherPageStats.numOpenStyles)        &&
825            (this.numClosedStyles       == otherPageStats.numClosedStyles)      &&
826
827            (this.numOpenFrames         == otherPageStats.numOpenFrames)        &&
828            (this.numClosedFrames       == otherPageStats.numClosedFrames)      &&
829
830            (this.numOpenIFrames        == otherPageStats.numOpenIFrames)       &&
831            (this.numClosedIFrames      == otherPageStats.numClosedIFrames)     &&
832
833            (this.numOpenForms          == otherPageStats.numOpenForms)         &&
834            (this.numClosedForms        == otherPageStats.numClosedForms);
835    }
836
837    /**
838     * Java's hash-code requirement.  Notice that this method is not static, and provides the
839     * hashCode that was computed for the vectorized-webpage when this instance of
840     * {@code PageStats} was created.  Perhaps the subtlety is noticeable - <I>there is also a
841     * {@code public static} version of method {@code int hashCode(); }</I> Both of them will
842     * return the same number, but only one of them actually computes a hash-code (the
843     * static-method). This non-static merely retrieves the hash that was created when this
844     * instance was built by the constructor.
845     * 
846     * @return A hash-code that may be used when storing this node in a java sorted-collection.
847     */
848    public int hashCode() { return this.hash; }
849
850    /**
851     * Java's {@code interface Comparable<T>} requirements.  This does a very simple comparison
852     * using the underlying field {@code final String str} that all HTMLNode's contain.
853     * 
854     * @param other Any other {@code PageStats} to be compared to {@code 'this' PageStats}
855     * 
856     * @return An integer that fulfills Java's {@code interface Comparable<T> public boolean
857     * compareTo(T t)} method requirements.
858     */
859    public int compareTo(PageStats other)
860    {
861        int compare1 = this.numNodes - other.numNodes;
862        if (compare1 != 0) return compare1;
863        return this.strLength - other.strLength;
864    }
865
866
867    /**
868     * This converts a {@code PageStats} object to a simple-string (Base64 Encoded) object that may
869     * be passed and transmitted as a String.
870     * 
871     * @return Zipped, Serialized, Base-64 Encoded String version of this object.
872     * 
873     * @see StringParse#objToB64Str(Object)
874     */
875    public String toB64String()
876    { try { return StringParse.objToB64Str(this); } catch (Exception e) { return null; } }
877
878    /**
879     * Convets a Base65 Encoded {@code String} into an instance of {@code PageStats}
880     *
881     * @param minimized A previously minimized, compressed, Serialized version of this object
882     * ({@code PageStats}).
883     * 
884     * @return An instance of this class.
885     * 
886     * @see StringParse#b64StrToObj(String)
887     */
888    public static PageStats fromB64String(String minimized)
889    {
890        try
891            { return (PageStats) StringParse.b64StrToObj(minimized); }
892
893        catch (Exception e) { return null; }
894    }
895
896    /**
897     * Generates a carbon copy of passed reference instance {@code 'PageStats'}
898     * 
899     * @return Returns a 'clone' of this vector.  Utilizes {@code 'this' class, protected
900     * constructor}.
901     */
902    public PageStats clone() { return new PageStats(this); }
903
904
905    /**
906     * Generates a java string representation of {@code 'this' instance} of {@code class PageStats}
907     * 
908     * @return a java string of all the details encapsulated by a {@code PageStats} object
909     * reference.
910     */
911    public String toString()
912    {
913        return
914            "strLength            = " + strLength               + '\n' +
915            "hash                 = " + hash                    + '\n' +
916            "numNodes             = " + numNodes                + '\n' +
917            "numTagNodes          = " + numTagNodes             + '\n' +
918            "numTextNodes         = " + numTextNodes            + '\n' +
919            "numCommentNodes      = " + numCommentNodes         + '\n' +
920            "numNewLines          = " + numNewLines             + '\n' +
921
922            "\n" +
923
924            "hasAVclass           = " + hasAVclass              + "\n" + 
925            "hasAVstyle           = " + hasAVstyle              + "\n" + 
926            "hasAVid              = " + hasAVid                 + "\n" + 
927            "hasAVtitle           = " + hasAVtitle              + "\n" + 
928            "hasAVhref            = " + hasAVhref               + "\n" + 
929            "hasAVhreflang        = " + hasAVhreflang           + "\n" + 
930            "hasAVsrc             = " + hasAVsrc                + "\n" + 
931            "hasAVsrcset          = " + hasAVsrcset             + "\n" + 
932            "hasAVsrclang         = " + hasAVsrclang            + "\n" + 
933            "hasAVsrcdoc          = " + hasAVsrcdoc             + "\n" + 
934            "hasAValt             = " + hasAValt                + "\n" + 
935            "hasAVtarget          = " + hasAVtarget             + "\n" + 
936            "hasAVwidth           = " + hasAVwidth              + "\n" + 
937            "hasAVheight          = " + hasAVheight             + "\n" + 
938            "hasAVsize            = " + hasAVsize               + "\n" + 
939            "hasAVsizes           = " + hasAVsizes              + "\n" + 
940            "hasAVcols            = " + hasAVcols               + "\n" + 
941            "hasAVcolspan         = " + hasAVcolspan            + "\n" + 
942            "hasAVrows            = " + hasAVrows               + "\n" + 
943            "hasAVrowspan         = " + hasAVrowspan            + "\n" + 
944            "hasAVwrap            = " + hasAVwrap               + "\n" + 
945            "hasAVvalue           = " + hasAVvalue              + "\n" + 
946            "hasAVtype            = " + hasAVtype               + "\n" + 
947            "hasAVname            = " + hasAVname               + "\n" + 
948            "hasAVmin             = " + hasAVmin                + "\n" + 
949            "hasAVmax             = " + hasAVmax                + "\n" + 
950            "hasAVminlength       = " + hasAVminlength          + "\n" + 
951            "hasAVmaxlength       = " + hasAVmaxlength          + "\n" + 
952            "hasAVaccept          = " + hasAVaccept             + "\n" + 
953
954            "\n" +
955
956            "numImages            = " + numImages               + '\n' +
957            "numMeta              = " + numMeta                 + '\n' +
958            "numLink              = " + numLink                 + '\n' +
959            "numInput             = " + numInput                + '\n' +
960            "numEmbed             = " + numEmbed                + '\n' +
961            "numHR                = " + numHR                   + '\n' +
962            "numBR                = " + numBR                   + '\n' +
963
964            "\n" +
965
966            "numOpenTables        = " + numOpenTables           + '\n' +
967            "numClosedTables      = " + numClosedTables         + '\n' +
968
969            "numOpenAnchors       = " + numOpenAnchors          + '\n' +
970            "numClosedAnchors     = " + numClosedAnchors        + '\n' +
971
972            "numOpenParagraphs    = " + numOpenParagraphs       + '\n' +
973            "numClosedParagraphs  = " + numClosedParagraphs     + '\n' +
974
975            "numOpenDivs          = " + numOpenDivs             + '\n' +
976            "numClosedDivs        = " + numClosedDivs           + '\n' +
977
978            "numOpenSpans         = " + numOpenSpans            + '\n' +
979            "numClosedSpans       = " + numClosedSpans          + '\n' +
980
981            "numOpenScripts       = " + numOpenScripts          + '\n' +
982            "numClosedScripts     = " + numClosedScripts        + '\n' +
983
984            "numOpenStyles        = " + numOpenStyles           + '\n' +
985            "numClosedStyles      = " + numClosedStyles         + '\n' +
986
987            "numOpenFrames        = " + numOpenFrames           + '\n' +
988            "numClosedFrames      = " + numClosedFrames         + '\n' +
989
990            "numOpenIFrames       = " + numOpenIFrames          + '\n' +
991            "numClosedIFrames     = " + numClosedIFrames        + '\n' +
992
993            "numOpenForms         = " + numOpenForms            + '\n' +
994            "numClosedForms       = " + numClosedForms          + '\n';
995    }
996}