1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
package Torello.HTML.Tools.Images;

import Torello.Java.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import static Torello.Java.C.*;

import java.util.*;
import java.util.function.*;
import java.net.*;
import java.io.*;

/**
 * An <B>experimental class</B> that can be used (with, albeit, way too much effort) to download
 * those photo-montages that are on major news-network web-sites.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PBS>
 */
public class PhotoBombSite
{
    // Private constructor: every member of this class is static, so the constructor is hidden
    // to prevent instantiation from outside the class.
    private PhotoBombSite() { }

    // Many of the scraped sites use typographic ("curly") quotation marks and apostrophes.
    // These two parallel arrays map each such UNICODE character (matchChars[i]) to the plain
    // ASCII replacement (replaceStrs[i]).  They are passed together to StrReplace.r(...) when
    // a cleaned section is converted to a String.

    private static final char[]      matchChars  = { '“', '”', '’' };
    private static final String[]    replaceStrs = { "\"", "\"", "'" };

    // A TextNode holding a single '\n' character, inserted to make the generated HTML easier
    // to read.  NOTE: This is not the same as an HTML <BR /> Element
    private static final TextNode NEW_LINE = new TextNode("\n");

    // This is a newline-BR-newline sequence ("\n<BR />\n"), parsed once into a node-Vector so
    // that it can be inserted after each <IMG> with a single addAll(...) call.
    private static final Vector<HTMLNode> BR_NEWLINE = HTMLPage.getPageTokens("\n<BR />\n", false);

    // A TextNode holding a single space character
    private static final TextNode SPACE = new TextNode(" ");

    /**
     * This is the HTML header that is inserted at the top of the page generated by
     * {@link #PRIMARY}.  The field is public and non-final, so it may be replaced with a
     * different template.
     *
     * <BR /><BR />Two placeholder sub-strings are substituted (using {@code String.replace})
     * when the header is written:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code TITLE_STR} is replaced by the title-text of the first page visited.</LI>
     * <LI>{@code URL_STR} is replaced by the {@code URL} of the first page visited.</LI>
     * </UL>
     *
     * <BR />If the header is modified, note that the sub-string {@code URL_STR} should remain
     * if the original page {@code URL} is to be included in the HTML (and likewise
     * {@code TITLE_STR} for the title).  <I>If a placeholder is removed, the corresponding
     * replacement simply has no effect.</I>  (The code would not throw an exception.)
     */
    public static String HEADER = "" +
        "<HTML>\n<HEAD>\n<TITLE>TITLE_STR</TITLE>\n"                +
        "<META charset='utf-8'>\n"                                  +
        "<STYLE TYPE='text/css'>\n"                                 +
        "H1, H2, H3, h4     { color:            red;         \n"    +
        "                     margin: 1em 1em 1em 1em;      }\n"    +
        "BODY               { margin:           2em;        }\n"    +
        "P                  { margin: 1.5em 1em 1.5em 1em;   \n"    +
        "                     max-width:        75%;        }\n"    +
        "IMG                { margin: 1em;                   \n"    +
        "                     max-height:       90%;         \n"    +
        "                     max-width:        90%;        }\n"    +
        "DIV.PhotoSection   { margin: 7em 1em 1em 1em;       \n"    +
        "                     background:       lightgray;   \n"    +
        "                     border-radius:    2em;         \n"    +
        "                     padding:          1.5em;      }\n"    +
        "</STYLE>\n</HEAD>\n<BODY>\n"                               +
        "<H1>TITLE_STR</H1>\n"                                      +
        "<H2>Scraped From:</H2>\n"                                  +
        "<H3><A HREF='URL_STR' TARGET=_blank>\nURL_STR</A></H3>\n"  +
        "<BR /><BR /><BR />\n\n";


    /**
     * <B><I><SPAN STYLE="color: red;">This one works much better</SPAN></I></B>.  This is because
     * it accepts a "Getter" that asks the user to find the content on a page.  For all Photo Bomb
     * sites (and for likely 99% of websites in general) - the relevant HTML section is wrapped in
     * an HTML {@code <DIV>, <SECTION>, <ARTICLE>} or {@code <MAIN>} element open-close pair.
     * <I>If the versions {@code get01(...)} or {@code get02(...)} were dismal failures, then this
     * method is much more likely to produce better results.</I>
     *
     * <BR /><BR /><B>NOTE:</B> This does mean that for this method to work, the onus is on the
     * user to provide a "Getter" <B><I>by inspecting the HTML (the "View Source" Button in your
     * browser)</I></B> to retrieve the short HTML section that actually has the picture and the
     * notes.
     *
     * <BR /><BR /><B>EXAMPLE NOTE:</B> The example below is one of thousands of short stories
     * with little pictures attached that are served up by all the news networks and search
     * engines.  This one is a collection of photos about the wild west.  If one looks at the
     * HTML, the programmer would (hopefully) notice that each photo-{@code URL} has its photo
     * wrapped in an HTML element: {@code <SECTION ID="mvp-content-main">}.  Notice, in the
     * example, the {@code 'getter'} that is created to retrieve the photos.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSPRIME>
     *
     * <BR /><BR /><B>PROCESSING:</B> For each {@code URL} produced by the iterator, the page is
     * downloaded, the section is retrieved by {@code 'GETTER'}, and then stripped of attributes,
     * comments, {@code <SCRIPT>} and {@code <STYLE>} blocks, empty elements and wrapper tags.
     * The cleaned section is wrapped in a {@code <DIV CLASS='PhotoSection'>} and appended to
     * the output.  {@link #HEADER} is written once, using the title and {@code URL} of the
     * first page visited.
     *
     * @param iter An instance of {@code URLIterator} that iterates each page of the site.
     *
     * @param GETTER This method should retrieve the subsection of HTML <I>on each page</I>
     * that contains the photo and caption.  It ought to be a one line statement that identifies
     * how the photo is "wrapped" in HTML.  An "Inclusive" method on an HTML {@code '<DIV>',
     * '<SECTION>...</SECTION>,' '<MAIN>...</MAIN>'} or {@code '<ARTICLE>...</ARTICLE>'} is 
     * "99% likely" the right way to do this.
     *
     * @param CLEANER This ought to be a one line command that removes extraneous pieces of
     * text.  The integer it returns is reported in the log as the number of Text-Nodes removed.
     *
     * @param skipOnNotFoundException This can shunt the "Not Found Exceptions", and attempt
     * to skip to the next image.  Some sites have a missing photo returned here and there.
     *
     * @param log This is a log parameter, and may be used to send log information to the 
     * terminal.  This parameter may be null, and if it is, it shall be ignored.
     *
     * @return This returns the HTML as a {@code String}.
     * 
     * @throws HTMLNotFoundException If the provided {@code 'GETTER'} does not find an HTML
     * section or element - <I>and returns null instead</I> - then rather than throwing a
     * {@code NullPointerException}, this exception shall throw (unless
     * {@code 'skipOnNotFoundException'} is {@code TRUE}).  If this exception does throw,
     * make sure to check and re-check the provided getter to make certain that the appropriate
     * Node-Search classes and methods were used in order to properly retrieve <I>the section that 
     * actually has the photo and the accompanying text.</I>
     * 
     * @throws NodeNotFoundException If the {@code 'GETTER'} provided does successfully retrieve
     * a portion of the photo-page, but no HTML {@code <IMG SRC=...>} is found or identified, then
     * this exception will throw (unless {@code 'skipOnNotFoundException'} is {@code TRUE}).
     * Make sure that when writing the {@code 'GETTER'}, that the appropriate HTML Element
     * ({@code <DIV ...>, <MAIN>, <SECTION>, <ARTICLE>}, etc...) that is selected actually wraps
     * the photo on the page being downloaded.
     *
     * @throws IOException This method declares {@code IOException} because it writes to a
     * {@code java.lang.Appendable}, whose {@code append} method declares it.  Presumably
     * downloading a page may throw it as well.
     */
    public static String PRIMARY(
        URLIterator iter, SectionGet GETTER, TextCleaner CLEANER, 
        boolean skipOnNotFoundException, Appendable log
    )
        throws IOException
    {
        StringBuilder   sb      = new StringBuilder();
        boolean         first   = true;
        int             iterNum = 1;

        while (iter.hasNext())
        {
            URL url =  iter.next();

            // Visit the next URL produced by the URL Iterator.
            // FIX: 'log' is documented as nullable, so this call must be guarded like all others.
            if (log != null)
                log.append("Visiting: " + BYELLOW + url.toString() + RESET + '\n');

            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);

            // Make sure to insert the HTML header into the "index.html" main page.
            if (first)
            {
                // Do this only once.
                first = false;

                // Use the title of the page from the first URL returned by the iterator, and
                // use the URL from the first URL returned by the iterator.
                String titleStr = Util.textNodesString(Elements.getTitle(page));
                sb.append(
                    HEADER.replace("TITLE_STR", titleStr).replace("URL_STR", url.toString())
                );
            }

            // Retrieve the relevant part of the page
            Vector<HTMLNode> section = GETTER.apply(page);

            // The getter didn't get any HTML.
            if (section == null)
            {
                if (skipOnNotFoundException)
                {
                    // FIX: guarded against a null log.
                    if (log != null) log.append(
                        BRED + "SectionGet did not return any HTML.  As per request, " +
                        "Skipping...\n" + RESET
                    );
                    continue;
                }
                
                throw new HTMLNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did not retrieve any " +
                    "image nor any text from the photo-page being scraped.  Be sure to check " +
                    "that the specified HTML Elements (DIV, MAIN, SECTION, etc...) or whichever " +
                    "element was specified is actually present on the photo-collection web-site."
                );
            }

            // The HTML produced by the getter didn't have any photos.
            if (TagNodeCount.all(section, TC.OpeningTags, "img") == 0)
            {
                if (skipOnNotFoundException)
                {
                    // FIX: guarded against a null log.
                    if (log != null) log.append(
                        BRED + "HTML did not contain an <IMG>.  As per request, " +
                        "Skipping...\n" + RESET
                    );
                    continue;
                }

                throw new NodeNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did properly retrieve an " +
                    "HTML Section as expected.  Unfortunately, there were no <IMG ...> elements " +
                    "available in the section returned.  The purpose of this method is to " +
                    "spider and crawl photo-collection sites, and retrieve the image of a list " +
                    "of pages.  This page had no images; this is not allowed here."
                );
            }

            // Any HTML Element with these attributes will have those attributes removed:
            // class, id, style, title, itemtype, itemprop, alt
            int c = Attributes.remove
                (section, "class", "id", "style", "title", "itemtype", "itemprop", "alt").length;
            if (log != null) log.append(
                BCYAN + "\tAttributes.remove(section, \"class\", \"id\", \"style\", \"title\", " +
                "\"itemtype\", \"itemprop\", \"alt\")\n" + RESET +
                "\t\tRemoved Attributes from [" + c + "] nodes.\n"
            );

            // Any HTML Element with a "data-..." attribute will have that attribute(s) removed
            c = Attributes.removeData(section).length;
            if (log != null) log.append(
                BCYAN + "\tAttributes.removeData(section)\n" + RESET +
                "\t\tRemoved Data-Attributes from [" + c + "] nodes.\n"
            );

            // Any <!-- --> found in the Photo/Text section retrieved by the getter are
            // removed from the section.  Comments only add clutter - since they are almost
            // always auto-generated.
            c = Util.Remove.allCommentNodes(section);
            if (log != null) log.append(
                BCYAN + "\tUtil.Remove.allCommentNodes(section)\n" + RESET +
                "\t\tRemoved [" + c + "] CommentNodes.\n"
            );

            // If there are any <SCRIPT> ... </SCRIPT> blocks contained in this Photo/Text section
            // they shall be removed.  They are almost invariably links to other advertisements.
            // NOTE: There are photo-sites that have contained the <IMG> and text-description inside
            //       Java-Script blocks, but they are very, VERY rare in 99% of "Photo Bomb Sites."
            //       If attempting to scrape a photo-story site where the description or photo are
            //       wrapped in Java-Script or JSON, then this class WILL NOT WORK on that site.
            c = Util.Remove.scriptNodeBlocks(section);
            if (log != null) log.append(
                BCYAN + "\tUtil.Remove.scriptNodeBlocks(section)\n" + RESET +
                "\t\tRemoved [" + c + "] <SCRIPT> ... </SCRIPT> Blocks.\n"
            );

            // This class provides an extremely simple CSS Style for the photo and the description
            // and is the primary reason for using this class.  If there are any CSS
            // <STYLE> ... </STYLE> blocks, they are removed here, immediately.
            c = Util.Remove.styleNodeBlocks(section);
            if (log != null) log.append(
                BCYAN + "\tUtil.Remove.styleNodeBlocks(section)\n" + RESET +
                "\t\tRemoved [" + c + "] <STYLE> ... </STYLE> Blocks.\n"
            );

            // Removes <DIV>...</DIV> where "..." may only be white-space.
            // (Empty <DIV>, <SPAN>, <P>, <I>...).
            // NOTE: The concept of "Inclusive Empty" means that the only content between the
            //       opening <DIV> and closing </DIV> is either white-space or NOTHING.  This
            //       process of removing empty <DIV>...</DIV> pairs (and <SPAN>...</SPAN> pairs,
            //       along with the complete list of HTML Elements provided in the list) is 
            //       applied RECURSIVELY.  This means that if the removing of an empty <I>...</I>
            //       pair creates another empty Element Pair, that pair is removed next.
            c = Util.Remove.inclusiveEmpty
                (section, "div", "picture", "span", "p", "b", "i", "em");
            if (log != null) log.append(
                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"picture\", " +
                "\"span\", \"p\", \"b\", \"i\", \"em\")\n" + RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // Now removes all instances of <DIV>, </DIV>, <A>, </A>, <CENTER>, </CENTER>,
            // <SECTION>, </SECTION>, <PICTURE>, </PICTURE>, and <SOURCE>.
            // Removing these is usually great.  The only HTML Elements that are really needed are
            // the Paragraph <P> Elements, and the <IMG SRC=...> Elements themselves.  Everything
            // else is always extraneous "HTML Bloat" and "Clutter."
            
            // NOTE: This process is not infallible, but it has worked on dozens and dozens of the
            //       "Extraneous Photo Collections" that repeatedly pop-up on major news sites at
            //       random times in their news feeds.

            c = TagNodeRemove.all
                (section, TC.Both, "div", "a", "center", "section", "picture", "source");
            if (log != null) log.append(
                BCYAN + "\tTagNodeRemove.all(section, TC.Both, \"div\", \"a\", \"center\", " +
                "\"section\", \"picture\", \"source\")\n" + RESET +
                "\t\tRemoved [" + c + "] HTML <DIV>, </DIV>, <A>, </A> Elements.\n"
            );

            // Applies the user-provided text-node cleaner
            // This may remove all kinds of miscellaneous text-nodes.  Sometimes a little button
            // that says "Next" or "Next Photo" remains on the page.  The best way to create a 
            // TextCleaner instance is to run this class, and see if there is a common piece of
            // text that has been repeatedly inserted into the descriptions... and remove it!
            c = CLEANER.applyAsInt(section);
            if (log != null) log.append(
                BCYAN + "\tCLEANER.applyAsInt(section)\n" + RESET +
                "\t\tRemoved [" + c + "] Text-Node's.\n"
            );

            // Compacts Adjoining textNodes.  Often, after removing all of the HTML TagNode 
            // elements from the Vector - there are consecutive TextNode's left next to each other
            // in the Vector.  This Util method will just remove any two adjacent TextNode's, and
            // copy the Strings out of both them, and then unite them into a single TextNode.
            // Nothing more, nothing less.
            c = Util.compactTextNodes(section);
            if (log != null) log.append(
                BCYAN + "\tUtil.compactTextNodes(section)\n" + RESET +
                "\t\tCompacted [" + c + "] Text-Node's.\n"
            );

            // Trims the text inside of TextNode's, removes them if they were only white-space
            // Often after stripping out many many nodes (in the previous steps), there are huge
            // patches of white-space.  This Util method simply calls the Java String method
            // String.trim() on each TextNode, and then removes that TextNode, and replaces it
            // with a trimmed version of the text.
            // NOTE: This will have no effect on text that is surrounded by HTML Paragraph (<P>
            //       ... </P>) elements.  Only TextNode's themselves are trimmed.  There is no
            //       need to worry about text "running together" as long as it is separated by
            //       <P> elements - which it always is in just about any photo-content website.
            c = Util.trimTextNodes(section, true);
            if (log != null) log.append(
                BCYAN + "\tUtil.trimTextNodes(section)\n" + RESET +
                "\t\tTrimmed [" + c + "] Text-Node's.\n"
            );

            // Performs another round of empty element checks.
            c = Util.Remove.inclusiveEmpty(section, "div", "span", "p", "b", "i", "em");
            if (log != null) log.append(
                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"span\", \"p\", \"b\", " +
                "\"i\", \"em\")\n" + RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // NOTE: All of the insertion loops below walk the position-array from its last
            //       element to its first, so that an insertion never shifts a position that
            //       has yet to be processed.  This presumes that the positions returned by
            //       TagNodeFind.all(...) are in ascending order.

            // inserts a new-line character before each <IMG>, <P>, and </P> element.
            // Makes the final HTML generated more readable.
            int[] posArr = TagNodeFind.all(section, TC.Both, "img", "p");
            for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], NEW_LINE);

            // inserts a \n<BR />\n (three nodes, the <BR />, and two new-lines '\n') after
            // each <IMG>.
            // This makes both the HTML more readable, and the page itself more readable
            posArr = TagNodeFind.all(section, TC.OpeningTags, "img");
            for (int i=(posArr.length-1); i >= 0; i--) section.addAll(posArr[i] + 1, BR_NEWLINE);

            // inserts a ' ' (space character) before and after each <B>, <I> and <EM> tag
            // (both the opening and the closing versions).  The Text-Nodes were trimmed above,
            // and without these spaces the styled words would run into their neighbors.
            //
            // FIX: Both spaces for a tag are inserted in the same reverse pass.  Previously two
            //      separate passes were used; the first pass shifted every later index, and
            //      therefore the second pass placed its spaces at stale positions.
            posArr = TagNodeFind.all(section, TC.Both, "b", "i", "em");
            for (int i=(posArr.length-1); i >= 0; i--)
            {
                section.add(posArr[i] + 1, SPACE);  // after the tag
                section.add(posArr[i], SPACE);      // before the tag
            }

            // Resolve any partial URL's
            Links.resolveAllSRC(section, url, null, false);
    
            // NOTE: There is an annoying "special apostrophe" on a lot of them.
            sb.append(  "<DIV CLASS='PhotoSection'>\n" +
                        StrReplace.r(Util.pageToString(section), matchChars, replaceStrs) +
                        "\n</DIV>\n" +
                        "\n\n\n<!-- Photo Section Break Page " + 
                        StringParse.zeroPad(iterNum++) + "-->\n\n\n"
            );
        }

        return sb.toString() + "\n\n</BODY>\n</HTML>\n";
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
     *
     * This was the first version of photo-scraping.  There were more later - this is why
     * {@code '01'} is appended to this method.
     *
     * <BR /><BR />Each {@code URL} returned by the iterator is downloaded, stripped of
     * meta / link tags, script and style blocks, comments, header sections, empty elements
     * and wrapper tags, and then converted to a {@code String}.
     *
     * @param iter This iterator shall return all of the pages in the site.  Usually, it is just a
     * base {@code URL} followed by an integer - as in "page 1" " page 2" ... etc...
     *
     * @param emptyDIVs These are HTML divider elements whose "class" equals the strings in this
     * list.  HTML divider elements that contain these {@code String's} inside their
     * {@code 'class'} attribute shall be removed (inclusively).  This is a string-array, and it
     * may be null (or empty) - and if it is, it will be ignored - but it may not contain
     * null-values, or an exception will throw.
     *
     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
     *
     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
     *
     * @param log Textual information shall be sent to the user/terminal using this log.  
     * <I><SPAN STYLE="color: red;">This parameter may <B>not</B> be null here.</SPAN></I>
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return A {@code Vector<String>} having one element for each {@code URL} visited.  The
     * HTML will be in {@code String} format, not {@code HTMLNode} format.
     *
     * @throws IOException This method writes to a {@code java.lang.Appendable}, whose
     * {@code append} method declares this exception.
     *
     * @see TagNodeRemove
     * @see Util
     * @see TagNodeRemoveInclusive
     * @see TextNodeRemove
     */
    @Deprecated
    public static Vector<String> get01(
            Iterator<URL> iter, String[] emptyDIVs, String[] textNodes,
            boolean callTrimTextNodes, Appendable log
        )
        throws IOException
    {
        final Vector<String> pages = new Vector<>();

        // An empty or null filter-list means the corresponding filtering step is skipped.
        final boolean filterDivs = (emptyDIVs != null) && (emptyDIVs.length > 0);
        final boolean filterText = (textNodes != null) && (textNodes.length > 0);

        while (iter.hasNext())
        {
            final URL url = iter.next();
            log.append("Visiting URL: " + url.toString() + '\n');

            final Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);

            // Strip the page-level clutter first: meta, link, script, style, comments, headers.
            int n = TagNodeRemove.all(page, TC.Both, "meta");
            log.append("Removed " + n + " meta tags.\n");

            n = TagNodeRemove.all(page, TC.Both, "link");
            log.append("Removed " + n + " link tags.\n");

            n = Util.Remove.scriptNodeBlocks(page);
            log.append("Removed " + n + " Script Node Blocks.\n");

            n = Util.Remove.styleNodeBlocks(page);
            log.append("Removed " + n + " Script Style Blocks.\n");

            n = Util.Remove.allCommentNodes(page);
            log.append("Removed " + n + " Comment Nodes.\n");

            n = TagNodeRemoveInclusive.all(page, "head", "noscript", "header");
            log.append("Removed " + n + " <HEAD>, <HEADER>, <NOSCRIPT> nodes.\n");

            // Drop every <DIV> (and its contents) whose "class" matches one of 'emptyDIVs'
            if (filterDivs)
            {
                n = InnerTagRemoveInclusive.all
                    (page, "div", "class", TextComparitor.C, emptyDIVs);
                log.append("Removed " + n + " HTML <DIV> Elements.\n");
            }

            // Empty <P> and <DIV> pairs are removed, recursively
            n = Util.Remove.inclusiveEmpty(page, "p", "div");
            log.append("Removed [" + n + "] Empty <DIV> and <P> elements.\n");

            // Only the opening and closing tags themselves are removed here; whatever was
            // between them stays on the page.
            n = TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span");
            log.append("Removed " + n + " HTML Elements: div, a, html, body, li, ul, span.\n");

            // Drop the TextNodes that contain any of the user-provided strings
            if (filterText)
            {
                n = TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes);
                log.append("Removed " + n + " TextNodes.\n");
            }

            // After so many removals, adjacent TextNodes are merged into single TextNodes
            n = Util.compactTextNodes(page);
            log.append("Removed " + n + " Nodes by compacting TextNodes.\n");

            // Gets rid of long runs of spaces.  UNFORTUNATELY, new-lines disappear as well.
            if (callTrimTextNodes)
            {
                n = Util.trimTextNodes(page, true);
                log.append("Removed " + n + " Trimming Text Nodes.\n");
            }

            // Strip the "class", "id" and "alt" attributes
            n = Attributes.remove(page, "class", "id", "alt").length;
            log.append("Removed Attributes From " + n + " Nodes.\n");

            // Put a new-line ('\n' - not a <BR />!) after each of the closing tags listed.
            // Walking the positions back-to-front keeps the earlier positions valid.
            final int[] closers =
                TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");

            for (int k = closers.length; k-- > 0; ) page.add(closers[k] + 1, NEW_LINE);

            // One String per page goes into the returned Vector
            pages.add(Util.pageToString(page));
        }

        // Each element of the returned Vector holds the picture and paragraph text of one
        // page.  The images have not been downloaded, and partial URL's have not been resolved.
        return pages;
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
     *
     * This performs the same clean-up steps as the loop-body of {@code get01(...)}, but it is
     * applied to a single, already downloaded, page rather than to every page of an iterator.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">CLONE NOTICE:</SPAN></B> This method modifies the
     * underlying {@code Vector}.  If you wish to avoid that, please call this method using
     * the following parameter: {@code (Vector<HTMLNode>) yourOriginalPage.clone()}.  Make sure to
     * use the {@code SuppressWarnings("unchecked")} annotation.
     * 
     * @param page Any HTML page that has extraneous advertising and java-script junk.
     * 
     * @param emptyDIVs These are HTML divider elements whose "class" equals the strings in this
     * list.  HTML divider elements that contain these strings inside their 'class' field shall be
     * removed (inclusively).  This is a string-array, and it may be null (or empty) - and if it
     * is, it will be ignored - but it may not contain null-values, or an exception will throw.
     * 
     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
     * 
     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
     * 
     * @param log This is a log, and <I><B>it may be null.</B></I>  If it is null, it will be
     * ignored. 
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return a Stripped down version of the page, with most extraneous photo-bomb site junk
     * removed.
     *
     * @throws IOException This method throws {@code IOException} simply because it prints to the
     * {@code interface java.lang.Appendable}, which requires that {@code IOException} be
     * monitored / checked in code that uses this interface.
     */
    @Deprecated
    public static String get02( Vector<HTMLNode> page, String[] emptyDIVs, String[] textNodes,
                                boolean callTrimTextNodes, Appendable log)  throws IOException
    {
        // Page-level clutter: meta, link, script, style, comments and header sections.
        get02Log(log, "Removed " + TagNodeRemove.all(page, TC.Both, "meta") + " meta tags.\n");
        get02Log(log, "Removed " + TagNodeRemove.all(page, TC.Both, "link") + " link tags.\n");
        get02Log(log, "Removed " + Util.Remove.scriptNodeBlocks(page) + " Script Node Blocks.\n");
        get02Log(log, "Removed " + Util.Remove.styleNodeBlocks(page) + " Script Style Blocks.\n");
        get02Log(log, "Removed " + Util.Remove.allCommentNodes(page) + " Comment Nodes.\n");

        get02Log(
            log,
            "Removed " + TagNodeRemoveInclusive.all(page, "head", "noscript", "header") +
            " <HEAD> nodes.\n"
        );

        // Every <DIV> (and its contents) whose "class" is in 'emptyDIVs' is dropped
        final boolean filterDivs = (emptyDIVs != null) && (emptyDIVs.length > 0);

        if (filterDivs) get02Log(
            log,
            "Removed " +
            InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs) +
            " HTML <DIV> Elements.\n"
        );

        // Empty <P> and <DIV> pairs are removed, recursively
        get02Log(
            log,
            "Removed [" + Util.Remove.inclusiveEmpty(page, "p", "div") +
            "] Empty <DIV> and <P> elements.\n"
        );

        // Only the tags themselves are removed here; the content between them is kept.
        get02Log(
            log,
            "Removed " +
            TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span") +
            " HTML Elements: div, a, html, body, li, ul, span.\n"
        );

        // TextNodes containing any of the user-provided strings are dropped
        final boolean filterText = (textNodes != null) && (textNodes.length > 0);

        if (filterText) get02Log(
            log,
            "Removed " + TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes) +
            " TextNodes.\n"
        );

        // After so many removals, adjacent TextNodes are merged into single TextNodes
        get02Log(
            log,
            "Removed " + Util.compactTextNodes(page) + " Nodes by compacting TextNodes.\n"
        );

        // Gets rid of long runs of spaces.  UNFORTUNATELY, new-lines disappear as well.
        if (callTrimTextNodes) get02Log(
            log,
            "Removed " + Util.trimTextNodes(page, true) + " Trimming Text Nodes.\n"
        );

        // Strip the "class", "id" and "alt" attributes
        get02Log(
            log,
            "Removed Attributes From " + Attributes.remove(page, "class", "id", "alt").length +
            " Nodes.\n"
        );

        // Put a new-line ('\n' - not a <BR />!) after each of the closing tags listed.
        // Walking the positions back-to-front keeps the earlier positions valid.
        final int[] closers =
            TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");

        for (int k = closers.length; k-- > 0; ) page.add(closers[k] + 1, NEW_LINE);

        return Util.pageToString(page);
    }

    // Writes 'message' to 'log', unless 'log' is null - in which case nothing is done.
    private static void get02Log(Appendable log, String message) throws IOException
    { if (log != null) log.append(message); }
}