package Torello.HTML.Tools.Images;

import Torello.Java.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.Shell.C;

import java.util.*;
import java.util.function.*;
import java.net.*;
import java.io.*;

/**
 * An <B>experimental class</B> that can be used (albeit with way too much effort) to download
 * those photo-montages that are on major news-network web-sites.
 * 
 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="PBS">
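 * 
 * <BR /><BR />Below is a minimal, hypothetical sketch of how this class might be used.  The
 * site {@code URL}, the page-count, and the choice of an {@code <ARTICLE>} wrapper-element are
 * assumptions made purely for illustration - a real site requires inspecting its HTML first
 * (see the {@code PRIMARY(...)} method for details about the getter and cleaner).
 * 
 * <DIV CLASS="EXAMPLE">{@code
 * // HYPOTHETICAL SKETCH: scrape a 39-page photo-story and save the generated HTML to disk.
 * // NOTE: the enclosing method must declare "throws IOException".
 * PhotoBombSite.URLIterator iter = PhotoBombSite.URLIterator.usual
 *     ("https://news.example.com/photo-story/?page=", 1, 39);
 * 
 * // Getter: return everything between the first <ARTICLE> and the last </ARTICLE> on the page.
 * // (A one-line Node-Search "Inclusive" retrieval is the usual way to write this - see PRIMARY)
 * PhotoBombSite.SectionGet getter = (Vector<HTMLNode> page) ->
 * {
 *     int[] open  = TagNodeFind.all(page, TC.OpeningTags, "article");
 *     int[] close = TagNodeFind.all(page, TC.ClosingTags, "article");
 * 
 *     if ((open.length == 0) || (close.length == 0)) return null;
 * 
 *     return new Vector<HTMLNode>(page.subList(open[0], close[close.length - 1] + 1));
 * };
 * 
 * // Cleaner: a no-op cleaner that removes nothing (see PRIMARY for a more realistic one)
 * PhotoBombSite.TextCleaner cleaner = (Vector<HTMLNode> section) -> 0;
 * 
 * String html = PhotoBombSite.PRIMARY(iter, getter, cleaner, true, System.out);
 * 
 * try (java.io.FileWriter fw = new java.io.FileWriter("index.html"))
 *     { fw.write(html); }
 * }</DIV>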
 */
public class PhotoBombSite
{
    private PhotoBombSite() { }

    // Quite a number of the sites visited start using really annoying apostrophe
    // and quote characters.  This simply replaces these UNICODE characters with regular
    // quotation and apostrophe marks.

    private static final char[]      matchChars  = { '“', '”', '’' };
    private static final String[]    replaceStrs = { "\"", "\"", "'" };

    /**
     * A functional-interface for returning the portion of the page that contains the list of
     * images on the site - usually a one-line call into the Node-Search Package classes.
     * 
     * <BR /><BR />Should retrieve a sub-portion or sub-section of an HTML Page.
     */
    @FunctionalInterface
    public static interface SectionGet extends Function<Vector<HTMLNode>, Vector<HTMLNode>> { }

    /**
     * If there is 'HTML Clutter' surrounding the image-html, this functional-interface can be used
     * to eliminate such clutter.
     * 
     * <BR /><BR />
     * Should remove any extraneous text from an HTML Page.  The returned integer should state
     * exactly how many nodes were removed.
     */
    @FunctionalInterface
    public static interface TextCleaner extends ToIntFunction<Vector<HTMLNode>> { }

    // NOTE: This is not the same as an HTML <BR /> Element
    private static final TextNode NEW_LINE = new TextNode("\n");

    // This is a newline-BR-newline sequence
    private static final Vector<HTMLNode> BR_NEWLINE = HTMLPage.getPageTokens("\n<BR />\n", false);

    // A space character
    private static final TextNode SPACE = new TextNode(" ");

    /**
     * This is the HTML header that is inserted into the page.  It may be modified, but if it
     * is, note that the sub-string {@code URL_STR} must remain if the original page
     * {@code URL} is to be included in the HTML.  The internal logic replaces this substring
     * with the actual {@code URL}; <I>if the text {@code URL_STR} were removed, the replacement
     * would simply not happen</I> (though no exception would throw, either).  The same applies
     * to the {@code TITLE_STR} place-holder, which is replaced with the page's {@code <TITLE>}.
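     *
     * <BR /><BR />A small, hypothetical customization - nothing more than plain {@code String}
     * manipulation, performed before the scrape begins:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Hypothetical: swap the photo-section background color.  The TITLE_STR and URL_STR
     * // place-holders are left untouched, as required.
     * PhotoBombSite.HEADER = PhotoBombSite.HEADER.replace("lightgray", "aliceblue");
     * }</DIV>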
     */
    public static String HEADER = "" +
        "<HTML>\n<HEAD>\n<TITLE>TITLE_STR</TITLE>\n"                +
        "<META charset='utf-8'>\n"                                  +
        "<STYLE TYPE='text/css'>\n"                                 +
        "H1, H2, H3, H4     { color:            red;         \n"    +
        "                     margin: 1em 1em 1em 1em;      }\n"    +
        "BODY               { margin:           2em;        }\n"    +
        "P                  { margin: 1.5em 1em 1.5em 1em;   \n"    +
        "                     max-width:        75%;        }\n"    +
        "IMG                { margin: 1em;                   \n"    +
        "                     max-height:       90%;         \n"    +
        "                     max-width:        90%;        }\n"    +
        "DIV.PhotoSection   { margin: 7em 1em 1em 1em;       \n"    +
        "                     background:       lightgray;   \n"    +
        "                     border-radius:    2em;         \n"    +
        "                     padding:          1.5em;      }\n"    +
        "</STYLE>\n</HEAD>\n<BODY>\n"                               +
        "<H1>TITLE_STR</H1>\n"                                      +
        "<H2>Scraped From:</H2>\n"                                  +
        "<H3><A HREF='URL_STR' TARGET=_blank>\nURL_STR</A></H3>\n"  +
        "<BR /><BR /><BR />\n\n";


    /**
     * <B><I><SPAN STYLE="color: red;">This one works much better</SPAN></I></B>.  This is because
     * it accepts a "Getter" that asks the user to find the relevant content on a page.  For all
     * Photo-Bomb sites (and likely 99% of websites in general) - the relevant HTML section is
     * wrapped in an HTML {@code <DIV>, <SECTION>, <ARTICLE>} or {@code <MAIN>} element open-close
     * pair.  <I>If versions {@code get01(...)} or {@code get02(...)} were dismal failures, then
     * this method is much more likely to produce better results.</I>
     *
     * <BR /><BR /><B>NOTE:</B> This does mean that for this method to work, the onus is on the
     * user to provide a "Getter" <B><I>by inspecting the HTML (the "View Source" Button in your
     * browser)</I></B> to retrieve the short HTML section that actually has the picture and the
     * notes.
     *
     * <BR /><BR /><B>EXAMPLE NOTE:</B> The example below is one of thousands of short stories
     * with little pictures attached that are served up by all the news networks and search
     * engines.  This one is a collection of photos about the wild west.  If one inspects the
     * HTML, the programmer would (hopefully) notice that each photo-{@code URL} has its photo
     * wrapped inside the HTML element {@code <SECTION ID="mvp-content-main">}.  Notice, in the
     * example, the {@code 'getter'} that is created to retrieve the photos.
     *
     * <EMBED CLASS="external-html" DATA-FILE-ID="PBSPRIME">
     *
     * @param iter An instance of {@code URLIterator} that iterates over each page of the site.
     *
     * @param GETTER This getter should retrieve the subsection of HTML <I>on each page</I>
     * that contains the photo and caption.  It ought to be a one-line statement that identifies
     * how the photo is "wrapped" in HTML.  An "Inclusive" retrieval on an HTML {@code '<DIV>',
     * '<SECTION>...</SECTION>', '<MAIN>...</MAIN>'} or {@code '<ARTICLE>...</ARTICLE>'} element is 
     * "99% likely" the right way to do this.
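     *
     * <BR /><BR />A minimal sketch of such a getter, matching the {@code <SECTION ID="mvp-content-main">}
     * wrapper from the <B>EXAMPLE NOTE</B> above.  <I>This presumes that the Node-Search class
     * {@code InnerTagGetInclusive} and the comparitor {@code TextComparitor.EQ} fit the site
     * being scraped - pick whichever Node-Search variant actually matches the wrapper-element
     * found when inspecting the page.</I>
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // HYPOTHETICAL SKETCH - assumes InnerTagGetInclusive / TextComparitor.EQ are the right
     * // Node-Search tools for this particular site.  Returns the <SECTION ID="mvp-content-main">
     * // block, inclusively, or null if it is not present on the page.
     * PhotoBombSite.SectionGet getter = (Vector<HTMLNode> page) -> InnerTagGetInclusive.first
     *     (page, "section", "id", TextComparitor.EQ, "mvp-content-main");
     * }</DIV>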
     *
     * @param CLEANER This ought to be a one-line command that removes extraneous pieces of
     * text, and that returns how many nodes were removed.
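     *
     * <BR /><BR />A minimal sketch of a cleaner.  The clutter-phrases below are hypothetical -
     * run the scrape once, look for text that repeats on every page, and remove it:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Hypothetical sketch: removes any TextNode whose text contains one of the (assumed)
     * // clutter-phrases.  TextNodeRemove.all returns how many nodes were removed, which is
     * // exactly what a TextCleaner is expected to report.
     * PhotoBombSite.TextCleaner cleaner = (Vector<HTMLNode> section) ->
     *     TextNodeRemove.all(section, TextComparitor.CN_CI, "Next Photo", "Advertisement");
     * }</DIV>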
     *
     * @param skipOnNotFoundException When <B>TRUE</B>, the "Not Found" exceptions are shunted,
     * and the method simply skips ahead to the next page.  Some sites are missing a photo here
     * and there.
     *
     * @param log This is a log parameter, and may be used to send log information to the 
     * terminal.  This parameter may be null, and if it is, it shall be ignored.
     *
     * @return This returns the HTML as a {@code String}.
     * 
     * @throws HTMLNotFoundException If the provided {@code 'GETTER'} does not find an HTML
     * section or element - <I>and returns null instead</I> - then rather than throwing a
     * {@code NullPointerException}, this exception shall throw.  If this exception does throw,
     * make sure to check and re-check the provided getter to make certain that the appropriate
     * Node-Search classes and methods were used in order to properly retrieve <I>the section that 
     * actually has the photo and the accompanying text.</I>
     * 
     * @throws NodeNotFoundException If the {@code 'GETTER'} provided does successfully retrieve
     * a portion of the photo-page, but no HTML {@code <IMG SRC=...>} element is found inside it,
     * then this exception will throw.  Make sure that, when writing the {@code 'GETTER'}, the
     * HTML Element selected ({@code <DIV ...>, <MAIN>, <SECTION>, <ARTICLE>}, etc...)
     * actually wraps the photo on the page being downloaded.
     *
     * @throws IOException If reading one of the photo-pages, or writing to the {@code 'log'},
     * fails.
     */
    public static String PRIMARY(
        URLIterator iter, SectionGet GETTER, TextCleaner CLEANER, 
        boolean skipOnNotFoundException, Appendable log
    )
        throws IOException
    {
        StringBuilder   sb      = new StringBuilder();
        boolean         first   = true;
        int             iterNum = 1;

        while (iter.hasNext())
        {
            URL url = iter.next();

            // Visit the next URL produced by the URL Iterator:
            if (log != null)
                log.append("Visiting: " + C.BYELLOW + url.toString() + C.RESET + '\n');

            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);

            // Make sure to insert the HTML header into the "index.html" main page.
            if (first)
            {
                // Do this only once.
                first = false;

                // Use the title of the page from the first URL returned by the iterator.
                // Use the URL from the first URL returned by the iterator.
                String titleStr = Util.textNodesString(Elements.getTitle(page));
                sb.append(
                    HEADER.replace("TITLE_STR", titleStr).replace("URL_STR", url.toString())
                );
            }

            // Retrieve the relevant part of the page
            Vector<HTMLNode> section = GETTER.apply(page);

            // The getter didn't get any HTML.
            if (section == null)
            {
                if (skipOnNotFoundException)
                {
                    if (log != null) log.append(
                        C.BRED + "SectionGet did not return any HTML.  As per request, " +
                        "Skipping...\n" + C.RESET
                    );
                    continue;
                }
                
                throw new HTMLNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did not retrieve any " +
                    "image nor any text from the photo-page being scraped.  Be sure to check " +
                    "that the specified HTML Elements (DIV, MAIN, SECTION, etc...) or whichever " +
                    "element was specified is actually present on the photo-collection web-site."
                );
            }

            // The HTML produced by the getter didn't have any photos.
            if (TagNodeCount.all(section, TC.OpeningTags, "img") == 0)
            {
                if (skipOnNotFoundException)
                {
                    if (log != null) log.append(
                        C.BRED + "HTML did not contain an <IMG>.  As per request, " +
                        "Skipping...\n" + C.RESET
                    );
                    continue;
                }

                throw new NodeNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did properly retrieve an " +
                    "HTML Section as expected.  Unfortunately, there were no <IMG ...> elements " +
                    "available in the section returned.  The purpose of this method is to " +
                    "spider and crawl photo-collection sites, and retrieve the images from a " +
                    "list of pages.  This page had no images; this is not allowed here."
                );
            }

            // Any HTML Element with these attributes will have those attributes removed:
            // class, id, style, title, itemtype, itemprop, alt
            int c = Attributes.remove
                (section, "class", "id", "style", "title", "itemtype", "itemprop", "alt").length;
            if (log != null) log.append(
                C.BCYAN + "\tAttributes.remove(section, \"class\", \"id\", \"style\", \"title\", " +
                "\"itemtype\", \"itemprop\", \"alt\")\n" + C.RESET +
                "\t\tRemoved Attributes from [" + c + "] nodes.\n"
            );

            // Any HTML Element with a "data-..." attribute will have that attribute(s) removed
            c = Attributes.removeData(section).length;
            if (log != null) log.append(
                C.BCYAN + "\tAttributes.removeData(section)\n" + C.RESET +
                "\t\tRemoved Data-Attributes from [" + c + "] nodes.\n"
            );

            // Any <!-- --> found in the Photo/Text section retrieved by the getter are
            // removed from the section.  Comments only add clutter - since they are almost
            // always auto-generated.
            c = Util.removeAllCommentNodes(section);
            if (log != null) log.append(
                C.BCYAN + "\tUtil.removeAllCommentNodes(section)\n" + C.RESET +
                "\t\tRemoved [" + c + "] CommentNodes.\n"
            );

            // If there are any <SCRIPT> ... </SCRIPT> blocks contained in this Photo/Text section
            // they shall be removed.  They are almost invariably links to other advertisements.
            // NOTE: There are photo-sites that have contained the <IMG> and text-description inside
            //       Java-Script blocks, but they are very, VERY rare in 99% of "Photo Bomb Sites."
            //       If attempting to scrape a photo-story site where the description or photo are
            //       wrapped in Java-Script or JSON, then this class WILL NOT WORK on that site.
            c = Util.removeScriptNodeBlocks(section);
            if (log != null) log.append(
                C.BCYAN + "\tUtil.removeScriptNodeBlocks(section)\n" + C.RESET +
                "\t\tRemoved [" + c + "] <SCRIPT> ... </SCRIPT> Blocks.\n"
            );

            // This class provides an extremely simple CSS Style for the photo and the description
            // and is the primary reason for using this class.  If there are any CSS
            // <STYLE> ... </STYLE> blocks, they are removed here, immediately.
            c = Util.removeStyleNodeBlocks(section);
            if (log != null) log.append(
                C.BCYAN + "\tUtil.removeStyleNodeBlocks(section)\n" + C.RESET +
                "\t\tRemoved [" + c + "] <STYLE> ... </STYLE> Blocks.\n"
            );

            // Removes <DIV>...</DIV> where "..." may only be white-space.
            // (Empty <DIV>, <SPAN>, <P>, <I>...).
            // NOTE: The concept of "Inclusive Empty" means that the only content between the
            //       opening <DIV> and closing </DIV> is either white-space or NOTHING.  This
            //       process of removing empty <DIV>...</DIV> pairs (and <SPAN>...</SPAN> pairs,
            //       along with the complete list of HTML Elements provided in the list) is 
            //       applied RECURSIVELY.  This means that if the removing of an empty <I>...</I>
            //       pair creates another empty Element Pair, that pair is removed next.
            c = Util.removeInclusiveEmpty
                (section, "div", "picture", "span", "p", "b", "i", "em");
            if (log != null) log.append(
                C.BCYAN + "\tUtil.removeInclusiveEmpty(section, \"div\", \"picture\", " +
                "\"span\", \"p\", \"b\", \"i\", \"em\")\n" + C.RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // Now removes all instances of <DIV>, </DIV>, <A>, </A>, <CENTER>, </CENTER>,
            // <SECTION>, </SECTION>, <PICTURE>, </PICTURE>, <SOURCE> and </SOURCE>.
            // Removing these is usually great.  The only HTML Elements that are really needed are
            // the Paragraph <P> Elements, and the <IMG SRC=...> Elements themselves.  Everything
            // else is always extraneous "HTML Bloat" and "Clutter."
            
            // NOTE: This process is not infallible, but it has worked on dozens and dozens of the
            //       "Extraneous Photo Collections" that repeatedly pop-up on major news sites at
            //       random times in their news feeds.

            c = TagNodeRemove.all
                (section, TC.Both, "div", "a", "center", "section", "picture", "source");
            if (log != null) log.append(
                C.BCYAN + "\tTagNodeRemove.all(section, TC.Both, \"div\", \"a\", \"center\", " +
                "\"section\", \"picture\", \"source\")\n" + C.RESET +
                "\t\tRemoved [" + c + "] HTML <DIV>, <A>, <CENTER>, <SECTION>, <PICTURE> and " +
                "<SOURCE> Elements.\n"
            );

            // Applies the user-provided text-node cleaner
            // This may remove all kinds of miscellaneous text-nodes.  Sometimes a little button
            // that says "Next" or "Next Photo" remains on the page.  The best way to create a 
            // TextCleaner instance is to run this class, and see if there is a common piece of
            // text that has been repeatedly inserted into the descriptions... and remove it!
            c = CLEANER.applyAsInt(section);
            if (log != null) log.append(
                C.BCYAN + "\tCLEANER.applyAsInt(section)\n" + C.RESET +
                "\t\tRemoved [" + c + "] Text-Node's.\n"
            );

            // Compacts adjoining TextNodes.  Often, after removing all of the HTML TagNode 
            // elements from the Vector - there are consecutive TextNode's left next to each other
            // in the Vector.  This Util method will just remove any two adjacent TextNode's,
            // copy the Strings out of both of them, and then unite them into a single TextNode.
            // Nothing more, nothing less.
            c = Util.compactTextNodes(section);
            if (log != null) log.append(
                C.BCYAN + "\tUtil.compactTextNodes(section)\n" + C.RESET +
                "\t\tCompacted [" + c + "] Text-Node's.\n"
            );

            // Trims the text inside of TextNode's, removes them if they were only white-space.
            // Often, after stripping out many, many nodes (in the previous steps), there are huge
            // patches of white-space.  This Util method simply calls the Java String method
            // String.trim() on each TextNode, and then removes that TextNode, and replaces it
            // with a trimmed version of the text.
            // NOTE: This will have no effect on text that is surrounded by HTML Paragraph (<P>
            //       ... </P>) elements.  Only TextNode's themselves are trimmed.  There is no
            //       need to worry about text "running together" as long as it is separated by
            //       <P> elements - which it always is in just about any photo-content website.
            c = Util.trimTextNodes(section, true);
            if (log != null) log.append(
                C.BCYAN + "\tUtil.trimTextNodes(section)\n" + C.RESET +
                "\t\tTrimmed [" + c + "] Text-Node's.\n"
            );

            // Performs another round of empty element checks.
            c = Util.removeInclusiveEmpty(section, "div", "span", "p", "b", "i", "em");
            if (log != null) log.append(
                C.BCYAN + "\tUtil.removeInclusiveEmpty(section, \"div\", \"span\", \"p\", \"b\", " +
                "\"i\", \"em\")\n" + C.RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // Inserts a new-line character before each <IMG>, <P> and </P> element.
            // Makes the final HTML generated more readable.
            int[] posArr = TagNodeFind.all(section, TC.Both, "img", "p");
            for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], NEW_LINE);

            // Inserts a \n<BR />\n (three nodes, the <BR />, and two new-lines '\n') after
            // each <IMG>.
            // This makes both the HTML more readable, and the page itself more readable.
            posArr = TagNodeFind.all(section, TC.OpeningTags, "img");
            for (int i=(posArr.length-1); i >= 0; i--) section.addAll(posArr[i] + 1, BR_NEWLINE);

            // Inserts a ' ' (space character) before and after each <B>, <I> and <EM> element
            posArr = TagNodeFind.all(section, TC.Both, "b", "i", "em");
            {
                for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i] + 1, SPACE);
                for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], SPACE);
            }

            // Resolve any partial URL's
            Links.resolveAllSRC(section, url, null, false);
    
            // NOTE: There is an annoying "special apostrophe" on a lot of these sites.
            sb.append(  "<DIV CLASS='PhotoSection'>\n" +
                        StrReplace.r(Util.pageToString(section), matchChars, replaceStrs) +
                        "\n</DIV>\n" +
                        "\n\n\n<!-- Photo Section Break Page " + 
                        StringParse.zeroPad(iterNum++) + "-->\n\n\n"
            );
        }

        return sb.toString() + "\n\n</BODY>\n</HTML>\n";
    }

    /**
     * <EMBED CLASS="external-html" DATA-FILE-ID="PBSLEGACY">
     *
     * This was the first version of photo-scraping.  Later versions followed - which is why
     * {@code '01'} is appended to this method's name.
     *
     * @param iter This iterator shall return all of the pages in the site.  Usually, it is just a
     * base {@code URL} followed by an integer - as in "page 1", "page 2", etc...
     *
     * @param emptyDIVs These are HTML divider ({@code <DIV>}) elements whose {@code 'class'}
     * attribute contains any of the {@code String's} in this list.  Such divider elements shall
     * be removed (inclusively).  This is a string-array, and it may be null - and if it is, it
     * will be ignored - but it may not contain null-values, or an exception will throw.
     *
     * @param textNodes <EMBED CLASS="external-html" DATA-FILE-ID="PBSTN">
     *
     * @param callTrimTextNodes <EMBED CLASS="external-html" DATA-FILE-ID="PBSCTTN">
     *
     * @param log Textual information shall be sent to the user/terminal using this log.  
     * <I><SPAN STYLE="color: red;">This parameter may <B>not</B> be null here.</SPAN></I>
     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
     *
     * @return A {@code Vector<String>}.  The HTML will be in {@code String} format, not
     * {@code HTMLNode} format.
     *
     * @see TagNodeRemove
     * @see Util
     * @see TagNodeRemoveInclusive
     * @see TextNodeRemove
     */
    @Deprecated
    public static Vector<String> get01(
            Iterator<URL> iter, String[] emptyDIVs, String[] textNodes,
            boolean callTrimTextNodes, Appendable log
        )
        throws IOException
    {
        Vector<String> ret = new Vector<>();

        while (iter.hasNext())
        {
            URL url = iter.next();
            log.append("Visiting URL: " + url.toString() + '\n');
            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "meta") + " meta tags.\n");
            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "link") + " link tags.\n");
            log.append("Removed " + Util.removeScriptNodeBlocks(page) + " Script Node Blocks.\n");
            log.append("Removed " + Util.removeStyleNodeBlocks(page) + " Style Node Blocks.\n");
            log.append("Removed " + Util.removeAllCommentNodes(page) + " Comment Nodes.\n");
            log.append("Removed " + TagNodeRemoveInclusive.all(page, "head", "noscript", "header") + " <HEAD>, <HEADER>, <NOSCRIPT> nodes.\n");

            // Removes all HTML <DIV> Elements where the "class" is in the String argument list
            if ((emptyDIVs != null) && (emptyDIVs.length > 0))
                log.append(
                    "Removed " + InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs) +
                    " HTML <DIV> Elements.\n"
                );

            // Removes HTML <DIV> or <P> elements that are empty, recursively
            log.append("Removed [" + Util.removeInclusiveEmpty(page, "p", "div") + "] Empty <DIV> and <P> elements.\n");

            // Removes all opening and closing elements of the following:
            // Does not remove the content between these elements
            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span") +
                                    " HTML Elements: div, a, html, body, li, ul, span.\n");

            // Removes TextNodes that contain any of the Strings in the argument list
            if ((textNodes != null) && (textNodes.length > 0))
                log.append("Removed " + TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes) + " TextNodes.\n");

            // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single
            // TextNode element.
            log.append("Removed " + Util.compactTextNodes(page) + " Nodes by compacting TextNodes.\n");

            // Long strings of spaces will be removed.
            // UNFORTUNATELY, New Lines will also disappear.
            if (callTrimTextNodes)
                log.append("Trimmed " + Util.trimTextNodes(page, true) + " Text Nodes.\n");

            // Remove id, class, and other attributes.
            log.append("Removed Attributes From " + Attributes.remove(page, "class", "id", "alt").length + " Nodes.\n");

            // Add some new-lines ('\n' - not <BR />!)
            int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");
            for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1);

            // Save this page's image to the return vector.
            ret.addElement(Util.pageToString(page));
        }
        // Return the Vector.  Each element of this Vector<String> will contain a picture and a
        // paragraph about that picture.  The images will not have been downloaded, nor will any
        // partial URL's have been resolved.
        return ret;
    }

    /**
     * <EMBED CLASS="external-html" DATA-FILE-ID="PBSLEGACY">
     *
     * The code here is carbon-copied from the loop above.  It is just the central loop body,
     * applied to a single page rather than iterating over many.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">CLONE NOTICE:</SPAN></B> This method modifies the
     * underlying {@code Vector}.  If you wish to avoid that, please call this method using
     * the following parameter: {@code (Vector<HTMLNode>) yourOriginalPage.clone()}.  Make sure to
     * use the {@code @SuppressWarnings("unchecked")} annotation.
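     * 
     * <BR /><BR />A brief sketch of that clone-then-call pattern ({@code 'originalPage'} is a
     * hypothetical variable holding the already-downloaded page):
     * 
     * <DIV CLASS="EXAMPLE">{@code
     * // Clone first (and place @SuppressWarnings("unchecked") on the enclosing method), so that
     * // 'originalPage' is left untouched while 'copy' is modified in place.
     * Vector<HTMLNode> copy = (Vector<HTMLNode>) originalPage.clone();
     * 
     * String html = PhotoBombSite.get02(copy, null, null, true, null);
     * }</DIV>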
     * 
     * @param page Any HTML page that has extraneous advertising and java-script junk.
     * 
     * @param emptyDIVs These are HTML divider ({@code <DIV>}) elements whose {@code 'class'}
     * attribute contains any of the strings in this list.  Such divider elements shall be
     * removed (inclusively).  This is a string-array, and it may be null - and if it is, it will
     * be ignored - but it may not contain null-values, or an exception will throw.
     * 
     * @param textNodes <EMBED CLASS="external-html" DATA-FILE-ID="PBSTN">
     * 
     * @param callTrimTextNodes <EMBED CLASS="external-html" DATA-FILE-ID="PBSCTTN">
     * 
     * @param log This is a log, and <B><I>it may be null.</I></B>  If it is null, it will be
     * ignored. 
     * 
     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
     *
     * @return A stripped-down version of the page, with most of the extraneous photo-bomb-site
     * junk removed.
     *
     * @throws IOException This method throws {@code IOException} simply because it prints to the
     * {@code java.lang.Appendable} interface, which requires that {@code IOException} be
     * monitored / checked in code that uses this interface.
     */
    @Deprecated
    public static String get02( Vector<HTMLNode> page, String[] emptyDIVs, String[] textNodes,
                                boolean callTrimTextNodes, Appendable log)  throws IOException
    {
            int c = TagNodeRemove.all(page, TC.Both, "meta");
            if (log != null) log.append("Removed " + c + " meta tags.\n");

            c = TagNodeRemove.all(page, TC.Both, "link");
            if (log != null) log.append("Removed " + c + " link tags.\n");

            c = Util.removeScriptNodeBlocks(page);
            if (log != null) log.append("Removed " + c + " Script Node Blocks.\n");

            c = Util.removeStyleNodeBlocks(page);
            if (log != null) log.append("Removed " + c + " Style Node Blocks.\n");

            c = Util.removeAllCommentNodes(page);
            if (log != null) log.append("Removed " + c + " Comment Nodes.\n");

            c = TagNodeRemoveInclusive.all(page, "head", "noscript", "header");
            if (log != null) log.append("Removed " + c + " <HEAD>, <NOSCRIPT>, <HEADER> nodes.\n");

            // Removes all HTML <DIV> Elements where the "class" is in the String argument list
            if ((emptyDIVs != null) && (emptyDIVs.length > 0))
            {   
                c = InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs);
                if (log != null) log.append("Removed " + c + " HTML <DIV> Elements.\n");
            }

            // Removes HTML <DIV> or <P> elements that are empty, recursively
            c = Util.removeInclusiveEmpty(page, "p", "div");
            if (log != null) log.append("Removed [" + c + "] Empty <DIV> and <P> elements.\n");

            // Removes all opening and closing elements of the following:
            // Does not remove the content between these elements
            c = TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span");
            if (log != null) log.append("Removed " + c + " HTML Elements: div, a, html, body, li, ul, span.\n");

            // Removes TextNodes that contain any of the Strings in the argument list
            if ((textNodes != null) && (textNodes.length > 0))
            {
                c = TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes);
                if (log != null) log.append("Removed " + c + " TextNodes.\n");
            }

            // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single
            // TextNode element.
            c = Util.compactTextNodes(page);
            if (log != null) log.append("Removed " + c + " Nodes by compacting TextNodes.\n");

            // Long strings of spaces will be removed.
            // UNFORTUNATELY, New Lines will also disappear.
            if (callTrimTextNodes)
            {
                c = Util.trimTextNodes(page, true);
                if (log != null) log.append("Trimmed " + c + " Text Nodes.\n");
            }

            // Remove id, class, and other attributes.
            c = Attributes.remove(page, "class", "id", "alt").length;
            if (log != null) log.append("Removed Attributes From " + c + " Nodes.\n");

            // Add some new-lines ('\n' - not <BR />!)
            int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");
            for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1);

            return Util.pageToString(page);
    }

    /**
     * An <CODE>Iterator</CODE> that is intended to be used for retrieving the page-{@code URL's}
     * of the photo-site - one {@code URL} for each page in the collection.
     * 
     * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="PBSURLI">
     */
    public static class URLIterator implements Iterator<URL>
    {
        private int start, end, cur;
        private IntFunction<URL> getter;

        /**
         * Perhaps as more of these "wonderful" photo-bomb sites are published, more versions
         * of this iterator will follow.  Right now, the easiest way to iterate through the
         * forty or fifty pages of photos is to indicate the start and end page-numbers,
         * <I><B>and require the user/programmer to provide a lambda function</B></I> "making" the
         * URL out of the current page-number.
         * 
         * @param start This is the integer that is the "first" page of the site.
         * 
         * <DIV CLASS="HTML">{@code 
         * <!-- This URL has a lot of "Cute Little Bears" being saved in Siberia.
         *      The way you can scrape all 39 photos quickly is to iterate through
         *      each of the PHP calls via the value passed to "page=" -->
         * <A HREF='https://www.jerusalemonline.com/view/bear-cubs-jol/?page=1'>
         * }</DIV>
         * 
         * @param end This is the integer that contains the last page of the photo-site collection.
         * In the particular case of the "Bears who lost their momma in Siberia" - the last page
         * that is currently available is page number 39.
         * 
         * @param urlGetter Any programmer familiar with Java Lambda Functions should know this
         * is just Java's version of a "Function Pointer" from C and C++.  This function
         * pointer must be a function that takes as input an integer (which is a page number), and
         * returns as output a URL.  This will be called once for each page on the site.
         * 
         * <DIV CLASS="EXAMPLE">{@code
         * // Generally, one might think this should be a single-line lambda expression.  Though
         * // single-line function pointers are quite common, because calling the constructor of a
         * // URL can generate a MalformedURLException, and because these exceptions are not 
         * // sub-classes of RuntimeException, this short lambda has to include a try-catch.  Here,
         * // the checked exception is simply converted to NullPointerException - which is
         * // unchecked.  The reality is that if proper values are entered for start and end, no
         * // exceptions will occur.
         * URLIterator iter = new URLIterator(1, 39, (int curPage) ->
         * {   
         *     try
         *         { return new URL(urlStr + curPage); }
         *     catch (MalformedURLException e)
         *         { throw new NullPointerException("Malformed URL Exception" + e.toString()); }
         * });
         * }</DIV>
         */
        public URLIterator(int start, int end, IntFunction<URL> urlGetter)
        {
            this.getter = urlGetter;
            this.start  = start;
            this.end    = end;
            this.cur    = start - 1;
        }

        /**
         * Just checks if there are more elements available.
         * @return <B>TRUE</B> if there are more pages to check, and <B>FALSE</B> otherwise.
         */
        public boolean hasNext() { return cur < end; }

        /**
         * Meets the requirements of Java's standard {@code Iterator} interface.
         * @return This shall return the "next" {@code URL} element from the Photo Site.
         */
        public URL next()
        {
            cur++;
            if (cur > end) throw new NoSuchElementException(
                "The current iteration counter is: " + cur +
                " but unfortunately, the max-page-number you passed to the constructor is: " + end 
            );
            return getter.apply(cur);
        }

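        /**
         * Convenience builder for the most common photo-site case - pages whose {@code URL's}
         * differ only by a trailing page-number, built as {@code baseURLStr + pageNumber}.
         * 
         * <DIV CLASS="EXAMPLE">{@code
         * // Hypothetical: produces .../photo-story/?page=1 through .../photo-story/?page=39
         * URLIterator iter = URLIterator.usual("https://news.example.com/photo-story/?page=", 1, 39);
         * }</DIV>
         * 
         * @param baseURLStr The {@code URL} prefix to which each page-number is appended.
         * @param startPageNum The first page-number of the photo-site.
         * @param lastPageNum The last page-number of the photo-site.
         * @return An iterator that produces one {@code URL} for each page, from
         * {@code startPageNum} through {@code lastPageNum}, inclusive.
         * @throws MalformedURLException If {@code baseURLStr} does not parse as a valid {@code URL}.
         */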
        public static URLIterator usual(String baseURLStr, int startPageNum, int lastPageNum)
            throws MalformedURLException
        {
            CHECK_EXCEPTIONS(baseURLStr, startPageNum, lastPageNum);

            return new URLIterator(startPageNum, lastPageNum, (int curPage) ->
            {   
                try
                    { return new URL(baseURLStr + curPage); }
                catch (MalformedURLException e)
                    { throw new NullPointerException("Malformed URL Exception" + e.toString()); }
                    // CHEAP-TRICK: Compile-Time Exception to Runtime Exception...  However, the 
                    // base-URL has already been tested, and therefore this exception NEEDS to be 
                    // suppressed...  NOTE: This exception should *NEVER* throw...
            });
        }

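        /**
         * Convenience builder for sites whose page-number sits in the middle of the {@code URL},
         * built as {@code url + pageNumber + appendParamStr}.
         * 
         * @param url The {@code URL} prefix placed before the page-number.
         * @param appendParamStr The text appended after the page-number (extra query-parameters,
         * a file-suffix, etc.).
         * @param startPageNum The first page-number of the photo-site.
         * @param lastPageNum The last page-number of the photo-site.
         * @return An iterator that produces one {@code URL} for each page.
         * @throws MalformedURLException If the {@code URL} built using page-number {@code 1}
         * does not parse as a valid {@code URL}.
         */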
        public static URLIterator usual
            (String url, String appendParamStr, int startPageNum, int lastPageNum)
            throws MalformedURLException
        {
            CHECK_EXCEPTIONS(url + 1 + appendParamStr, startPageNum, lastPageNum);

            return new URLIterator(startPageNum, lastPageNum, (int curPage) ->
            {   
                try
                    { return new URL(url + curPage + appendParamStr); }
                catch (MalformedURLException e)
                    { throw new NullPointerException("Malformed URL Exception" + e.toString()); }
                    // CHEAP-TRICK: Compile-Time Exception to Runtime Exception...  However, the 
                    // base-URL has already been tested, and therefore this exception NEEDS to be 
                    // suppressed...  NOTE: This exception should *NEVER* throw...
            });
        }

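        /**
         * Fail-fast validation used by the {@code usual(...)} builders.  Verifies that the
         * page-number range is sane, and that the provided {@code String} parses as a {@code URL}.
         * 
         * @param url A sample {@code URL}, as a {@code String}, to validate.
         * @param startPageNum The first page-number; must be non-negative.
         * @param lastPageNum The last page-number; must be non-negative, and not less than
         * {@code startPageNum}.
         * @throws MalformedURLException If {@code url} does not parse as a valid {@code URL}.
         * @throws IllegalArgumentException If either page-number is negative, or if
         * {@code startPageNum} is greater than {@code lastPageNum}.
         * @throws NullPointerException If {@code url} is null.
         */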
        public static void CHECK_EXCEPTIONS(String url, int startPageNum, int lastPageNum)
            throws MalformedURLException
        {
            // FAIL-FAST: Check user input before the iterator starts iterating.
            if (startPageNum < 0) throw new IllegalArgumentException(
                "The value passed to the starting-page-number parameter [" + startPageNum + "], " +
                "was negative.  Most often it is 1 or, possibly, 0."
            );

            if (lastPageNum < 0) throw new IllegalArgumentException(
                "The value passed to the ending-page-number parameter [" + lastPageNum + "], was negative."
            );

            if (startPageNum > lastPageNum) throw new IllegalArgumentException(
                "The value passed to the starting-page-number parameter [" + startPageNum + "], was " +
                "greater than the value passed to the ending-page-number parameter [" + lastPageNum + "]."
            );

            if (url == null) throw new NullPointerException
                ("A null value was passed as a url.");

            // FAIL-FAST:   This should be a valid URL as a String.  This invocation will throw the
            //              MalformedURLException if it is not.
            new URL(url);
        }
    }
}