// PhotoBombSite.java.html

package Torello.HTML.Tools.Images;

import Torello.Java.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import static Torello.Java.C.*;

import java.util.*;
import java.util.function.*;
import java.net.*;
import java.io.*;

/**
 * An <B>experimental class</B> that can be used (albeit with considerable effort) to download
 * the photo-montages that are published on major news-network web-sites.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PBS>
 */
public class PhotoBombSite
{
    private PhotoBombSite() { }

    // Many of the scraped sites use typographic ("curly") quotation marks and apostrophes.
    // 'matchChars' lists those UNICODE characters and 'replaceStrs' lists the plain ASCII
    // replacement for each one.  Both arrays are handed to StrReplace.r(...) when a section is
    // converted to text (presumably matched up by array index - verify against StrReplace).

    private static final char[]      matchChars  = { '“', '”', '’' };
    private static final String[]    replaceStrs = { "\"", "\"", "'" };

    // A TextNode holding a single new-line character.
    // NOTE: This is not the same as an HTML <BR /> Element
    private static final TextNode NEW_LINE = new TextNode("\n");

    // The parsed tokens of the sequence: new-line, <BR />, new-line.
    // Inserted after each <IMG> element.
    private static final Vector<HTMLNode> BR_NEWLINE = HTMLPage.getPageTokens("\n<BR />\n", false);

    // A TextNode holding a single space character
    private static final TextNode SPACE = new TextNode(" ");

    /**
     * This is the HTML header that is inserted at the top of the generated page.  The field is
     * not {@code final}, and may be replaced with a customized header.
     *
     * <BR /><BR />The header contains two place-holder sub-strings which are substituted by
     * method {@code PRIMARY} using {@code String.replace}:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code TITLE_STR} - replaced by the title-text of the first page visited.</LI>
     * <LI>{@code URL_STR} - replaced by the {@code URL} of the first page visited.</LI>
     * </UL>
     *
     * <BR />If a customized header omits one of these sub-strings, the corresponding value
     * simply will not appear in the output.  No exception is thrown in that case, since
     * replacing a sub-string that is not present leaves the text unchanged.
     */
    public static String HEADER = "" +
        "<HTML>\n<HEAD>\n<TITLE>TITLE_STR</TITLE>\n"                +
        "<META charset='utf-8'>\n"                                  +
        "<STYLE TYPE='text/css'>\n"                                 +
        "H1, H2, H3, h4     { color:            red;         \n"    +
        "                     margin: 1em 1em 1em 1em;      }\n"    +
        "BODY               { margin:           2em;        }\n"    +
        "P                  { margin: 1.5em 1em 1.5em 1em;   \n"    +
        "                     max-width:        75%;        }\n"    +
        "IMG                { margin: 1em;                   \n"    +
        "                     max-height:       90%;         \n"    +
        "                     max-width:        90%;        }\n"    +
        "DIV.PhotoSection   { margin: 7em 1em 1em 1em;       \n"    +
        "                     background:       lightgray;   \n"    +
        "                     border-radius:    2em;         \n"    +
        "                     padding:          1.5em;      }\n"    +
        "</STYLE>\n</HEAD>\n<BODY>\n"                               +
        "<H1>TITLE_STR</H1>\n"                                      +
        "<H2>Scraped From:</H2>\n"                                  +
        "<H3><A HREF='URL_STR' TARGET=_blank>\nURL_STR</A></H3>\n"  +
        "<BR /><BR /><BR />\n\n";


    /**
     * Visits every page produced by a {@code URLIterator}, extracts the photo-and-caption
     * section of each page using a caller-provided "Getter", strips the HTML clutter out of that
     * section, and concatenates all of the cleaned sections into a single HTML page.
     *
     * <BR /><BR /><B><I><SPAN STYLE="color: red;">This one works much better</SPAN></I></B>.
     * This is because it accepts a "Getter" that asks the user to find the content on a page.
     * For all Photo Bomb sites (and for likely 99% of websites in general) - the relevant HTML
     * section is wrapped in an HTML {@code <DIV>, <SECTION>, <ARTICLE>} or {@code <MAIN>}
     * element open-close pair.  <I>If the version {@code get01(...)} or {@code get02(...)} were
     * dismal failures, then this method is much more likely to produce better results.</I>
     *
     * <BR /><BR /><B>NOTE:</B> This does mean that for this method to work, the onus is on the
     * user to provide a "Getter" <B><I>by inspecting the HTML (the "View Source" Button in your
     * browser)</I></B> to retrieve the short HTML section that actually has the picture and the
     * notes.
     *
     * <BR /><BR /><B>EXAMPLE NOTE:</B> The example below is one of thousands of short stories
     * with little pictures attached that are served up by all the news networks and search
     * engines.  This one is a collection of photos about the wild west.  If one looks at the
     * HTML, the programmer would (hopefully) notice that each photo-{@code URL} has its photo
     * wrapped in an HTML element: {@code <SECTION ID="mvp-content-main">}.  Notice, in the
     * example, the {@code 'getter'} that is created to retrieve the photos.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSPRIME>
     *
     * @param iter An instance of {@code URLIterator} that iterates each page of the site.
     *
     * @param GETTER This method should retrieve the subsection of HTML <I>on each page</I>
     * that contains the photo and caption.  It ought to be a one line statement that identifies
     * how the photo is "wrapped" in HTML.  An "Inclusive" method on an HTML {@code '<DIV>',
     * '<SECTION>...</SECTION>,' '<MAIN>...</MAIN>'} or {@code '<ARTICLE>...</ARTICLE>'} is 
     * "99% likely" the right way to do this.
     *
     * @param CLEANER This ought to be a one line command that removes extraneous pieces of
     * text.
     *
     * @param skipOnNotFoundException When {@code TRUE}, a page for which the {@code 'GETTER'}
     * returns null, or whose section contains no {@code <IMG>} element, is skipped (and a
     * message is logged) instead of causing an exception to throw.  Some sites have a missing
     * photo returned here and there.
     *
     * @param log This is a log parameter, and may be used to send log information to the 
     * terminal.  This parameter may be null, and if it is, it shall be ignored.
     *
     * @return This returns the HTML as a {@code String}.  The text is field {@link #HEADER}
     * (with its place-holders filled in using the first page visited), followed by one
     * {@code <DIV CLASS='PhotoSection'>} per page that was not skipped, followed by the closing
     * {@code </BODY>} and {@code </HTML>} tags.  If the iterator produces no {@code URL's} at
     * all, only the closing tags are returned.
     * 
     * @throws HTMLNotFoundException If the provided {@code 'GETTER'} does not find an HTML
     * section or element - <I>and returns null instead</I> - then rather than throwing a
     * {@code NullPointerException}, this exception shall throw (unless
     * {@code 'skipOnNotFoundException'} is {@code TRUE}).  If this exception does throw,
     * make sure to check and re-check the provided getter to make certain that the appropriate
     * Node-Search classes and methods were used in order to properly retrieve <I>the section that 
     * actually has the photo and the accompanying text.</I>
     * 
     * @throws NodeNotFoundException If the {@code 'GETTER'} provided does successfully retrieve
     * a portion of the photo-page, but no HTML {@code <IMG SRC=...>} is found or identified, then
     * this exception will throw (unless {@code 'skipOnNotFoundException'} is {@code TRUE}).
     * Make sure that when writing the {@code 'GETTER'}, that the appropriate HTML Element
     * ({@code <DIV ...>, <MAIN>, <SECTION>, <ARTICLE>}, etc...) that is selected actually wraps
     * the photo on the page being downloaded.
     *
     * @throws IOException Declared because pages are downloaded, and because this method writes
     * to a {@code java.lang.Appendable} whose {@code append} method declares this exception.
     */
    public static String PRIMARY(
        URLIterator iter, SectionGet GETTER, TextCleaner CLEANER, 
        boolean skipOnNotFoundException, Appendable log
    )
        throws IOException
    {
        StringBuilder   sb      = new StringBuilder();
        boolean         first   = true;
        int             iterNum = 1;

        while (iter.hasNext())
        {
            URL url = iter.next();

            // Visit the next URL produced by the URL Iterator:
            appendLog(log, "Visiting: " + BYELLOW + url.toString() + RESET + '\n');
            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);

            // Make sure to insert the HTML header into the "index.html" main page.
            if (first)
            {
                // Do this only once.
                first = false;

                // Both the title and the URL written into the header are taken from the
                // first page returned by the iterator.
                String titleStr = Util.textNodesString(Elements.getTitle(page));
                sb.append(
                    HEADER.replace("TITLE_STR", titleStr).replace("URL_STR", url.toString())
                );
            }

            // Retrieve the relevant part of the page
            Vector<HTMLNode> section = GETTER.apply(page);

            // The getter didn't get any HTML.
            if (section == null)
            {
                if (skipOnNotFoundException)
                {
                    appendLog(
                        log,
                        BRED + "SectionGet did not return any HTML.  As per request, " +
                        "Skipping...\n" + RESET
                    );
                    continue;
                }
                
                throw new HTMLNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did not retrieve any " +
                    "image nor any text from the photo-page being scraped.  Be sure to check " +
                    "that the specified HTML Elements (DIV, MAIN, SECTION, etc...) or whichever " +
                    "element was specified is actually present on the photo-collection web-site."
                );
            }

            // The HTML produced by the getter didn't have any photos.
            if (TagNodeCount.all(section, TC.OpeningTags, "img") == 0)
            {
                if (skipOnNotFoundException)
                {
                    appendLog(
                        log,
                        BRED + "HTML did not contain an <IMG>.  As per request, " +
                        "Skipping...\n" + RESET
                    );
                    continue;
                }

                throw new NodeNotFoundException(
                    "The lambda or method passed to parameter 'GETTER' did properly retrieve an " +
                    "HTML Section as expected.  Unfortunately, there were no <IMG ...> elements " +
                    "available in the section returned.  The purpose of this method is to " +
                    "spider and crawl photo-collection sites, and retrieve the image of a list " +
                    "of pages.  This page had no images; this is not allowed here."
                );
            }

            // Any HTML Element with these attributes will have those attributes removed:
            // class, id, style, title, itemtype, itemprop, alt
            int c = Attributes.remove
                (section, "class", "id", "style", "title", "itemtype", "itemprop", "alt").length;
            appendLog(
                log,
                BCYAN + "\tAttributes.remove(section, \"class\", \"id\", \"style\", \"title\", " +
                "\"itemtype\", \"itemprop\", \"alt\")\n" + RESET +
                "\t\tRemoved Attributes from [" + c + "] nodes.\n"
            );

            // Any HTML Element with a "data-..." attribute will have that attribute(s) removed
            c = Attributes.removeData(section).length;
            appendLog(
                log,
                BCYAN + "\tAttributes.removeData(section)\n" + RESET +
                "\t\tRemoved Data-Attributes from [" + c + "] nodes.\n"
            );

            // Any <!-- --> found in the Photo/Text section retrieved by the getter are
            // removed from the section.  Comments only add clutter - since they are almost
            // always auto-generated.
            c = Util.Remove.allCommentNodes(section);
            appendLog(
                log,
                BCYAN + "\tUtil.Remove.allCommentNodes(section)\n" + RESET +
                "\t\tRemoved [" + c + "] CommentNodes.\n"
            );

            // If there are any <SCRIPT> ... </SCRIPT> blocks contained in this Photo/Text section
            // they shall be removed.  They are almost invariably links to other advertisements.
            // NOTE: There are photo-sites that have contained the <IMG> and text-description inside
            //       Java-Script blocks, but they are very, VERY rare in 99% of "Photo Bomb Sites."
            //       If attempting to scrape a photo-story site where the description or photo are
            //       wrapped in Java-Script or JSON, then this class WILL NOT WORK on that site.
            c = Util.Remove.scriptNodeBlocks(section);
            appendLog(
                log,
                BCYAN + "\tUtil.Remove.scriptNodeBlocks(section)\n" + RESET +
                "\t\tRemoved [" + c + "] <SCRIPT> ... </SCRIPT> Blocks.\n"
            );

            // This class provides an extremely simple CSS Style for the photo and the description
            // and is the primary reason for using this class.  If there are any CSS
            // <STYLE> ... </STYLE> blocks, they are removed here, immediately.
            c = Util.Remove.styleNodeBlocks(section);
            appendLog(
                log,
                BCYAN + "\tUtil.Remove.styleNodeBlocks(section)\n" + RESET +
                "\t\tRemoved [" + c + "] <STYLE> ... </STYLE> Blocks.\n"
            );

            // Removes <DIV>...</DIV> where "..." may only be white-space.
            // (Empty <DIV>, <SPAN>, <P>, <I>...).
            // NOTE: The concept of "Inclusive Empty" means that the only content between the
            //       opening <DIV> and closing </DIV> is either white-space or NOTHING.  This
            //       process of removing empty <DIV>...</DIV> pairs (and <SPAN>...</SPAN> pairs,
            //       along with the complete list of HTML Elements provided in the list) is 
            //       applied RECURSIVELY.  This means that if the removing of an empty <I>...</I>
            //       pair creates another empty Element Pair, that pair is removed next.
            c = Util.Remove.inclusiveEmpty
                (section, "div", "picture", "span", "p", "b", "i", "em");
            appendLog(
                log,
                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"picture\", " +
                "\"span\", \"p\", \"b\", \"i\", \"em\")\n" + RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // Now removes all instances of <DIV>, </DIV>, <A>, </A>, <CENTER>, </CENTER>,
            // <SECTION>, </SECTION>, <PICTURE>, </PICTURE> and <SOURCE>.
            // Removing these is usually great.  The only HTML Elements that are really needed are
            // the Paragraph <P> Elements, and the <IMG SRC=...> Elements themselves.  Everything
            // else is always extraneous "HTML Bloat" and "Clutter."
            
            // NOTE: This process is not infallible, but it has worked on dozens and dozens of the
            //       "Extraneous Photo Collections" that repeatedly pop-up on major news sites at
            //       random times in their news feeds.

            c = TagNodeRemove.all
                (section, TC.Both, "div", "a", "center", "section", "picture", "source");
            appendLog(
                log,
                BCYAN + "\tTagNodeRemove.all(section, TC.Both, \"div\", \"a\", \"center\", " +
                "\"section\", \"picture\", \"source\")\n" + RESET +
                "\t\tRemoved [" + c + "] HTML <DIV>, </DIV>, <A>, </A> Elements.\n"
            );

            // Applies the user-provided text-node cleaner
            // This may remove all kinds of miscellaneous text-nodes.  Sometimes a little button
            // that says "Next" or "Next Photo" remains on the page.  The best way to create a 
            // TextCleaner instance is to run this class, and see if there is a common piece of
            // text that has been repeatedly inserted into the descriptions... and remove it!
            c = CLEANER.applyAsInt(section);
            appendLog(
                log,
                BCYAN + "\tCLEANER.applyAsInt(section)\n" + RESET +
                "\t\tRemoved [" + c + "] Text-Node's.\n"
            );

            // Compacts Adjoining textNodes.  Often, after removing all of the HTML TagNode 
            // elements from the Vector - there are consecutive TextNode's left next to each other
            // in the Vector.  This Util method will just remove any two adjacent TextNode's, and
            // copy the Strings out of both them, and then unite them into a single TextNode.
            // Nothing more, nothing less.
            c = Util.compactTextNodes(section);
            appendLog(
                log,
                BCYAN + "\tUtil.compactTextNodes(section)\n" + RESET +
                "\t\tCompacted [" + c + "] Text-Node's.\n"
            );

            // Trims the text inside of TextNode's, removes them if they were only white-space
            // Often after stripping out many many nodes (in the previous steps), there are huge
            // patches of white-space.  This Util method simply calls the Java String method
            // String.trim() on each TextNode, and then removes that TextNode, and replaces it
            // with a trimmed version of the text.
            // NOTE: This will have no affect on text that is surrounded by HTML Paragraph (<P>
            //       ... </P>) elements.  Only TextNode's themselves are trimmed.  There is no
            //       need to worry about text "running together" as long as it is separated by
            //       <P> elements - which it always is in just about any photo-content website.
            c = Util.trimTextNodes(section, true);
            appendLog(
                log,
                BCYAN + "\tUtil.trimTextNodes(section)\n" + RESET +
                "\t\tTrimmed [" + c + "] Text-Node's.\n"
            );

            // Performs another round of empty element checks.
            c = Util.Remove.inclusiveEmpty(section, "div", "span", "p", "b", "i", "em");
            appendLog(
                log,
                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"span\", \"p\", \"b\", " +
                "\"i\", \"em\")\n" + RESET +
                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
            );

            // inserts a new-line character before each <IMG>, <P>, and </P> element.
            // Makes the final HTML generated more readable.
            // The positions are walked from last to first, so that an insertion never shifts a
            // position that has not been processed yet.
            int[] posArr = TagNodeFind.all(section, TC.Both, "img", "p");
            for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], NEW_LINE);

            // inserts a \n<BR />\n (three nodes, the <BR />, and two new-lines '\n') after
            // each <IMG>.
            // This makes both the HTML more readable, and the page itself more readable
            posArr = TagNodeFind.all(section, TC.OpeningTags, "img");
            for (int i=(posArr.length-1); i >= 0; i--) section.addAll(posArr[i] + 1, BR_NEWLINE);

            // inserts a ' ' (space character) before and after each <B>, <I> and <EM> tag
            // (opening and closing).  Both spaces for a tag are inserted in the same backward
            // pass: running two separate passes over the same position-array would use indices
            // that the first pass had already shifted, and misplace the "before" spaces.
            posArr = TagNodeFind.all(section, TC.Both, "b", "i", "em");
            for (int i=(posArr.length-1); i >= 0; i--)
            {
                section.add(posArr[i] + 1, SPACE);  // space after the tag
                section.add(posArr[i], SPACE);      // space before the tag
            }

            // Resolve any partial URL's
            Links.resolveAllSRC(section, url, null, false);
    
            // NOTE: There is an annoying "special apostrophe" on a lot of them.
            sb.append(  "<DIV CLASS='PhotoSection'>\n" +
                        StrReplace.r(Util.pageToString(section), matchChars, replaceStrs) +
                        "\n</DIV>\n" +
                        "\n\n\n<!-- Photo Section Break Page " + 
                        StrPrint.zeroPad(iterNum++) + "-->\n\n\n"
            );
        }

        return sb.toString() + "\n\n</BODY>\n</HTML>\n";
    }

    // Appends 'message' to 'log'.  Does nothing when 'log' is null, since the log is optional.
    private static void appendLog(Appendable log, String message) throws IOException
    { if (log != null) log.append(message); }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
     *
     * Downloads each page produced by the iterator, strips a fixed list of "clutter" elements
     * out of it, and saves what remains as a {@code String}.  This was the first version of
     * photo-scraping; others followed, which is why {@code '01'} is appended to the name of
     * this method.
     *
     * @param iter This iterator shall return all of the pages in the site.  Usually, it is just a
     * base {@code URL} followed by an integer - as in "page 1" " page 2" ... etc...
     *
     * @param emptyDIVs {@code 'class'} attribute values that identify HTML divider elements to
     * remove.  Any {@code <DIV>} whose {@code 'class'} attribute contains one of these
     * {@code String's} shall be removed (inclusively).  This array may be null or empty - and
     * if it is, this step is skipped - but it may not contain null-values, or an exception will
     * throw.
     *
     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
     *
     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
     *
     * @param log Textual information shall be sent to the user/terminal using this log.  
     * <I><SPAN STYLE="color: red;">This parameter may <B>not</B> be null here.</SPAN></I>
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return A {@code Vector<String>} holding one element per page visited.  The HTML will be
     * in {@code String} format, not {@code HTMLNode} format.  Images are not downloaded, and
     * partial {@code URL's} are not resolved.
     *
     * @throws IOException Declared because pages are downloaded, and because this method writes
     * to a {@code java.lang.Appendable}.
     *
     * @see TagNodeRemove
     * @see Util
     * @see TagNodeRemoveInclusive
     * @see TextNodeRemove
     */
    @Deprecated
    public static Vector<String> get01(
            Iterator<URL> iter, String[] emptyDIVs, String[] textNodes,
            boolean callTrimTextNodes, Appendable log
        )
        throws IOException
    {
        // The two optional filter-lists never change while iterating; decide once whether
        // each of them is to be applied.
        final boolean hasDivClasses     = (emptyDIVs != null) && (emptyDIVs.length != 0);
        final boolean hasTextFilters    = (textNodes != null) && (textNodes.length != 0);

        Vector<String> results = new Vector<>();

        while (iter.hasNext())
        {
            final URL pageUrl = iter.next();
            log.append("Visiting URL: " + pageUrl.toString() + '\n');

            final Vector<HTMLNode> tokens = HTMLPage.getPageTokens(pageUrl, false);

            // Holds the count reported by each of the clean-up steps below
            int n;

            n = TagNodeRemove.all(tokens, TC.Both, "meta");
            log.append("Removed " + n + " meta tags.\n");

            n = TagNodeRemove.all(tokens, TC.Both, "link");
            log.append("Removed " + n + " link tags.\n");

            n = Util.Remove.scriptNodeBlocks(tokens);
            log.append("Removed " + n + " Script Node Blocks.\n");

            n = Util.Remove.styleNodeBlocks(tokens);
            log.append("Removed " + n + " Script Style Blocks.\n");

            n = Util.Remove.allCommentNodes(tokens);
            log.append("Removed " + n + " Comment Nodes.\n");

            n = TagNodeRemoveInclusive.all(tokens, "head", "noscript", "header");
            log.append("Removed " + n + " <HEAD>, <HEADER>, <NOSCRIPT> nodes.\n");

            // <DIV> elements whose "class" matches one of the caller's strings are dropped,
            // along with everything inside of them.
            if (hasDivClasses)
            {
                n = InnerTagRemoveInclusive.all
                    (tokens, "div", "class", TextComparitor.C, emptyDIVs);
                log.append("Removed " + n + " HTML <DIV> Elements.\n");
            }

            // Empty <P> and <DIV> pairs are removed, recursively
            n = Util.Remove.inclusiveEmpty(tokens, "p", "div");
            log.append("Removed [" + n + "] Empty <DIV> and <P> elements.\n");

            // Only the opening and closing tags themselves are removed here;
            // the content between them is kept.
            n = TagNodeRemove.all
                (tokens, TC.Both, "div", "a", "html", "body", "li", "ul", "span");
            log.append("Removed " + n + " HTML Elements: div, a, html, body, li, ul, span.\n");

            // TextNodes containing any of the caller's strings are dropped
            if (hasTextFilters)
            {
                n = TextNodeRemove.all(tokens, TextComparitor.CN_CI, textNodes);
                log.append("Removed " + n + " TextNodes.\n");
            }

            // With so many nodes gone, adjacent TextNodes are merged into single TextNodes
            n = Util.compactTextNodes(tokens);
            log.append("Removed " + n + " Nodes by compacting TextNodes.\n");

            // Long strings of spaces will be removed.
            // UNFORTUNATELY, New Lines will also disappear.
            if (callTrimTextNodes)
            {
                n = Util.trimTextNodes(tokens, true);
                log.append("Removed " + n + " Trimming Text Nodes.\n");
            }

            // Strip the class, id and alt attributes
            n = Attributes.remove(tokens, "class", "id", "alt").length;
            log.append("Removed Attributes From " + n + " Nodes.\n");

            // Put a new-line ('\n' - not <BR />!) after each of the listed closing tags.
            // Going from the last position to the first keeps the remaining positions valid.
            final int[] closers = TagNodeFind.all
                (tokens, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");

            int k = closers.length;
            while (--k >= 0) tokens.add(closers[k] + 1, NEW_LINE);

            // One String per page goes into the returned Vector
            results.add(Util.pageToString(tokens));
        }

        // Each element of this Vector<String> will contain a picture and paragraph about that
        // picture.  The images will not have been downloaded, nor any partially resolved URL's
        // resolved.
        return results;
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
     *
     * Applies to a single, already downloaded page the same sequence of clean-up steps that
     * {@code get01} applies inside its loop.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">CLONE NOTICE:</SPAN></B> This method modifies the
     * underlying {@code Vector}.  If you wish to avoid that, please call this method using
     * the following parameter: {@code (Vector<HTMLNode>) yourOriginalPage.clone()}.  Make sure to
     * use the {@code SuppressWarnings("unchecked")} annotation.
     * 
     * @param page Any HTML page that has extraneous advertising and java-script junk.
     * 
     * @param emptyDIVs {@code 'class'} attribute values that identify HTML divider elements to
     * remove.  Any {@code <DIV>} whose {@code 'class'} attribute contains one of these strings
     * shall be removed (inclusively).  This array may be null or empty - and if it is, this
     * step is skipped - but it may not contain null-values, or an exception will throw.
     * 
     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
     * 
     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
     * 
     * @param log This is a log, and <I><B>it may be null.</B></I>  If it is null, it will be
     * ignored. 
     * 
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return a Stripped down version of the page, with most extraneous photo-bomb site junk
     * removed.
     *
     * @throws IOException This method throws {@code IOException} simply because it prints to the
     * {@code interface java.lang.Appendable}, which requires that {@code IOException} be
     * monitored / checked in code that uses this interface.
     */
    @Deprecated
    public static String get02( Vector<HTMLNode> page, String[] emptyDIVs, String[] textNodes,
                                boolean callTrimTextNodes, Appendable log)  throws IOException
    {
        // Every step below is run regardless; only the reporting depends on having a log.
        final boolean verbose = (log != null);

        int removed = TagNodeRemove.all(page, TC.Both, "meta");
        if (verbose) log.append("Removed " + removed + " meta tags.\n");

        removed = TagNodeRemove.all(page, TC.Both, "link");
        if (verbose) log.append("Removed " + removed + " link tags.\n");

        removed = Util.Remove.scriptNodeBlocks(page);
        if (verbose) log.append("Removed " + removed + " Script Node Blocks.\n");

        removed = Util.Remove.styleNodeBlocks(page);
        if (verbose) log.append("Removed " + removed + " Script Style Blocks.\n");

        removed = Util.Remove.allCommentNodes(page);
        if (verbose) log.append("Removed " + removed + " Comment Nodes.\n");

        removed = TagNodeRemoveInclusive.all(page, "head", "noscript", "header");
        if (verbose) log.append("Removed " + removed + " <HEAD> nodes.\n");

        // <DIV> elements whose "class" matches one of the caller's strings are dropped,
        // along with everything inside of them.
        if ((emptyDIVs != null) && (emptyDIVs.length != 0))
        {
            removed = InnerTagRemoveInclusive.all
                (page, "div", "class", TextComparitor.C, emptyDIVs);
            if (verbose) log.append("Removed " + removed + " HTML <DIV> Elements.\n");
        }

        // Empty <P> and <DIV> pairs are removed, recursively
        removed = Util.Remove.inclusiveEmpty(page, "p", "div");
        if (verbose) log.append("Removed [" + removed + "] Empty <DIV> and <P> elements.\n");

        // Only the opening and closing tags themselves are removed here;
        // the content between them is kept.
        removed = TagNodeRemove.all
            (page, TC.Both, "div", "a", "html", "body", "li", "ul", "span");
        if (verbose)
            log.append("Removed " + removed + " HTML Elements: div, a, html, body, li, ul, span.\n");

        // TextNodes containing any of the caller's strings are dropped
        if ((textNodes != null) && (textNodes.length != 0))
        {
            removed = TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes);
            if (verbose) log.append("Removed " + removed + " TextNodes.\n");
        }

        // With so many nodes gone, adjacent TextNodes are merged into single TextNodes
        removed = Util.compactTextNodes(page);
        if (verbose) log.append("Removed " + removed + " Nodes by compacting TextNodes.\n");

        // Long strings of spaces will be removed.
        // UNFORTUNATELY, New Lines will also disappear.
        if (callTrimTextNodes)
        {
            removed = Util.trimTextNodes(page, true);
            if (verbose) log.append("Removed " + removed + " Trimming Text Nodes.\n");
        }

        // Strip the class, id and alt attributes
        removed = Attributes.remove(page, "class", "id", "alt").length;
        if (verbose) log.append("Removed Attributes From " + removed + " Nodes.\n");

        // Put a new-line ('\n' - not <BR />!) after each of the listed closing tags.
        // Going from the last position to the first keeps the remaining positions valid.
        final int[] closers = TagNodeFind.all
            (page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");

        int k = closers.length;
        while (--k >= 0) page.add(closers[k] + 1, NEW_LINE);

        return Util.pageToString(page);
    }
}