Source code

001package Torello.HTML.Tools.Images;
002
003import Torello.Java.*;
004import Torello.HTML.*;
005import Torello.HTML.NodeSearch.*;
006
007import static Torello.Java.C.*;
008
009import java.util.*;
010import java.util.function.*;
011import java.net.*;
012import java.io.*;
013
014/**
015 * An <B>experimental class</B> that can be used (with, albeit, way too much effort) to download
016 * those photo-montages that are on major news-network web-sites.
017 * 
018 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PBS>
019 */
020public class PhotoBombSite
021{
022    private PhotoBombSite() { }  // Private constructor: this class only exposes static members.
023
024    // Quite a number of the sites visited use "typographic" (curly) quotation-mark and
025    // apostrophe characters.  These two parallel arrays are handed to StrReplace.r(...) inside
026    // PRIMARY - presumably matchChars[i] is replaced by replaceStrs[i] (plain ASCII marks).
027
028    private static final char[]      matchChars  = { '“', '”', '’' };
029    private static final String[]    replaceStrs = { "\"", "\"", "'" };
030
031    // A TextNode holding one '\n' character.  NOTE: This is not the same as an HTML <BR /> Element
032    private static final TextNode NEW_LINE = new TextNode("\n");
033
034    // This is a newline-BR-newline sequence, parsed once and inserted after each <IMG> by PRIMARY
035    private static final Vector<HTMLNode> BR_NEWLINE = HTMLPage.getPageTokens("\n<BR />\n", false);
036
037    // A TextNode holding a single space character (padding around <B>, <I> and <EM> tags)
038    private static final TextNode SPACE = new TextNode(" ");
039
040    /**
041     * This is the HTML header that {@code PRIMARY} inserts once, at the top of the generated page.
042     * It may be modified, but note that {@code PRIMARY} substitutes the page title for each
043     * occurrence of the sub-string {@code TITLE_STR}, and the first visited {@code URL} for each
044     * occurrence of the sub-string {@code URL_STR}.  <I>If either place-holder is removed, the
045     * corresponding value is simply left out of the HTML.</I>  (No exception is thrown in that
046     * case, since {@code String.replace} leaves text without a match unchanged.)
047     */
048    public static String HEADER = "" +
049        "<HTML>\n<HEAD>\n<TITLE>TITLE_STR</TITLE>\n"                +
050        "<META charset='utf-8'>\n"                                  +
051        "<STYLE TYPE='text/css'>\n"                                 +
052        "H1, H2, H3, h4     { color:            red;         \n"    +
053        "                     margin: 1em 1em 1em 1em;      }\n"    +
054        "BODY               { margin:           2em;        }\n"    +
055        "P                  { margin: 1.5em 1em 1.5em 1em;   \n"    +
056        "                     max-width:        75%;        }\n"    +
057        "IMG                { margin: 1em;                   \n"    +
058        "                     max-height:       90%;         \n"    +
059        "                     max-width:        90%;        }\n"    +
060        "DIV.PhotoSection   { margin: 7em 1em 1em 1em;       \n"    +
061        "                     background:       lightgray;   \n"    +
062        "                     border-radius:    2em;         \n"    +
063        "                     padding:          1.5em;      }\n"    +
064        "</STYLE>\n</HEAD>\n<BODY>\n"                               +
065        "<H1>TITLE_STR</H1>\n"                                      +
066        "<H2>Scraped From:</H2>\n"                                  +
067        "<H3><A HREF='URL_STR' TARGET=_blank>\nURL_STR</A></H3>\n"  +
068        "<BR /><BR /><BR />\n\n";
069
070
071    /**
072     * <B><I><SPAN STYLE="color: red;">This one works much better</I></B></SPAN>.  This is because
073     * it accepts a "Getter" that ask the user to find the content on a page.  For all Photo Bomb
074     * (and for likely 99% of websites in general) - the relevant HTML section is wrapped in an
075     * HTML {@code <DIV>, <SECTION>, <ARTICLE>} or {@code <MAIN>} element open-close pair.  <I>If
076     * the version {@code get01(...)} or {@code get02(...)} were dismal failures, then this method
077     * is much more likely to produce better results.</I>
078     *
079     * <BR /><BR /><B>NOTE:</B> This does mean that for this method to work, the onus is on the
080     * user to provide a "Getter" <B><I>by inspecting the HTML (the "View Source" Button in your
081     * browser)</I></B> to retrieve the short HTML section that actually has the picture and the
082     * notes.
083     *
084     * <BR /><BR /><B>EXAMPLE NOTE:</B> The example below is one of thousands of short stories
085     * with little pictures attached that are served up by all the news networks and search
086     * engines.  This is one is a collection of photos about the wild west.  If one looks at the
087     * HTML, the programmer would (hopefully) notice that each photo-{@code URL} has it's photo
088     * wrapped in an HTML Divider ({@code '<DIV>'}) element as:
089     * {@code <SECTION ID="mvp-content-main">}.  Notice, in the example, the {@code 'getter'}
090     * that is created to retrieve the photos.
091     *
092     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSPRIME>
093     *
094     * @param iter An instance of {@code URLIterator} that iterates each page of the site.
095     *
096     * @param GETTER This method should retrieve the subsection of HTML <I>on each page</I>
097     * that contains the photo and caption.  It ought to be a one line statement that identifies
098     * how the photo is "wrapped" in HTML.  An "Inclusive" method on an HTML {@code '<DIV>',
099     * '<SECTION>...</SECTION>,' '<MAIN>...</MAIN>'} or {@code '<ARTICLE>...</ARTICLE>'} is 
100     * "99% likely" the right way to do this.
101     *
102     * @param CLEANER This ought to be a one line command that removes extraneous pieces of
103     * text.
104     *
105     * @param log This is a log parameter, and may be used to send log information to the 
106     * terminal.  This parameter may be null, and if it is, it shall be ignored.
107     *
108     * @param skipOnNotFoundException This can shunt the "Not Found Exceptions", and attempt
109     * to skip to the next image.  Some sites have a missing photo returned here and there.
110     *
111     * @return This returns the HTML as a {@code String}.
112     * 
113     * @throws HTMLNotFoundException If the provided {@code 'GETTER'} does not find an HTML
114     * section or element - <I>and returns null instead</I> - then rather than throwing a
115     * {@code NullPointerException}, this exception shall throw.  If this exception does throw,
116     * make sure to check and re-check the provided getter to make certain that the appropriate
117     * Node-Search classes and methods were used in order to properly retrieve <I>the section that 
118     * actually has the photo and the accompanying text.</I>
119     * 
120     * @throws NodeNotFoundException If the {@code 'GETTER'} provided does successfully retrieve
121     * a portion of the photo-page, but no HTML {@code <IMG SRC=...>} is found or identified, then
122     * this exception will throw.  Make sure that when writing the {@code 'GETTER'}, that the
123     * appropriate HTML Element ({@code <DIV ...>, <MAIN>, <SECTION>, <ARTICLE>}, etc...) that
124     * is selected actually wraps the photo on the page being downloaded.
125     */
126    public static String PRIMARY(
127        URLIterator iter, SectionGet GETTER, TextCleaner CLEANER, 
128        boolean skipOnNotFoundException, Appendable log
129    )
130        throws IOException
131    {
132        StringBuilder   sb      = new StringBuilder();
133        boolean         first   = true;
134        int             iterNum = 1;
135
136        while (iter.hasNext())
137        {
138            URL url =  iter.next();
139
140            // Visit the next URL produced by the URL Iterator:
141            log.append("Visiting: " + BYELLOW + url.toString() + RESET + '\n');
142            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
143
144            // Make sure to insert the HTML header into the "index.html" main page.
145            if (first)
146            {
147                // Do this only once.
148                first = false;
149
150                // Use the title of the page from the first URL returned by the iterator
151                // use the URL from the first URL returned by the iterator.
152                String titleStr = Util.textNodesString(Elements.getTitle(page));
153                sb.append(
154                    HEADER.replace("TITLE_STR", titleStr).replace("URL_STR", url.toString())
155                );
156            }
157
158            // Retrieve the relevant part of the page
159            Vector<HTMLNode> section = GETTER.apply(page);
160
161            // The getter didn't get any HTML.
162            if (section == null)
163            {
164                if (skipOnNotFoundException)
165                {
166                    log.append(
167                        BRED + "SectionGet did not return any HTML.  As per request, " +
168                        "Skipping...\n" + RESET
169                    );
170                    continue;
171                }
172                
173                throw new HTMLNotFoundException(
174                    "The lambda or method passed to parameter 'GETTER' did not retrieve any " +
175                    "image nor any text from the photo-page being scraped.  Be sure to check " +
176                    "that the specified HTML Elements (DIV, MAIN, SECTION, etc...) or whichever " +
177                    "element was specified is actually present on the photo-collection web-site."
178                );
179            }
180
181            // The HTML produced by the getter didn't have any photos.
182            if (TagNodeCount.all(section, TC.OpeningTags, "img") == 0)
183            {
184                if (skipOnNotFoundException)
185                {
186                    log.append(
187                        BRED + "HTML did not contain an <IMG>.  As per request, " +
188                        "Skipping...\n" + RESET
189                    );
190                    continue;
191                }
192
193                throw new NodeNotFoundException(
194                    "The lambda or method passed to parameter 'GETTER' did properly retrieve an " +
195                    "HTML Section as expected.  Unfortunately, there were no <IMG ...> elements " +
196                    "available in the section returned.  The purpose of this method is to " +
197                    "spider and crawl photo-collection sites, and retrieve the image of a list " +
198                    "of pages.  This page had no images; this is not allowed here."
199                );
200            }
201
202            // Any HTML Element with these attributes will have those attributes removed
203            // class, id, style, alt, itemtype, itemprop
204            int c = Attributes.remove
205                (section, "class", "id", "style", "title", "itemtype", "itemprop", "alt").length;
206            if (log != null) log.append(
207                BCYAN + "\tAttributes.remove(section, \"class\", \"id\", \"style\", \"title\", " +
208                "\"itemtype\", \"itemprop\", \"alt\")\n" + RESET +
209                "\t\tRemoved Attributes from [" + c + "] nodes.\n"
210            );
211
212            // Any HTML Element with a "data-..." attribute will have that attribute(s) removed
213            c = Attributes.removeData(section).length;
214            if (log != null) log.append(
215                BCYAN + "\tAttributes.removeData(section)\n" + RESET +
216                "\t\tRemoved Data-Attributes from [" + c + "] nodes.\n"
217            );
218
219            // Any <!-- --> found in the Photo/Text section retrieved by the getter are
220            // removed from the section.  Comments only add clutter - since they are almost
221            // always auto-generated.
222            c = Util.Remove.allCommentNodes(section);
223            if (log != null) log.append(
224                BCYAN + "\tUtil.Remove.allCommentNodes(section)\n" + RESET +
225                "\t\tRemoved [" + c + "] CommentNodes.\n"
226            );
227
228            // If there are any <SCRIPT> ... </SCRIPT> blocks contained in this Photo/Text section
229            // they shall be removed.  They are almost invariably links to other advertisements.
230            // NOTE: There are photo-sites that have contained the <IMG> and text-description inside
231            //       Java-Script blocks, but they are very, VERY rare in 99% of "Photo Bomb Sites."
232            //       If attempting to scrape a photo-story site where the description or photo are
233            //       wrapped in Java-Script or JSON, then this class WILL NOT WORK on that site.
234            c = Util.Remove.scriptNodeBlocks(section);
235            if (log != null) log.append(
236                BCYAN + "\tUtil.Remove.scriptNodeBlocks(section)\n" + RESET +
237                "\t\tRemoved [" + c + "] <SCRIPT> ... </SCRIPT> Blocks.\n"
238            );
239
240            // This class provides an extremely simple CSS Style for the photo and the description
241            // and is the primary reason for using this class.  If there are any CSS
242            // <STYLE> ... </STYLE> blocks, they are removed here, immediately.
243            c = Util.Remove.styleNodeBlocks(section);
244            if (log != null) log.append(
245                BCYAN + "\tUtil.Remove.styleNodeBlocks(section)\n" + RESET +
246                "\t\tRemoved [" + c + "] <STYLE> ... </STYLE> Blocks.\n"
247            );
248
249            // Removes <DIV>...</DIV> where "..." may only be white-space.
250            // (Empty <DIV>, <SPAN>, <P>, <I>...).
251            // NOTE: The concept of "Inclusive Empty" means that the only content between the
252            //       opening <DIV> and closing </DIV> is either white-space or NOTHING.  This
253            //       process of removing empty <DIV>...</DIV> pairs (and <SPAN>...</SPAN> pairs,
254            //       along with the complete list of HTML Elements provided in the list) is 
255            //       applied RECURSIVELY.  This means that if the removing of an empty <I>...</I>
256            //       pair creates another empty Element Pair, that pair is removed next.
257            c = Util.Remove.inclusiveEmpty
258                (section, "div", "picture", "span", "p", "b", "i", "em");
259            if (log != null) log.append(
260                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"picture\", " +
261                "\"span\", \"p\", \"b\", \"i\", \"em\")\n" + RESET +
262                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
263            );
264
265            // Now removes all instances of <DIV>, </DIV>, <A>, </A>,
266            // <CENTER>, </CENTER>, <SECTION>, </SECTION>.
267            // Removing these is usually great.  The only HTML Elements that are really needed are
268            // the Paragraph <P> Elements, and the <IMG SRC=...> Elements themselves.  Everything
269            // else is always extraneous "HTML Bloat" and "Clutter."
270            
271            // NOTE: This process is not infallible, but it has worked on dozens and dozens of the
272            //       "Extraneous Photo Collections" that repeatedly pop-up on major news sites at
273            //       random times in their news feeds.
274
275            c = TagNodeRemove.all
276                (section, TC.Both, "div", "a", "center", "section", "picture", "source");
277            if (log != null) log.append(
278                BCYAN + "\tTagNodeRemove.all(section, TC.Both, \"div\", \"a\", \"center\", " +
279                "\"section\", \"picture\", \"source\")\n" + RESET +
280                "\t\tRemoved [" + c + "] HTML <DIV>, </DIV>, <A>, </A> Elements.\n"
281            );
282
283            // Applies the user-provided text-node cleaner
284            // This may remove all kinds of miscellaneous text-nodes.  Sometimes a little button
285            // that says "Next" or "Next Photo" remains on the page.  The best way to create a 
286            // TextCleaner instance is to run this class, and see if there is a common piece of
287            // text that has been repeatedly inserted into the descriptions... and remove it!
288            c = CLEANER.applyAsInt(section);
289            if (log != null) log.append(
290                BCYAN + "\tCLEANER.applyAsInt(section)\n" + RESET +
291                "\t\tRemoved [" + c + "] Text-Node's.\n"
292            );
293
294            // Compacts Adjoining textNodes.  Often, after removing all of the HTML TagNode 
295            // elements from the Vector - there are consecutive TextNode's left next to each other
296            // in the Vector.  This Util method will just remove any two adjacent TextNode's, and
297            // copy the Strings out of both them, and then unite them into a single TextNode.
298            // Nothing more, nothing less.
299            c = Util.compactTextNodes(section);
300            if (log != null) log.append(
301                BCYAN + "\tUtil.compactTextNodes(section)\n" + RESET +
302                "\t\tCompacted [" + c + "] Text-Node's.\n"
303            );
304
305            // Trims the text inside of TextNode's, removes them if they were only white-space
306            // Often after stripping out many many nodes (in the previous steps), there are huge
307            // patches of white-space.  This Util method simply calls the Java String method
308            // String.trim() on each TextNode, and then removes that TextNode, and replaces it
309            // with a trimmed version of the text.
310            // NOTE: This will have no affect on text that is surrounded by HTML Paragraph (<P>
311            //       ... </P>) elements.  Only TextNode's themselves are trimmed.  There is no
312            //       need to worry about text "running together" as long as it is separated by
313            //       <P> elements - which it always is in just about any photo-content website.
314            c = Util.trimTextNodes(section, true);
315            if (log != null) log.append(
316                BCYAN + "\tUtil.trimTextNodes(section)\n" + RESET +
317                "\t\tTrimmed [" + c + "] Text-Node's.\n"
318            );
319
320            // Performs another round of empty element checks.
321            c = Util.Remove.inclusiveEmpty(section, "div", "span", "p", "b", "i", "em");
322            if (log != null) log.append(
323                BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"span\", \"p\", \"b\", " +
324                "\"i\", \"em\")\n" + RESET +
325                "\t\tRemoved [" + c + "] Empty Tag Blocks.\n"
326            );
327
328            // inserts a new-line character before each <IMG>, <P>, and </P> element.
329            // Makes the final HTML generated more readable.
330            int[] posArr = TagNodeFind.all(section, TC.Both, "img", "p");
331            for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], NEW_LINE);
332
333            // inserts a \n<BR />\n (three nodes, the <BR />, and two new-lines '\n') after
334            // each <IMG>.
335            // This makes both the HTML more readable, and the page itself more readable
336            posArr = TagNodeFind.all(section, TC.OpeningTags, "img");
337            for (int i=(posArr.length-1); i >= 0; i--) section.addAll(posArr[i] + 1, BR_NEWLINE);
338
339            // inserts a ' ' (space character) before and after each newline
340            posArr = TagNodeFind.all(section, TC.Both, "b", "i", "em");
341            {
342                for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i] + 1, SPACE);
343                for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], SPACE);
344            }
345
346            // Resolve any partial URL's
347            Links.resolveAllSRC(section, url, null, false);
348    
349            // NOTE: There is an annoying "special apostrophe" on a lot of them.
350            sb.append(  "<DIV CLASS='PhotoSection'>\n" +
351                        StrReplace.r(Util.pageToString(section), matchChars, replaceStrs) +
352                        "\n</DIV>\n" +
353                        "\n\n\n<!-- Photo Section Break Page " + 
354                        StringParse.zeroPad(iterNum++) + "-->\n\n\n"
355            );
356        }
357
358        return sb.toString() + "\n\n</BODY>\n</HTML>\n";
359    }
360
361    /**
362     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
363     *
364     * This was the first version of photo-scraping.  There were more later - this is why
365     * {@code '01'} is appended to this method.
366     *
367     * @param iter This iterator shall return all of the pages in the site.  Usually, it is just a
368     * base {@code URL} followed by an integer - as in "page 1" " page 2" ... etc...
369     *
370     * @param emptyDIVs These are HTML divider elements who "class" equals the strings in this list.
371     * HTML divider elements that contain these {@code String's} inside their {@code 'class'}
372     * attribute  shall be removed (inclusively).  This is a string-array, and it may be null - and
373     * if it is, it will be ignored - but it may not contain null-values, or an exception will
374     * throw.
375     *
376     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
377     *
378     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
379     *
380     * @param log Textual information shall be sent to the user/terminal using this log.  
381     * <I><SPAN STYLE="color: red;">This parameter may <B>not</B> be null here.</SPAN></I>
382     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
383     *
384     * @return A {@code Vector<String>}.  The HTML will be in {@code String} format, not
385     * {@code HTMLNode} format.
386     *
387     * @see TagNodeRemove
388     * @see Util
389     * @see TagNodeRemoveInclusive
390     * @see TextNodeRemove
391     */
392    @Deprecated
393    public static Vector<String> get01(
394            Iterator<URL> iter, String[] emptyDIVs, String[] textNodes,
395            boolean callTrimTextNodes, Appendable log
396        )
397        throws IOException
398    {
399        Vector<String> ret = new Vector<>();
400
401        while (iter.hasNext())
402        {
403            URL url = iter.next();
404            log.append("Visiting URL: " + url.toString() + '\n');
405            Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
406            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "meta") + " meta tags.\n");
407            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "link") + " link tags.\n");
408            log.append("Removed " + Util.Remove.scriptNodeBlocks(page) + " Script Node Blocks.\n");
409            log.append("Removed " + Util.Remove.styleNodeBlocks(page) + " Script Style Blocks.\n");
410            log.append("Removed " + Util.Remove.allCommentNodes(page) + " Comment Nodes.\n");
411            log.append("Removed " + TagNodeRemoveInclusive.all(page, "head", "noscript", "header") + " <HEAD>, <HEADER>, <NOSCRIPT> nodes.\n");
412
413            // Removes all HTML <DIV> Elements where the "class" is in the String argument list
414            if ((emptyDIVs != null) && (emptyDIVs.length > 0))
415                log.append(
416                    "Removed " + InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs) +
417                    " HTML <DIV> Elements.\n"
418                );
419
420            // Removes HTML <DIV> or <P> elements that are empty, recursively
421            log.append("Removed [" + Util.Remove.inclusiveEmpty(page, "p", "div") + "] Empty <DIV> and <P> elements.\n");
422
423            // Removes all opening and closing elements of the following:
424            // Does not remove the content between these elements
425            log.append("Removed " + TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span") +
426                                    " HTML Elements: div, a, html, body, li, ul, span.\n");
427
428            // Removes TextNodes that contain the elements in the String argument list
429            if ((textNodes != null) && (textNodes.length > 0))
430                log.append("Removed " + TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes) + " TextNodes.\n");
431
432            // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single
433            // TextNode element.
434            log.append("Removed " + Util.compactTextNodes(page) + " Nodes by compacting TextNodes.\n");
435
436            // Long strings of spaces will be removed.
437            // UNFORTUNATELY, New Lines will also disappear.
438            if (callTrimTextNodes)
439                log.append("Removed " + Util.trimTextNodes(page, true) + " Trimming Text Nodes.\n");
440
441            // Remove id, class, and other attributes.
442            log.append("Removed Attributes From " + Attributes.remove(page, "class", "id", "alt").length + " Nodes.\n");
443
444            // Add some new-lines('\n' - not <BR />!)
445            int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");
446            for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1);
447
448            // Save this page' image to the return vector.
449            ret.addElement(Util.pageToString(page));
450        }
451        // Pass the Return Vector.  Each element of this Vector<String> will contain a picture and paragraph
452        // about that picture.  The images will not have been downloaded, nor any partially resolved URL's
453        // resolved.
454        return ret;
455    }
456
457    /**
458     * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY>
459     *
460     * The code here is carbon copied from the above loop.  It is just the central loop body, that
461     * does not iterate over many pages, but rather just one.
462     * 
463     * <BR /><BR /><B><SPAN STYLE="color: red;">CLONE NOTICE:</B></SPAN> This method modifies the
464     * underlying {@code Vector}.  If you wish to avoid that, please call this method with using
465     * the following parameter: {@code (Vector<HTMLNode>) yourOriginalPage.clone()}.  Make sure to
466     * use the {@code SuppressWarnings("unchecked")} annotation.
467     * 
468     * @param page Any HTML page that has extraneous advertising and java-script junk.
469     * 
470     * @param emptyDIVs These are HTML divider elements who "class" equals the strings in this
471     * list.  HTML divider elements that contain these strings inside their 'class' field shall be
472     * removed (inclusively).  This is a string-array, and it may be null - and if it is, it will
473     * be ignored - but it may not contain null-values, or an exception will throw.
474     * 
475     * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN>
476     * 
477     * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN>
478     * 
479     * @param log This is a log, and <I><B>it may be null.</I></B>  If it is null, it will be
480     * ignored. 
481     * 
482     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
483     *
484     * @return a Stripped down version of the page, with most extraneous photo-bomb site junk
485     * removed.
486     *
487     * @throws IOException This method throws {@code IOException} simply because it prints to the
488     * {@code interface java.lang.Appendable}, which requires that {@code IOException} be
489     * monitored / checked in code that uses this interface.
490     */
491    @Deprecated
492    public static String get02( Vector<HTMLNode> page, String[] emptyDIVs, String[] textNodes,
493                                boolean callTrimTextNodes, Appendable log)  throws IOException
494    {
495            int c = TagNodeRemove.all(page, TC.Both, "meta");
496            if (log != null) log.append("Removed " + c + " meta tags.\n");
497
498            c = TagNodeRemove.all(page, TC.Both, "link");
499            if (log != null) log.append("Removed " + c + " link tags.\n");
500
501            c = Util.Remove.scriptNodeBlocks(page);
502            if (log != null) log.append("Removed " + c + " Script Node Blocks.\n");
503
504            c = Util.Remove.styleNodeBlocks(page);
505            if (log != null) log.append("Removed " + c + " Script Style Blocks.\n");
506
507            c = Util.Remove.allCommentNodes(page);
508            if (log != null) log.append("Removed " + c + " Comment Nodes.\n");
509
510            c = TagNodeRemoveInclusive.all(page, "head", "noscript", "header");
511            if (log != null) log.append("Removed " + c + " <HEAD> nodes.\n");
512
513            // Removes all HTML <DIV> Elements where the "class" is in the String argument list
514            if ((emptyDIVs != null) && (emptyDIVs.length > 0))
515            {   
516                c = InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs);
517                if (log != null) log.append("Removed " + c + " HTML <DIV> Elements.\n");
518            }
519
520            // Removes HTML <DIV> or <P> elements that are empty, recursively
521            c = Util.Remove.inclusiveEmpty(page, "p", "div");
522            if (log != null) log.append("Removed [" + c + "] Empty <DIV> and <P> elements.\n");
523
524            // Removes all opening and closing elements of the following:
525            // Does not remove the content between these elements
526            c = TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span");
527            if (log != null) log.append("Removed " + c + " HTML Elements: div, a, html, body, li, ul, span.\n");
528
529            // Removes TextNodes that contain the elements in the String argument list
530            if ((textNodes != null) && (textNodes.length > 0))
531            {
532                c = TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes);
533                if (log != null) log.append("Removed " + c + " TextNodes.\n");
534            }
535
536            // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single
537            // TextNode element.
538            c = Util.compactTextNodes(page);
539            if (log != null) log.append("Removed " + c + " Nodes by compacting TextNodes.\n");
540
541            // Long strings of spaces will be removed.
542            // UNFORTUNATELY, New Lines will also disappear.
543            if (callTrimTextNodes)
544            {
545                c = Util.trimTextNodes(page, true);
546                if (log != null) log.append("Removed " + c + " Trimming Text Nodes.\n");
547            }
548
549            // Remove id, class, and other attributes.
550            c = Attributes.remove(page, "class", "id", "alt").length;
551            if (log != null) log.append("Removed Attributes From " + c + " Nodes.\n");
552
553            // Add some new-lines('\n' - not <BR />!)
554            int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5");
555            for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1);
556
557            return Util.pageToString(page);
558    }
559}