001package Torello.HTML.Tools.Images; 002 003import Torello.Java.*; 004import Torello.HTML.*; 005import Torello.HTML.NodeSearch.*; 006 007import static Torello.Java.C.*; 008 009import java.util.*; 010import java.util.function.*; 011import java.net.*; 012import java.io.*; 013 014/** 015 * An <B>experimental class</B> that can be used (with, albeit, way too much effort) to download 016 * those photo-montages that are on major news-network web-sites. 017 * 018 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PBS> 019 */ 020public class PhotoBombSite 021{ 022 private PhotoBombSite() { } 023 024 // Quite a number of the sites visited start using really annoying apostrophe 025 // and quote characters. This simply replaces these UNICODE characters with regular 026 // quotation and apostrophe marks. 027 028 private static final char[] matchChars = { '“', '”', '’' }; 029 private static final String[] replaceStrs = { "\"", "\"", "'" }; 030 031 // NOTE: This is not the same as an HTML <BR /> Element 032 private static final TextNode NEW_LINE = new TextNode("\n"); 033 034 // This a newline-BR-newline sequence 035 private static final Vector<HTMLNode> BR_NEWLINE = HTMLPage.getPageTokens("\n<BR />\n", false); 036 037 // A space character 038 private static final TextNode SPACE = new TextNode(" "); 039 040 /** 041 * This is the HTML header that is inserted into the page. It may be modified, but if it 042 * is, note that the sub-string {@code URL_STR} should be there if the original page 043 * {@code URL} is to be included in the HTML. The internal-logic replaces this substring 044 * by the actual {@code URL}, and <I>the replacement-code would fail if the text 045 * {@code URL_STR} were removed.</I> (Though, the code would not actually throw an 046 * exception either). 047 */ 048 public static String HEADER = "" + 049 "<HTML>\n<HEAD>\n<TITLE>TITLE_STR</TITLE>\n" + 050 "<META charset='utf-8'>\n" + 051 "<STYLE TYPE='text/css'>\n" + 052 "H1, H2, H3, h4 { color: red; \n" + 053 " margin: 1em 1em 1em 1em; }\n" + 054 "BODY { margin: 2em; }\n" + 055 "P { margin: 1.5em 1em 1.5em 1em; \n" + 056 " max-width: 75%; }\n" + 057 "IMG { margin: 1em; \n" + 058 " max-height: 90%; \n" + 059 " max-width: 90%; }\n" + 060 "DIV.PhotoSection { margin: 7em 1em 1em 1em; \n" + 061 " background: lightgray; \n" + 062 " border-radius: 2em; \n" + 063 " padding: 1.5em; }\n" + 064 "</STYLE>\n</HEAD>\n<BODY>\n" + 065 "<H1>TITLE_STR</H1>\n" + 066 "<H2>Scraped From:</H2>\n" + 067 "<H3><A HREF='URL_STR' TARGET=_blank>\nURL_STR</A></H3>\n" + 068 "<BR /><BR /><BR />\n\n"; 069 070 071 /** 072 * <B><I><SPAN STYLE="color: red;">This one works much better</I></B></SPAN>. This is because 073 * it accepts a "Getter" that ask the user to find the content on a page. For all Photo Bomb 074 * (and for likely 99% of websites in general) - the relevant HTML section is wrapped in an 075 * HTML {@code <DIV>, <SECTION>, <ARTICLE>} or {@code <MAIN>} element open-close pair. <I>If 076 * the version {@code get01(...)} or {@code get02(...)} were dismal failures, then this method 077 * is much more likely to produce better results.</I> 078 * 079 * <BR /><BR /><B>NOTE:</B> This does mean that for this method to work, the onus is on the 080 * user to provide a "Getter" <B><I>by inspecting the HTML (the "View Source" Button in your 081 * browser)</I></B> to retrieve the short HTML section that actually has the picture and the 082 * notes. 083 * 084 * <BR /><BR /><B>EXAMPLE NOTE:</B> The example below is one of thousands of short stories 085 * with little pictures attached that are served up by all the news networks and search 086 * engines. This is one is a collection of photos about the wild west. If one looks at the 087 * HTML, the programmer would (hopefully) notice that each photo-{@code URL} has it's photo 088 * wrapped in an HTML Divider ({@code '<DIV>'}) element as: 089 * {@code <SECTION ID="mvp-content-main">}. Notice, in the example, the {@code 'getter'} 090 * that is created to retrieve the photos. 091 * 092 * <EMBED CLASS='external-html' DATA-FILE-ID=PBSPRIME> 093 * 094 * @param iter An instance of {@code URLIterator} that iterates each page of the site. 095 * 096 * @param GETTER This method should retrieve the subsection of HTML <I>on each page</I> 097 * that contains the photo and caption. It ought to be a one line statement that identifies 098 * how the photo is "wrapped" in HTML. An "Inclusive" method on an HTML {@code '<DIV>', 099 * '<SECTION>...</SECTION>,' '<MAIN>...</MAIN>'} or {@code '<ARTICLE>...</ARTICLE>'} is 100 * "99% likely" the right way to do this. 101 * 102 * @param CLEANER This ought to be a one line command that removes extraneous pieces of 103 * text. 104 * 105 * @param log This is a log parameter, and may be used to send log information to the 106 * terminal. This parameter may be null, and if it is, it shall be ignored. 107 * 108 * @param skipOnNotFoundException This can shunt the "Not Found Exceptions", and attempt 109 * to skip to the next image. Some sites have a missing photo returned here and there. 110 * 111 * @return This returns the HTML as a {@code String}. 112 * 113 * @throws HTMLNotFoundException If the provided {@code 'GETTER'} does not find an HTML 114 * section or element - <I>and returns null instead</I> - then rather than throwing a 115 * {@code NullPointerException}, this exception shall throw. If this exception does throw, 116 * make sure to check and re-check the provided getter to make certain that the appropriate 117 * Node-Search classes and methods were used in order to properly retrieve <I>the section that 118 * actually has the photo and the accompanying text.</I> 119 * 120 * @throws NodeNotFoundException If the {@code 'GETTER'} provided does successfully retrieve 121 * a portion of the photo-page, but no HTML {@code <IMG SRC=...>} is found or identified, then 122 * this exception will throw. Make sure that when writing the {@code 'GETTER'}, that the 123 * appropriate HTML Element ({@code <DIV ...>, <MAIN>, <SECTION>, <ARTICLE>}, etc...) that 124 * is selected actually wraps the photo on the page being downloaded. 125 */ 126 public static String PRIMARY( 127 URLIterator iter, SectionGet GETTER, TextCleaner CLEANER, 128 boolean skipOnNotFoundException, Appendable log 129 ) 130 throws IOException 131 { 132 StringBuilder sb = new StringBuilder(); 133 boolean first = true; 134 int iterNum = 1; 135 136 while (iter.hasNext()) 137 { 138 URL url = iter.next(); 139 140 // Visit the next URL produced by the URL Iterator: 141 log.append("Visiting: " + BYELLOW + url.toString() + RESET + '\n'); 142 Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false); 143 144 // Make sure to insert the HTML header into the "index.html" main page. 145 if (first) 146 { 147 // Do this only once. 148 first = false; 149 150 // Use the title of the page from the first URL returned by the iterator 151 // use the URL from the first URL returned by the iterator. 152 String titleStr = Util.textNodesString(Elements.getTitle(page)); 153 sb.append( 154 HEADER.replace("TITLE_STR", titleStr).replace("URL_STR", url.toString()) 155 ); 156 } 157 158 // Retrieve the relevant part of the page 159 Vector<HTMLNode> section = GETTER.apply(page); 160 161 // The getter didn't get any HTML. 162 if (section == null) 163 { 164 if (skipOnNotFoundException) 165 { 166 log.append( 167 BRED + "SectionGet did not return any HTML. As per request, " + 168 "Skipping...\n" + RESET 169 ); 170 continue; 171 } 172 173 throw new HTMLNotFoundException( 174 "The lambda or method passed to parameter 'GETTER' did not retrieve any " + 175 "image nor any text from the photo-page being scraped. Be sure to check " + 176 "that the specified HTML Elements (DIV, MAIN, SECTION, etc...) or whichever " + 177 "element was specified is actually present on the photo-collection web-site." 178 ); 179 } 180 181 // The HTML produced by the getter didn't have any photos. 182 if (TagNodeCount.all(section, TC.OpeningTags, "img") == 0) 183 { 184 if (skipOnNotFoundException) 185 { 186 log.append( 187 BRED + "HTML did not contain an <IMG>. As per request, " + 188 "Skipping...\n" + RESET 189 ); 190 continue; 191 } 192 193 throw new NodeNotFoundException( 194 "The lambda or method passed to parameter 'GETTER' did properly retrieve an " + 195 "HTML Section as expected. Unfortunately, there were no <IMG ...> elements " + 196 "available in the section returned. The purpose of this method is to " + 197 "spider and crawl photo-collection sites, and retrieve the image of a list " + 198 "of pages. This page had no images; this is not allowed here." 199 ); 200 } 201 202 // Any HTML Element with these attributes will have those attributes removed 203 // class, id, style, alt, itemtype, itemprop 204 int c = Attributes.remove 205 (section, "class", "id", "style", "title", "itemtype", "itemprop", "alt").length; 206 if (log != null) log.append( 207 BCYAN + "\tAttributes.remove(section, \"class\", \"id\", \"style\", \"title\", " + 208 "\"itemtype\", \"itemprop\", \"alt\")\n" + RESET + 209 "\t\tRemoved Attributes from [" + c + "] nodes.\n" 210 ); 211 212 // Any HTML Element with a "data-..." attribute will have that attribute(s) removed 213 c = Attributes.removeData(section).length; 214 if (log != null) log.append( 215 BCYAN + "\tAttributes.removeData(section)\n" + RESET + 216 "\t\tRemoved Data-Attributes from [" + c + "] nodes.\n" 217 ); 218 219 // Any <!-- --> found in the Photo/Text section retrieved by the getter are 220 // removed from the section. Comments only add clutter - since they are almost 221 // always auto-generated. 222 c = Util.Remove.allCommentNodes(section); 223 if (log != null) log.append( 224 BCYAN + "\tUtil.Remove.allCommentNodes(section)\n" + RESET + 225 "\t\tRemoved [" + c + "] CommentNodes.\n" 226 ); 227 228 // If there are any <SCRIPT> ... </SCRIPT> blocks contained in this Photo/Text section 229 // they shall be removed. They are almost invariably links to other advertisements. 230 // NOTE: There are photo-sites that have contained the <IMG> and text-description inside 231 // Java-Script blocks, but they are very, VERY rare in 99% of "Photo Bomb Sites." 232 // If attempting to scrape a photo-story site where the description or photo are 233 // wrapped in Java-Script or JSON, then this class WILL NOT WORK on that site. 234 c = Util.Remove.scriptNodeBlocks(section); 235 if (log != null) log.append( 236 BCYAN + "\tUtil.Remove.scriptNodeBlocks(section)\n" + RESET + 237 "\t\tRemoved [" + c + "] <SCRIPT> ... </SCRIPT> Blocks.\n" 238 ); 239 240 // This class provides an extremely simple CSS Style for the photo and the description 241 // and is the primary reason for using this class. If there are any CSS 242 // <STYLE> ... </STYLE> blocks, they are removed here, immediately. 243 c = Util.Remove.styleNodeBlocks(section); 244 if (log != null) log.append( 245 BCYAN + "\tUtil.Remove.styleNodeBlocks(section)\n" + RESET + 246 "\t\tRemoved [" + c + "] <STYLE> ... </STYLE> Blocks.\n" 247 ); 248 249 // Removes <DIV>...</DIV> where "..." may only be white-space. 250 // (Empty <DIV>, <SPAN>, <P>, <I>...). 251 // NOTE: The concept of "Inclusive Empty" means that the only content between the 252 // opening <DIV> and closing </DIV> is either white-space or NOTHING. This 253 // process of removing empty <DIV>...</DIV> pairs (and <SPAN>...</SPAN> pairs, 254 // along with the complete list of HTML Elements provided in the list) is 255 // applied RECURSIVELY. This means that if the removing of an empty <I>...</I> 256 // pair creates another empty Element Pair, that pair is removed next. 257 c = Util.Remove.inclusiveEmpty 258 (section, "div", "picture", "span", "p", "b", "i", "em"); 259 if (log != null) log.append( 260 BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"picture\", " + 261 "\"span\", \"p\", \"b\", \"i\", \"em\")\n" + RESET + 262 "\t\tRemoved [" + c + "] Empty Tag Blocks.\n" 263 ); 264 265 // Now removes all instances of <DIV>, </DIV>, <A>, </A>, 266 // <CENTER>, </CENTER>, <SECTION>, </SECTION>. 267 // Removing these is usually great. The only HTML Elements that are really needed are 268 // the Paragraph <P> Elements, and the <IMG SRC=...> Elements themselves. Everything 269 // else is always extraneous "HTML Bloat" and "Clutter." 270 271 // NOTE: This process is not infallible, but it has worked on dozens and dozens of the 272 // "Extraneous Photo Collections" that repeatedly pop-up on major news sites at 273 // random times in their news feeds. 274 275 c = TagNodeRemove.all 276 (section, TC.Both, "div", "a", "center", "section", "picture", "source"); 277 if (log != null) log.append( 278 BCYAN + "\tTagNodeRemove.all(section, TC.Both, \"div\", \"a\", \"center\", " + 279 "\"section\", \"picture\", \"source\")\n" + RESET + 280 "\t\tRemoved [" + c + "] HTML <DIV>, </DIV>, <A>, </A> Elements.\n" 281 ); 282 283 // Applies the user-provided text-node cleaner 284 // This may remove all kinds of miscellaneous text-nodes. Sometimes a little button 285 // that says "Next" or "Next Photo" remains on the page. The best way to create a 286 // TextCleaner instance is to run this class, and see if there is a common piece of 287 // text that has been repeatedly inserted into the descriptions... and remove it! 288 c = CLEANER.applyAsInt(section); 289 if (log != null) log.append( 290 BCYAN + "\tCLEANER.applyAsInt(section)\n" + RESET + 291 "\t\tRemoved [" + c + "] Text-Node's.\n" 292 ); 293 294 // Compacts Adjoining textNodes. Often, after removing all of the HTML TagNode 295 // elements from the Vector - there are consecutive TextNode's left next to each other 296 // in the Vector. This Util method will just remove any two adjacent TextNode's, and 297 // copy the Strings out of both them, and then unite them into a single TextNode. 298 // Nothing more, nothing less. 299 c = Util.compactTextNodes(section); 300 if (log != null) log.append( 301 BCYAN + "\tUtil.compactTextNodes(section)\n" + RESET + 302 "\t\tCompacted [" + c + "] Text-Node's.\n" 303 ); 304 305 // Trims the text inside of TextNode's, removes them if they were only white-space 306 // Often after stripping out many many nodes (in the previous steps), there are huge 307 // patches of white-space. This Util method simply calls the Java String method 308 // String.trim() on each TextNode, and then removes that TextNode, and replaces it 309 // with a trimmed version of the text. 310 // NOTE: This will have no affect on text that is surrounded by HTML Paragraph (<P> 311 // ... </P>) elements. Only TextNode's themselves are trimmed. There is no 312 // need to worry about text "running together" as long as it is separated by 313 // <P> elements - which it always is in just about any photo-content website. 314 c = Util.trimTextNodes(section, true); 315 if (log != null) log.append( 316 BCYAN + "\tUtil.trimTextNodes(section)\n" + RESET + 317 "\t\tTrimmed [" + c + "] Text-Node's.\n" 318 ); 319 320 // Performs another round of empty element checks. 321 c = Util.Remove.inclusiveEmpty(section, "div", "span", "p", "b", "i", "em"); 322 if (log != null) log.append( 323 BCYAN + "\tUtil.Remove.inclusiveEmpty(section, \"div\", \"span\", \"p\", \"b\", " + 324 "\"i\", \"em\")\n" + RESET + 325 "\t\tRemoved [" + c + "] Empty Tag Blocks.\n" 326 ); 327 328 // inserts a new-line character before each <IMG>, <P>, and </P> element. 329 // Makes the final HTML generated more readable. 330 int[] posArr = TagNodeFind.all(section, TC.Both, "img", "p"); 331 for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], NEW_LINE); 332 333 // inserts a \n<BR />\n (three nodes, the <BR />, and two new-lines '\n') after 334 // each <IMG>. 335 // This makes both the HTML more readable, and the page itself more readable 336 posArr = TagNodeFind.all(section, TC.OpeningTags, "img"); 337 for (int i=(posArr.length-1); i >= 0; i--) section.addAll(posArr[i] + 1, BR_NEWLINE); 338 339 // inserts a ' ' (space character) before and after each newline 340 posArr = TagNodeFind.all(section, TC.Both, "b", "i", "em"); 341 { 342 for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i] + 1, SPACE); 343 for (int i=(posArr.length-1); i >= 0; i--) section.add(posArr[i], SPACE); 344 } 345 346 // Resolve any partial URL's 347 Links.resolveAllSRC(section, url, null, false); 348 349 // NOTE: There is an annoying "special apostrophe" on a lot of them. 350 sb.append( "<DIV CLASS='PhotoSection'>\n" + 351 StrReplace.r(Util.pageToString(section), matchChars, replaceStrs) + 352 "\n</DIV>\n" + 353 "\n\n\n<!-- Photo Section Break Page " + 354 StringParse.zeroPad(iterNum++) + "-->\n\n\n" 355 ); 356 } 357 358 return sb.toString() + "\n\n</BODY>\n</HTML>\n"; 359 } 360 361 /** 362 * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY> 363 * 364 * This was the first version of photo-scraping. There were more later - this is why 365 * {@code '01'} is appended to this method. 366 * 367 * @param iter This iterator shall return all of the pages in the site. Usually, it is just a 368 * base {@code URL} followed by an integer - as in "page 1" " page 2" ... etc... 369 * 370 * @param emptyDIVs These are HTML divider elements who "class" equals the strings in this list. 371 * HTML divider elements that contain these {@code String's} inside their {@code 'class'} 372 * attribute shall be removed (inclusively). This is a string-array, and it may be null - and 373 * if it is, it will be ignored - but it may not contain null-values, or an exception will 374 * throw. 375 * 376 * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN> 377 * 378 * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN> 379 * 380 * @param log Textual information shall be sent to the user/terminal using this log. 381 * <I><SPAN STYLE="color: red;">This parameter may <B>not</B> be null here.</SPAN></I> 382 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 383 * 384 * @return A {@code Vector<String>}. The HTML will be in {@code String} format, not 385 * {@code HTMLNode} format. 386 * 387 * @see TagNodeRemove 388 * @see Util 389 * @see TagNodeRemoveInclusive 390 * @see TextNodeRemove 391 */ 392 @Deprecated 393 public static Vector<String> get01( 394 Iterator<URL> iter, String[] emptyDIVs, String[] textNodes, 395 boolean callTrimTextNodes, Appendable log 396 ) 397 throws IOException 398 { 399 Vector<String> ret = new Vector<>(); 400 401 while (iter.hasNext()) 402 { 403 URL url = iter.next(); 404 log.append("Visiting URL: " + url.toString() + '\n'); 405 Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false); 406 log.append("Removed " + TagNodeRemove.all(page, TC.Both, "meta") + " meta tags.\n"); 407 log.append("Removed " + TagNodeRemove.all(page, TC.Both, "link") + " link tags.\n"); 408 log.append("Removed " + Util.Remove.scriptNodeBlocks(page) + " Script Node Blocks.\n"); 409 log.append("Removed " + Util.Remove.styleNodeBlocks(page) + " Script Style Blocks.\n"); 410 log.append("Removed " + Util.Remove.allCommentNodes(page) + " Comment Nodes.\n"); 411 log.append("Removed " + TagNodeRemoveInclusive.all(page, "head", "noscript", "header") + " <HEAD>, <HEADER>, <NOSCRIPT> nodes.\n"); 412 413 // Removes all HTML <DIV> Elements where the "class" is in the String argument list 414 if ((emptyDIVs != null) && (emptyDIVs.length > 0)) 415 log.append( 416 "Removed " + InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs) + 417 " HTML <DIV> Elements.\n" 418 ); 419 420 // Removes HTML <DIV> or <P> elements that are empty, recursively 421 log.append("Removed [" + Util.Remove.inclusiveEmpty(page, "p", "div") + "] Empty <DIV> and <P> elements.\n"); 422 423 // Removes all opening and closing elements of the following: 424 // Does not remove the content between these elements 425 log.append("Removed " + TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span") + 426 " HTML Elements: div, a, html, body, li, ul, span.\n"); 427 428 // Removes TextNodes that contain the elements in the String argument list 429 if ((textNodes != null) && (textNodes.length > 0)) 430 log.append("Removed " + TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes) + " TextNodes.\n"); 431 432 // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single 433 // TextNode element. 434 log.append("Removed " + Util.compactTextNodes(page) + " Nodes by compacting TextNodes.\n"); 435 436 // Long strings of spaces will be removed. 437 // UNFORTUNATELY, New Lines will also disappear. 438 if (callTrimTextNodes) 439 log.append("Removed " + Util.trimTextNodes(page, true) + " Trimming Text Nodes.\n"); 440 441 // Remove id, class, and other attributes. 442 log.append("Removed Attributes From " + Attributes.remove(page, "class", "id", "alt").length + " Nodes.\n"); 443 444 // Add some new-lines('\n' - not <BR />!) 445 int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5"); 446 for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1); 447 448 // Save this page' image to the return vector. 449 ret.addElement(Util.pageToString(page)); 450 } 451 // Pass the Return Vector. Each element of this Vector<String> will contain a picture and paragraph 452 // about that picture. The images will not have been downloaded, nor any partially resolved URL's 453 // resolved. 454 return ret; 455 } 456 457 /** 458 * <EMBED CLASS='external-html' DATA-FILE-ID=PBSLEGACY> 459 * 460 * The code here is carbon copied from the above loop. It is just the central loop body, that 461 * does not iterate over many pages, but rather just one. 462 * 463 * <BR /><BR /><B><SPAN STYLE="color: red;">CLONE NOTICE:</B></SPAN> This method modifies the 464 * underlying {@code Vector}. If you wish to avoid that, please call this method with using 465 * the following parameter: {@code (Vector<HTMLNode>) yourOriginalPage.clone()}. Make sure to 466 * use the {@code SuppressWarnings("unchecked")} annotation. 467 * 468 * @param page Any HTML page that has extraneous advertising and java-script junk. 469 * 470 * @param emptyDIVs These are HTML divider elements who "class" equals the strings in this 471 * list. HTML divider elements that contain these strings inside their 'class' field shall be 472 * removed (inclusively). This is a string-array, and it may be null - and if it is, it will 473 * be ignored - but it may not contain null-values, or an exception will throw. 474 * 475 * @param textNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSTN> 476 * 477 * @param callTrimTextNodes <EMBED CLASS='external-html' DATA-FILE-ID=PBSCTTN> 478 * 479 * @param log This is a log, and <I><B>it may be null.</I></B> If it is null, it will be 480 * ignored. 481 * 482 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 483 * 484 * @return a Stripped down version of the page, with most extraneous photo-bomb site junk 485 * removed. 486 * 487 * @throws IOException This method throws {@code IOException} simply because it prints to the 488 * {@code interface java.lang.Appendable}, which requires that {@code IOException} be 489 * monitored / checked in code that uses this interface. 490 */ 491 @Deprecated 492 public static String get02( Vector<HTMLNode> page, String[] emptyDIVs, String[] textNodes, 493 boolean callTrimTextNodes, Appendable log) throws IOException 494 { 495 int c = TagNodeRemove.all(page, TC.Both, "meta"); 496 if (log != null) log.append("Removed " + c + " meta tags.\n"); 497 498 c = TagNodeRemove.all(page, TC.Both, "link"); 499 if (log != null) log.append("Removed " + c + " link tags.\n"); 500 501 c = Util.Remove.scriptNodeBlocks(page); 502 if (log != null) log.append("Removed " + c + " Script Node Blocks.\n"); 503 504 c = Util.Remove.styleNodeBlocks(page); 505 if (log != null) log.append("Removed " + c + " Script Style Blocks.\n"); 506 507 c = Util.Remove.allCommentNodes(page); 508 if (log != null) log.append("Removed " + c + " Comment Nodes.\n"); 509 510 c = TagNodeRemoveInclusive.all(page, "head", "noscript", "header"); 511 if (log != null) log.append("Removed " + c + " <HEAD> nodes.\n"); 512 513 // Removes all HTML <DIV> Elements where the "class" is in the String argument list 514 if ((emptyDIVs != null) && (emptyDIVs.length > 0)) 515 { 516 c = InnerTagRemoveInclusive.all(page, "div", "class", TextComparitor.C, emptyDIVs); 517 if (log != null) log.append("Removed " + c + " HTML <DIV> Elements.\n"); 518 } 519 520 // Removes HTML <DIV> or <P> elements that are empty, recursively 521 c = Util.Remove.inclusiveEmpty(page, "p", "div"); 522 if (log != null) log.append("Removed [" + c + "] Empty <DIV> and <P> elements.\n"); 523 524 // Removes all opening and closing elements of the following: 525 // Does not remove the content between these elements 526 c = TagNodeRemove.all(page, TC.Both, "div", "a", "html", "body", "li", "ul", "span"); 527 if (log != null) log.append("Removed " + c + " HTML Elements: div, a, html, body, li, ul, span.\n"); 528 529 // Removes TextNodes that contain the elements in the String argument list 530 if ((textNodes != null) && (textNodes.length > 0)) 531 { 532 c = TextNodeRemove.all(page, TextComparitor.CN_CI, textNodes); 533 if (log != null) log.append("Removed " + c + " TextNodes.\n"); 534 } 535 536 // Many nodes have been removed, and this will convert multiple, adjacent TextNodes into a single 537 // TextNode element. 538 c = Util.compactTextNodes(page); 539 if (log != null) log.append("Removed " + c + " Nodes by compacting TextNodes.\n"); 540 541 // Long strings of spaces will be removed. 542 // UNFORTUNATELY, New Lines will also disappear. 543 if (callTrimTextNodes) 544 { 545 c = Util.trimTextNodes(page, true); 546 if (log != null) log.append("Removed " + c + " Trimming Text Nodes.\n"); 547 } 548 549 // Remove id, class, and other attributes. 550 c = Attributes.remove(page, "class", "id", "alt").length; 551 if (log != null) log.append("Removed Attributes From " + c + " Nodes.\n"); 552 553 // Add some new-lines('\n' - not <BR />!) 554 int[] posArr = TagNodeFind.all(page, TC.ClosingTags, "p", "img", "h1", "h2", "h3", "h4", "h5"); 555 for (int i = posArr.length - 1; i >= 0; i--) page.insertElementAt(NEW_LINE, posArr[i] + 1); 556 557 return Util.pageToString(page); 558 } 559}