001package Torello.HTML.Tools.NewsSite; 002 003import Torello.HTML.*; 004import Torello.Java.*; 005 006import Torello.HTML.Tools.Images.ImageScraper; 007import Torello.HTML.Tools.Images.Request; 008 009import static Torello.Java.C.*; 010 011import java.util.*; 012import java.io.*; 013import java.util.regex.*; 014 015import java.net.URL; 016import java.util.concurrent.TimeUnit; 017 018/** 019 * Converts Serialized Object Files of HTML-Vectors into <CODE>'.html'</CODE> Files, and can 020 * also be used to do any user-defined, customized post-processing (using a function-pointer) on 021 * news-articles (after downloading them). 022 * 023 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML> 024 */ 025@Torello.JavaDoc.StaticFunctional 026public class ToHTML 027{ 028 private ToHTML() { } 029 030 private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*"); 031 032 /** 033 * This method is a 'convenience' method that converts the data-files (Java Serialized Objects) 034 * generated by the {@code ScrapeArticles.download(...)} method into partial HTML files whose 035 * images have even been download. This method performs two primary operations: 036 * 037 * <BR /><BR /><OL CLASS=JDOL> 038 * <LI> Retrieves {@code '.vdat'} files from the directory where the 039 * {@code ScrapeArticles.download(...)} left the page data-files. Uses standard java 040 * object de-serialization to load the HTML page-{@code Vector<HTMLNode>} into memory, 041 * and saves the files as standard {@code .html} text-files. 042 * <BR /><BR /> 043 * </LI> 044 * <LI> Invokes the {@code ImageScraper.localizeImages} method to download any images that are 045 * present on the web-page into the local directory, and replaces the HTML 046 * {@code <IMG SRC=...>} links with the downloaded-image file-name. 047 * </LI> 048 * </OL> 049 * 050 * @param inputDir This parameter should contain the name of the directory that was used with 051 * method {@code download(...)} from {@code ScrapeArticle's}. This directory must exist and it 052 * must contain the files that were saved. 053 * 054 * @param outputDir This parameter should contain the name of the directory where the expanded 055 * and de-serialized {@code '.html'} files will be stored, along with their downloaded images. 056 * 057 * @param cleanIt When this parameter is set to {@code TRUE}, then some HTML data will be 058 * stripped from each page before it is re-written disk. The benefit here is that it can make 059 * reading the pages a lot easier (Without losing anything important about the article). When 060 * scraping news articles, the CSS classes used by the web-site stop having much use, as does 061 * any java-script that is latent on the page. If you would like to keep this information, 062 * just pass {@code FALSE} to this parameter to skip this 'cleaning' step. 063 * 064 * <BR /><BR />When {@code TRUE}, you will be making a request to remove the following HTML 065 * Elements from article-pages: 066 * 067 * <BR /><BR /><UL CLASS=JDUL> 068 * <LI>{@code <SCRIPT>...</SCRIPT>} blocks are removed</LI> 069 * <LI>{@code <STYLE>...</STYLE>} blocks are removed</LI> 070 * <LI>{@code 'class=...'} and {@code 'id=...'} HTML Element Attributes are stripped</LI> 071 * </UL> 072 * 073 * @param modifyOrRetrieve This {@code Functional Interface} allows a user to pass a method 074 * or a lambda-expression that performs a customized "Clean Up" of the Newspaper 075 * {@code Article's}. Customized clean up could be anything from removing advertisements to 076 * extracting the Author's Name and Article Data and placing it somewhere. You may even get 077 * rid of (or move) the (very common) "Post to Twitter" or "Post to Facebook" thumbnails. 078 * 079 * <BR /><BR /><B><SPAN STYLE="color: red;">NULLABLE:</B></SPAN> This parameter may be null, 080 * and if it is it will be ignored. Just to be frank, the {@code ArticleGet} that is used to 081 * retrieve the {@code Article}-body HTML could just as easily be used to perform any needed 082 * cleanup on the news-paper articles. Having an additional entry-point for tweaking the HTML 083 * here is only provided to make things easier. This only a function-pointer parameter, and it 084 * may just as easily be passed null as it may be passed a complex HTML Modificatin procedure. 085 * 086 * <BR /><BR /><B>NOTE:</B> Once a good understanding of how the classes and methods in the 087 * {@code package HTML.NodeSearch} package is attained, using those methods to move, update or 088 * modify HTML becomes second-nature. Cleaning up large numbers of newspaper articles to get 089 * rid of the "View Related Articles" links-portion of the page (for example), or banners at 090 * the top that say "Send Via E-Mail" and "Pin to Pinterest" will usually take a couple lines 091 * of code (with {@code 'NodeSearch'}). 092 * 093 * <BR /><BR /><B>ALSO:</B> Another good use for this {@code Functional Interface} would be 094 * to extract data that is inside HTML {@code <SCRIPT> ... </SCRIPT>} tags. There might be 095 * additional images or article "Meta Data" (author, title, date, reporter-name, etc..) that 096 * the programmer might consider important - and would need to be parsed using a {@code JSON} 097 * parser which is freely available for download on the internet as well. 098 * 099 * @param log Output text is sent to this log. This parameter may be null, and if it is, it 100 * shall be ignored. If this program is running on UNIX, color-codes will be included in the 101 * log data. 102 * 103 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 104 * 105 * @throws IOException If there any I/O Exceptions when writing image files to the file-system, 106 * then this exception will throw. 107 */ 108 @SuppressWarnings("unchecked") 109 public static void convert( 110 String inputDir, String outputDir, boolean cleanIt, HTMLModifier modifyOrRetrieve, 111 Appendable log 112 ) 113 throws IOException 114 { 115 if (log !=null) log.append( 116 "\n" + BRED + 117 "*****************************************************************************************\n" + 118 "*****************************************************************************************\n" + 119 RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" + 120 "*****************************************************************************************\n" + 121 "*****************************************************************************************\n" + 122 RESET + '\n' 123 ); 124 125 if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator; 126 127 // Uses the FileNode class to build an iterator of all '.dat' files that are found in the 128 // 'inputDir' directory-parameter. 129 130 Iterator<FileNode> iter = FileNode 131 .createRoot(inputDir) 132 .loadTree() 133 .getDirContentsFiles 134 (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat")); 135 136 // Iterate through each of the data-files. 137 while (iter.hasNext()) 138 try 139 { 140 // Retrieve next article, using the iterator 141 FileNode fn = iter.next(); 142 143 // Load the instance of 'Article' into memory, using Object De-Serialization 144 Article page = FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true); 145 146 // If there are customized modifications to the page (or retrieval operations) 147 // that were requested, they are done here. 148 149 if (modifyOrRetrieve != null) 150 { 151 // Retrieves the section-number and article-number from file-name 152 Matcher m = P1.matcher(fn.toString()); 153 154 // These will be set to -1, and if the directoryName/fileName did not use the 155 // standard "factory-generated" file-save, then these will STILL BE -1 when 156 // passed to the modifier lambda. 157 158 int sectionNum = -1; 159 int articleNum = -1; 160 161 if (m.find()) 162 { 163 sectionNum = Integer.parseInt(m.group(1)); 164 articleNum = Integer.parseInt(m.group(2)); 165 } 166 167 // pass the articleBody (and it's URL and filename) to the customized 168 // HTML Modifier provided by the user who called this method 169 170 modifyOrRetrieve.modifyOrRetrieve 171 (page.articleBody, page.url, sectionNum, articleNum); 172 } 173 174 // We need to build a "Sub-Directory" name for the HTML page where the download 175 // images will be stored 176 177 int dotPos = fn.name.lastIndexOf("."); 178 String outDirName = outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/'; 179 180 // Make sure the subdirectory exists. 181 new File(outDirName).mkdirs(); 182 183 // This process may be skipped, but it makes the output HTML much cleaner and more 184 // readable for most Internet News Web-Sites. Both <SCRIPT>, <!-- --> elements are 185 // removed. Also, any "class" or "id" fields are eliminated. This "cleaning" can 186 // be easily skipped 187 188 if (cleanIt) 189 { 190 Util.Remove.scriptNodeBlocks(page.articleBody); 191 Util.Remove.styleNodeBlocks(page.articleBody); 192 Util.Remove.allCommentNodes(page.articleBody); 193 Attributes.remove(page.articleBody, "class", "id"); 194 } 195 196 if (log != null) log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n'); 197 198 // 'Localize' any images available. 'localizing' an HTML web-page means downloading 199 // the image data, and saving it to disk. 200 201 ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName); 202 203 // If there were any images available, they were downloaded and localized. The 204 // Write the (updated) HTML to an '.html' text-file. 205 206 FileRW.writeFile(Util.pageToString(page.articleBody), outDirName + "index.html"); 207 } 208 209 // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that 210 // downloading does not "hang" the system by aborting image-downloads that take longer 211 // than 10 seconds. It is necessary to shut-down these threads on system exit, because 212 // if they are not shutdown, when a java program terminates, the operating system that 213 // the program is using (the terminal window) will appear to "hang" or "freeze" until 214 // the extra-thread is shut-down by the JVM. This delay can be upwards of 30 seconds. 215 216 catch (IOException ioe) 217 { ImageScraper.shutdownTOThreads(); throw ioe; } 218 219 catch (Exception e) 220 { 221 ImageScraper.shutdownTOThreads(); 222 223 throw new IOException( 224 "There was a problem converting the html pages. See exception.getCause() " + 225 "for more details.", 226 e 227 ); 228 } 229 230 // Exit the method. Again, shutdown the Time-Out "monitor" thread. 231 ImageScraper.shutdownTOThreads(); 232 } 233}