001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.Java.*;
005
006import Torello.HTML.Tools.Images.ImageScraper;
007import Torello.HTML.Tools.Images.ImageScraper.AdditionalParameters;
008import Torello.Java.FileNode.RetTypeChoice;
009import Torello.Java.Shell.C;
010
011import java.util.*;
012import java.io.*;
013import java.util.regex.*;
014
015import java.net.URL;
016import java.util.concurrent.TimeUnit;
017
018/**
019 * Converts Serialized Object Files of HTML-Vectors into <CODE>'&#46;html'</CODE> Files, and can
020 * also be used to do any user-defined, customized post-processing (using a function-pointer) on
021 * news-articles (after downloading them).
022 * 
023 * <EMBED CLASS="external-html" DATA-FILE-ID="TO_HTML">
024 */
025@Torello.HTML.Tools.JavaDoc.StaticFunctional
026public class ToHTML 
027{
028    private ToHTML() { }
029
030    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");
031
032    /**
033     * This method is a 'convenience' method that converts the data-files (Java Serialized Objects)
034     * generated by the {@code ScrapeArticles.download(...)} method into partial HTML files whose
035     * images have even been download.  This method performs two primary operations:
036     *
037     * <BR /><BR /><OL CLASS="JDOL">
038     * <LI> Retrieves {@code '.vdat'} files from the directory where the
039     *      {@code ScrapeArticles.download(...)} left the page data-files.  Uses standard java
040     *      object de-serialization to load the HTML page-{@code Vector<HTMLNode>} into memory,
041     *      and saves the files as standard {@code .html} text-files.
042     *      <BR /><BR />
043     *      </LI>
044     * <LI> Invokes the {@code ImageScraper.localizeImages} method to download any images that are
045     *      present on the web-page into the local directory, and replaces the HTML
046     *      {@code <IMG SRC=...>} links with the downloaded-image file-name.
047     *      </LI>
048     * </OL>
049     *
050     * @param inputDir This parameter should contain the name of the directory that was used with
051     * method {@code download(...)} from {@code ScrapeArticle's}.  This directory must exist and it
052     * must contain the files that were saved.
053     *
054     * @param outputDir This parameter should contain the name of the directory where the expanded
055     * and de-serialized {@code '.html'} files will be stored, along with their downloaded images.
056     *
057     * @param cleanIt When this parameter is set to <B>TRUE</B>, then some HTML data will be
058     * stripped from each page before it is re-written disk.  The benefit here is that it can make
059     * reading the pages a lot easier (Without losing anything important about the article).  When
060     * scraping news articles, the CSS classes used by the web-site stop having much use, as does
061     * any java-script that is latent on the page.  If you would like to keep this information,
062     * just pass <B>FALSE</B> to this parameter to skip this 'cleaning' step.
063     * 
064     * <BR /><BR />When {@code TRUE}, you will be making a request to remove the following HTML
065     * Elements from article-pages:
066     *
067     * <BR /><BR /><UL CLASS="JDUL">
068     *      <LI>{@code <SCRIPT>...</SCRIPT>} blocks are removed</LI>
069     *      <LI>{@code <STYLE>...</STYLE>} blocks are removed</LI>
070     *      <LI>{@code 'class=...'} and {@code 'id=...'} HTML Element Attributes are stripped</LI>
071     * </UL>
072     * 
073     * @param modifyOrRetrieve This {@code Functional Interface} allows a user to pass a method
074     * or a lambda-expression that performs a customized "Clean Up" of the Newspaper
075     * {@code Article's}.  Customized clean up could be anything from removing advertisements to
076     * extracting the Author's Name and Article Data and placing it somewhere.  You may even get
077     * rid of (or move) the (very common) "Post to Twitter" or "Post to Facebook" thumbnails.
078     *
079     * <BR /><BR /><B><SPAN STYLE="color: red;">NULLABLE:</B></SPAN> This parameter may be null, 
080     * and if it is it will be ignored.  Just to be frank, the {@code ArticleGet} that is used to
081     * retrieve the {@code Article}-body HTML could just as easily be used to perform any needed
082     * cleanup on the news-paper articles.  Having an additional entry-point for tweaking the HTML
083     * here is only provided to make things easier.  This only a function-pointer parameter, and it
084     * may just as easily be passed null as it may be passed a complex HTML Modificatin procedure.
085     *
086     * <BR /><BR /><B>NOTE:</B> Once a good understanding of how the classes and methods in the
087     * {@code package HTML.NodeSearch} package is attained, using those methods to move, update or
088     * modify HTML becomes second-nature.  Cleaning up large numbers of newspaper articles to get
089     * rid of the "View Related Articles" links-portion of the page (for example), or banners at
090     * the top that say "Send Via E-Mail" and "Pin to Pinterest" will usually take a couple lines
091     * of code (with {@code 'NodeSearch'}).
092     *
093     * <BR /><BR /><B>ALSO:</B> Another good use for this {@code Functional Interface} would be
094     * to extract data that is inside HTML {@code <SCRIPT> ... </SCRIPT>} tags.  There might be
095     * additional images or article "Meta Data" (author, title, date, reporter-name, etc..) that
096     * the programmer might consider important - and would need to be parsed using a {@code JSON}
097     * parser which is freely available for download on the internet as well.
098     *
099     * @param log Output text is sent to this log.  This parameter may be null, and if it is, it
100     * shall be ignored.  If this program is running on UNIX, color-codes will be included in the
101     * log data.
102     * 
103     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
104     *
105     * @throws IOException If there any I/O Exceptions when writing image files to the file-system,
106     * then this exception will throw.
107     */
108    @SuppressWarnings("unchecked")
109    public static void convert(
110        String inputDir, String outputDir, boolean cleanIt, HTMLModifier modifyOrRetrieve,
111        Appendable log
112    )
113        throws IOException
114    {
115        if (log !=null) log.append(
116            "\n" + C.BRED +
117            "*****************************************************************************************\n" +
118            "*****************************************************************************************\n" + 
119            C.RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + C.BRED + "\n" +
120            "*****************************************************************************************\n" +
121            "*****************************************************************************************\n" + 
122            C.RESET + '\n'
123        );
124
125        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;
126
127        // Uses the FileNode class to build an iterator of all '.dat' files that are found in the
128        // 'inputDir' directory-parameter.
129        Iterator<FileNode> iter = FileNode
130            .createRoot(inputDir)
131            .loadTree()
132            .getDirContentsFiles
133                (RetTypeChoice.ITERATOR, (FileNode fn) -> fn.name.endsWith(".dat"));
134
135        // Iterate through each of the data-files.
136        while (iter.hasNext())
137            try
138            {
139                // Retrieve next article, using the iterator
140                FileNode fn = iter.next();
141
142                // Load the instance of 'Article' into memory, using Object De-Serialization
143                Article page = FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);
144
145                // If there are customized modifications to the page (or retrieval operations)
146                // that were requested, they are done here.
147                if (modifyOrRetrieve != null)
148                {
149                    // Retrieves the section-number and article-number from file-name
150                    Matcher m = P1.matcher(fn.toString());
151
152                    // These will be set to -1, and if the directoryName/fileName did not use the
153                    // standard "factory-generated" file-save, then these will STILL BE -1 when
154                    // passed to the modifier lambda.
155                    int sectionNum = -1;
156                    int articleNum = -1;
157
158                    if (m.find())
159                    {
160                        sectionNum = Integer.parseInt(m.group(1));
161                        articleNum = Integer.parseInt(m.group(2));
162                    }
163
164                    // pass the articleBody (and it's URL and filename) to the customized
165                    // HTML Modifier provided by the user who called this method
166                    modifyOrRetrieve.modifyOrRetrieve
167                        (page.articleBody, page.url, sectionNum, articleNum);
168                }
169
170                // We need to build a "Sub-Directory" name for the HTML page where the download
171                // images will be stored
172                int     dotPos      = fn.name.lastIndexOf(".");
173                String  outDirName  = outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/';
174
175                // Make sure the subdirectory exists.
176                new File(outDirName).mkdirs();
177
178                // This process may be skipped, but it makes the output HTML much cleaner and more
179                // readable for most Internet News Web-Sites.  Both <SCRIPT>, <!-- --> elements are
180                // removed.  Also, any "class" or "id" fields are eliminated.  This "cleaning" can
181                // be easily skipped
182
183                if (cleanIt)
184                {
185                    Util.removeScriptNodeBlocks(page.articleBody);
186                    Util.removeStyleNodeBlocks(page.articleBody);
187                    Util.removeAllCommentNodes(page.articleBody);
188                    Attributes.remove(page.articleBody, "class", "id");
189                }
190
191                if (log != null) log.append("Writing Page: " + C.BGREEN + fn.name + C.RESET + '\n');
192
193                // 'Localize' any images available.  'localizing' an HTML web-page means downloading
194                // the image data, and saving it to disk.
195                AdditionalParameters ap = new AdditionalParameters();
196                ImageScraper.localizeImages(page.articleBody, page.url, log, ap, outDirName);
197
198                // If there were any images available, they were downloaded and localized.  The
199                // Write the (updated) HTML to an '.html' text-file.
200                FileRW.writeFile(Util.pageToString(page.articleBody), outDirName + "index.html");
201            }
202
203            // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that
204            // downloading does not "hang" the system by aborting image-downloads that take longer
205            // than 10 seconds.  It is necessary to shut-down these threads on system exit, because
206            // if they are not shutdown, when a java program terminates, the operating system that
207            // the program is using (the terminal window) will appear to "hang" or "freeze" until
208            // the extra-thread is shut-down by the JVM.  This delay can be upwards of 30 seconds.
209            catch (IOException ioe)
210                { ImageScraper.shutdownTOThreads(); throw ioe; }
211
212            catch (Exception e)
213            {
214                ImageScraper.shutdownTOThreads();
215
216                throw new IOException(
217                    "There was a problem converting the html pages.  See exception.getCause() " +
218                    "for more details.",
219                    e
220                );
221            }
222
223        // Exit the method.  Again, shutdown the Time-Out "monitor" thread.
224        ImageScraper.shutdownTOThreads();
225    }
226}