package Torello.HTML.Tools.NewsSite;

import Torello.HTML.*;
import Torello.Java.*;

import Torello.HTML.Tools.Images.ImageScraper;
import Torello.HTML.Tools.Images.Request;

import static Torello.Java.C.*;

import java.util.*;
import java.io.*;
import java.util.regex.*;
import java.net.URL;
import java.util.concurrent.TimeUnit;

/**
 * Converts Serialized Object Files of HTML-Vectors into <CODE>'.html'</CODE> Files, and can
 * also be used to do any user-defined, customized post-processing (using a function-pointer)
 * on news-articles (after downloading them).
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML>
 */
@Torello.JavaDoc.StaticFunctional
public class ToHTML
{
    private ToHTML() { }

    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");

    /**
     * This method is a 'convenience' method that converts the data-files (Java Serialized
     * Objects) generated by the {@code ScrapeArticles.download(...)} method into partial HTML
     * files whose images have also been downloaded.
     * This method performs two primary operations:
     *
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI> Retrieves {@code '.vdat'} files from the directory where the
     *      {@code ScrapeArticles.download(...)} method left the page data-files.  Uses
     *      standard Java Object De-Serialization to load the HTML
     *      page-{@code Vector<HTMLNode>} into memory, and saves the files as standard
     *      {@code .html} text-files.
     *      <BR /><BR />
     * </LI>
     * <LI> Invokes the {@code ImageScraper.localizeImages} method to download any images
     *      that are present on the web-page into the local directory, and replaces the HTML
     *      {@code <IMG SRC=...>} links with the downloaded-image file-names.
     * </LI>
     * </OL>
     *
     * @param inputDir This parameter should contain the name of the directory that was used
     * with the {@code ScrapeArticles.download(...)} method.  This directory must exist, and
     * it must contain the files that were saved.
     *
     * @param outputDir This parameter should contain the name of the directory where the
     * expanded and de-serialized {@code '.html'} files will be stored, along with their
     * downloaded images.
     *
     * @param cleanIt When this parameter is set to {@code TRUE}, some HTML data will be
     * stripped from each page before it is re-written to disk.  The benefit is that this can
     * make reading the pages a lot easier (without losing anything important about the
     * article).  When scraping news articles, the CSS classes used by the web-site stop
     * having much use, as does any JavaScript that is latent on the page.  If you would like
     * to keep this information, just pass {@code FALSE} to this parameter to skip the
     * 'cleaning' step.
     *
     * <BR /><BR />When {@code TRUE}, you will be making a request to remove the following
     * HTML Elements from article-pages:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code <SCRIPT>...</SCRIPT>} blocks are removed</LI>
     * <LI>{@code <STYLE>...</STYLE>} blocks are removed</LI>
     * <LI>{@code 'class=...'} and {@code 'id=...'} HTML Element Attributes are stripped</LI>
     * </UL>
     *
     * @param modifyOrRetrieve This {@code Functional Interface} allows a user to pass a
     * method or a lambda-expression that performs a customized "Clean Up" of the Newspaper
     * {@code Article's}.  Customized clean-up could be anything from removing advertisements
     * to extracting the Author's Name and Article Date and placing them somewhere.  You may
     * even get rid of (or move) the (very common) "Post to Twitter" or "Post to Facebook"
     * thumbnails.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">NULLABLE:</SPAN></B> This parameter may be
     * null, and if it is, it will be ignored.  To be frank, the {@code ArticleGet} that is
     * used to retrieve the {@code Article}-body HTML could just as easily be used to perform
     * any needed clean-up on the newspaper articles.  Having an additional entry-point for
     * tweaking the HTML here is only provided to make things easier.  This is only a
     * function-pointer parameter, and it may just as easily be passed null as it may be
     * passed a complex HTML Modification procedure.
     *
     * <BR /><BR /><B>NOTE:</B> Once a good understanding of the classes and methods in the
     * {@code NodeSearch} package is attained, using those methods to move, update or modify
     * HTML becomes second-nature.  Cleaning up large numbers of newspaper articles to get
     * rid of the "View Related Articles" links-portion of the page (for example), or banners
     * at the top that say "Send Via E-Mail" and "Pin to Pinterest", will usually take a
     * couple of lines of code (with {@code 'NodeSearch'}).
     *
     * <BR /><BR /><B>ALSO:</B> Another good use for this {@code Functional Interface} would
     * be to extract data that is inside HTML {@code <SCRIPT> ... </SCRIPT>} tags.  There
     * might be additional images or article "Meta-Data" (author, title, date, reporter-name,
     * etc...) that the programmer considers important - and which would need to be parsed
     * using a {@code JSON} parser (several of which are freely available for download on the
     * internet).
     *
     * @param log Output text is sent to this log.  This parameter may be null, and if it is,
     * it shall be ignored.  If this program is running on UNIX, color-codes will be included
     * in the log data.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @throws IOException If there are any I/O Exceptions when writing image-files to the
     * file-system, this exception will be thrown.
     */
    @SuppressWarnings("unchecked")
    public static void convert(
            String inputDir, String outputDir, boolean cleanIt,
            HTMLModifier modifyOrRetrieve, Appendable log
        )
        throws IOException
    {
        if (log != null) log.append(
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." +
            BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;

        // Uses the FileNode class to build an iterator of all '.dat' files that are found in
        // the 'inputDir' directory-parameter.
        Iterator<FileNode> iter = FileNode
            .createRoot(inputDir)
            .loadTree()
            .getDirContentsFiles
                (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat"));

        // Iterate through each of the data-files.
        while (iter.hasNext())

            try
            {
                // Retrieve the next article, using the iterator
                FileNode fn = iter.next();

                // Load the instance of 'Article' into memory, using Object De-Serialization
                Article page =
                    FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);

                // If there are customized modifications to the page (or retrieval
                // operations) that were requested, they are done here.
                if (modifyOrRetrieve != null)
                {
                    // Retrieves the section-number and article-number from the file-name
                    Matcher m = P1.matcher(fn.toString());

                    // These are initialized to -1, and if the directory-name / file-name did
                    // not use the standard "factory-generated" file-save, then these will
                    // STILL BE -1 when passed to the modifier lambda.
                    int sectionNum = -1;
                    int articleNum = -1;

                    if (m.find())
                    {
                        sectionNum = Integer.parseInt(m.group(1));
                        articleNum = Integer.parseInt(m.group(2));
                    }

                    // Pass the article-body (and its URL and file-name numbers) to the
                    // customized HTML-Modifier provided by the user who called this method.
                    modifyOrRetrieve.modifyOrRetrieve
                        (page.articleBody, page.url, sectionNum, articleNum);
                }

                // We need to build a "Sub-Directory" name for the HTML page, where the
                // downloaded images will be stored.
                //
                // NOTE: String.replace(CharSequence, CharSequence) performs a literal
                // (non-regex) replacement, so '.' characters are replaced here directly.
                int dotPos = fn.name.lastIndexOf(".");

                String outDirName =
                    outputDir + fn.name.substring(0, dotPos).replace('.', '/') + '/';

                // Make sure the sub-directory exists.
                new File(outDirName).mkdirs();

                // This process may be skipped, but it makes the output HTML much cleaner and
                // more readable for most Internet News Web-Sites.  <SCRIPT>, <STYLE> and
                // <!-- comment --> blocks are all removed.  Also, any "class" or "id"
                // attributes are eliminated.  This "cleaning" can be easily skipped.
                if (cleanIt)
                {
                    Util.Remove.scriptNodeBlocks(page.articleBody);
                    Util.Remove.styleNodeBlocks(page.articleBody);
                    Util.Remove.allCommentNodes(page.articleBody);
                    Attributes.remove(page.articleBody, "class", "id");
                }

                if (log != null)
                    log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n');

                // 'Localize' any images available.  'Localizing' an HTML web-page means
                // downloading the image-data, and saving it to disk.
                ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName);

                // If there were any images available, they were downloaded and localized.
                // Then write the (updated) HTML to an '.html' text-file.
                FileRW.writeFile
                    (Util.pageToString(page.articleBody), outDirName + "index.html");
            }

            // NOTE: The "ImageScraper" spawns a (very small) "monitor thread" which ensures
            // that downloading does not "hang" the system, by aborting image-downloads that
            // take longer than 10 seconds.  It is necessary to shut these threads down on
            // system-exit, because if they are not shut down when a Java program terminates,
            // the terminal window that the program is using will appear to "hang" or
            // "freeze" until the extra thread is shut down by the JVM.  This delay can be
            // upwards of 30 seconds.
            catch (IOException ioe)
                { ImageScraper.shutdownTOThreads();  throw ioe; }

            catch (Exception e)
            {
                ImageScraper.shutdownTOThreads();

                throw new IOException(
                    "There was a problem converting the html pages.  See " +
                    "exception.getCause() for more details.",
                    e
                );
            }

        // Exit the method.  Again, shut down the Time-Out "monitor" thread.
        ImageScraper.shutdownTOThreads();
    }
}
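The section-number / article-number extraction inside {@code convert(...)} relies entirely on the {@code P1} regular-expression matching two runs of three-or-more digits in the saved file-name. Below is a minimal, self-contained sketch of just that parsing step, with the {@code Pattern} copied verbatim from the class. The file-names used (`articles/003.014.dat`, `my-saved-page.dat`) are hypothetical examples of a "factory-generated" and a non-conforming name, respectively; they are not taken from the library.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Stand-alone demo of the file-name parsing performed inside ToHTML.convert(...).
// The Pattern below is identical to the private field 'P1' in class ToHTML.
public class FileNameParseDemo
{
    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");

    // Returns { sectionNum, articleNum }, or { -1, -1 } when the name does not follow
    // the factory-generated convention - mirroring the logic in convert(...).
    static int[] parse(String fileName)
    {
        int sectionNum = -1, articleNum = -1;
        Matcher m = P1.matcher(fileName);

        if (m.find())
        {
            sectionNum = Integer.parseInt(m.group(1));
            articleNum = Integer.parseInt(m.group(2));
        }

        return new int[] { sectionNum, articleNum };
    }

    public static void main(String[] args)
    {
        int[] nums = parse("articles/003.014.dat");
        System.out.println("section=" + nums[0] + ", article=" + nums[1]);
        // prints "section=3, article=14"

        nums = parse("my-saved-page.dat");
        System.out.println("section=" + nums[0] + ", article=" + nums[1]);
        // prints "section=-1, article=-1"
    }
}
```

For the full conversion, a typical (hypothetical) invocation matching the signature above would be `ToHTML.convert("input-dir/", "output-dir/", true, null, System.out);`, passing null to skip the custom {@code HTMLModifier} step.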