Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.Java.*;
005
006import Torello.HTML.Tools.Images.ImageScraper;
007import Torello.HTML.Tools.Images.Request;
008
009import static Torello.Java.C.*;
010
011import java.util.*;
012import java.io.*;
013import java.util.regex.*;
014
015import java.net.URL;
016import java.util.concurrent.TimeUnit;
017
018/**
019 * Converts Serialized Object Files of HTML-Vectors into <CODE>'&#46;html'</CODE> Files, and can
020 * also be used to do any user-defined, customized post-processing (using a function-pointer) on
021 * news-articles (after downloading them).
022 * 
023 * <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML>
024 */
025@Torello.JavaDoc.StaticFunctional
026public class ToHTML 
027{
028    private ToHTML() { }
029
030    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");
031
032    /**
033     * This method is a 'convenience' method that converts the data-files (Java Serialized Objects)
034     * generated by the {@code ScrapeArticles.download(...)} method into partial HTML files whose
035     * images have even been download.  This method performs two primary operations:
036     *
037     * <BR /><BR /><OL CLASS=JDOL>
038     * <LI> Retrieves {@code '.vdat'} files from the directory where the
039     *      {@code ScrapeArticles.download(...)} left the page data-files.  Uses standard java
040     *      object de-serialization to load the HTML page-{@code Vector<HTMLNode>} into memory,
041     *      and saves the files as standard {@code .html} text-files.
042     *      <BR /><BR />
043     *      </LI>
044     * <LI> Invokes the {@code ImageScraper.localizeImages} method to download any images that are
045     *      present on the web-page into the local directory, and replaces the HTML
046     *      {@code <IMG SRC=...>} links with the downloaded-image file-name.
047     *      </LI>
048     * </OL>
049     *
050     * @param inputDir This parameter should contain the name of the directory that was used with
051     * method {@code download(...)} from {@code ScrapeArticle's}.  This directory must exist and it
052     * must contain the files that were saved.
053     *
054     * @param outputDir This parameter should contain the name of the directory where the expanded
055     * and de-serialized {@code '.html'} files will be stored, along with their downloaded images.
056     *
057     * @param cleanIt When this parameter is set to {@code TRUE}, then some HTML data will be
058     * stripped from each page before it is re-written disk.  The benefit here is that it can make
059     * reading the pages a lot easier (Without losing anything important about the article).  When
060     * scraping news articles, the CSS classes used by the web-site stop having much use, as does
061     * any java-script that is latent on the page.  If you would like to keep this information,
062     * just pass {@code FALSE} to this parameter to skip this 'cleaning' step.
063     * 
064     * <BR /><BR />When {@code TRUE}, you will be making a request to remove the following HTML
065     * Elements from article-pages:
066     *
067     * <BR /><BR /><UL CLASS=JDUL>
068     *      <LI>{@code <SCRIPT>...</SCRIPT>} blocks are removed</LI>
069     *      <LI>{@code <STYLE>...</STYLE>} blocks are removed</LI>
070     *      <LI>{@code 'class=...'} and {@code 'id=...'} HTML Element Attributes are stripped</LI>
071     * </UL>
072     * 
073     * @param modifyOrRetrieve This {@code Functional Interface} allows a user to pass a method
074     * or a lambda-expression that performs a customized "Clean Up" of the Newspaper
075     * {@code Article's}.  Customized clean up could be anything from removing advertisements to
076     * extracting the Author's Name and Article Data and placing it somewhere.  You may even get
077     * rid of (or move) the (very common) "Post to Twitter" or "Post to Facebook" thumbnails.
078     *
079     * <BR /><BR /><B><SPAN STYLE="color: red;">NULLABLE:</B></SPAN> This parameter may be null, 
080     * and if it is it will be ignored.  Just to be frank, the {@code ArticleGet} that is used to
081     * retrieve the {@code Article}-body HTML could just as easily be used to perform any needed
082     * cleanup on the news-paper articles.  Having an additional entry-point for tweaking the HTML
083     * here is only provided to make things easier.  This only a function-pointer parameter, and it
084     * may just as easily be passed null as it may be passed a complex HTML Modificatin procedure.
085     *
086     * <BR /><BR /><B>NOTE:</B> Once a good understanding of how the classes and methods in the
087     * {@code package HTML.NodeSearch} package is attained, using those methods to move, update or
088     * modify HTML becomes second-nature.  Cleaning up large numbers of newspaper articles to get
089     * rid of the "View Related Articles" links-portion of the page (for example), or banners at
090     * the top that say "Send Via E-Mail" and "Pin to Pinterest" will usually take a couple lines
091     * of code (with {@code 'NodeSearch'}).
092     *
093     * <BR /><BR /><B>ALSO:</B> Another good use for this {@code Functional Interface} would be
094     * to extract data that is inside HTML {@code <SCRIPT> ... </SCRIPT>} tags.  There might be
095     * additional images or article "Meta Data" (author, title, date, reporter-name, etc..) that
096     * the programmer might consider important - and would need to be parsed using a {@code JSON}
097     * parser which is freely available for download on the internet as well.
098     *
099     * @param log Output text is sent to this log.  This parameter may be null, and if it is, it
100     * shall be ignored.  If this program is running on UNIX, color-codes will be included in the
101     * log data.
102     * 
103     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
104     *
105     * @throws IOException If there any I/O Exceptions when writing image files to the file-system,
106     * then this exception will throw.
107     */
108    @SuppressWarnings("unchecked")
109    public static void convert(
110        String inputDir, String outputDir, boolean cleanIt, HTMLModifier modifyOrRetrieve,
111        Appendable log
112    )
113        throws IOException
114    {
115        if (log !=null) log.append(
116            "\n" + BRED +
117            "*****************************************************************************************\n" +
118            "*****************************************************************************************\n" + 
119            RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" +
120            "*****************************************************************************************\n" +
121            "*****************************************************************************************\n" + 
122            RESET + '\n'
123        );
124
125        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;
126
127        // Uses the FileNode class to build an iterator of all '.dat' files that are found in the
128        // 'inputDir' directory-parameter.
129
130        Iterator<FileNode> iter = FileNode
131            .createRoot(inputDir)
132            .loadTree()
133            .getDirContentsFiles
134                (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat"));
135
136        // Iterate through each of the data-files.
137        while (iter.hasNext())
138            try
139            {
140                // Retrieve next article, using the iterator
141                FileNode fn = iter.next();
142
143                // Load the instance of 'Article' into memory, using Object De-Serialization
144                Article page = FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);
145
146                // If there are customized modifications to the page (or retrieval operations)
147                // that were requested, they are done here.
148
149                if (modifyOrRetrieve != null)
150                {
151                    // Retrieves the section-number and article-number from file-name
152                    Matcher m = P1.matcher(fn.toString());
153
154                    // These will be set to -1, and if the directoryName/fileName did not use the
155                    // standard "factory-generated" file-save, then these will STILL BE -1 when
156                    // passed to the modifier lambda.
157
158                    int sectionNum = -1;
159                    int articleNum = -1;
160
161                    if (m.find())
162                    {
163                        sectionNum = Integer.parseInt(m.group(1));
164                        articleNum = Integer.parseInt(m.group(2));
165                    }
166
167                    // pass the articleBody (and it's URL and filename) to the customized
168                    // HTML Modifier provided by the user who called this method
169
170                    modifyOrRetrieve.modifyOrRetrieve
171                        (page.articleBody, page.url, sectionNum, articleNum);
172                }
173
174                // We need to build a "Sub-Directory" name for the HTML page where the download
175                // images will be stored
176
177                int     dotPos      = fn.name.lastIndexOf(".");
178                String  outDirName  = outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/';
179
180                // Make sure the subdirectory exists.
181                new File(outDirName).mkdirs();
182
183                // This process may be skipped, but it makes the output HTML much cleaner and more
184                // readable for most Internet News Web-Sites.  Both <SCRIPT>, <!-- --> elements are
185                // removed.  Also, any "class" or "id" fields are eliminated.  This "cleaning" can
186                // be easily skipped
187
188                if (cleanIt)
189                {
190                    Util.Remove.scriptNodeBlocks(page.articleBody);
191                    Util.Remove.styleNodeBlocks(page.articleBody);
192                    Util.Remove.allCommentNodes(page.articleBody);
193                    Attributes.remove(page.articleBody, "class", "id");
194                }
195
196                if (log != null) log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n');
197
198                // 'Localize' any images available.  'localizing' an HTML web-page means downloading
199                // the image data, and saving it to disk.
200
201                ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName);
202
203                // If there were any images available, they were downloaded and localized.  The
204                // Write the (updated) HTML to an '.html' text-file.
205
206                FileRW.writeFile(Util.pageToString(page.articleBody), outDirName + "index.html");
207            }
208
209            // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that
210            // downloading does not "hang" the system by aborting image-downloads that take longer
211            // than 10 seconds.  It is necessary to shut-down these threads on system exit, because
212            // if they are not shutdown, when a java program terminates, the operating system that
213            // the program is using (the terminal window) will appear to "hang" or "freeze" until
214            // the extra-thread is shut-down by the JVM.  This delay can be upwards of 30 seconds.
215
216            catch (IOException ioe)
217                { ImageScraper.shutdownTOThreads(); throw ioe; }
218
219            catch (Exception e)
220            {
221                ImageScraper.shutdownTOThreads();
222
223                throw new IOException(
224                    "There was a problem converting the html pages.  See exception.getCause() " +
225                    "for more details.",
226                    e
227                );
228            }
229
230        // Exit the method.  Again, shutdown the Time-Out "monitor" thread.
231        ImageScraper.shutdownTOThreads();
232    }
233}