// ScrapeURLs.java.html

package Torello.HTML.Tools.NewsSite;

import java.util.*;
import java.io.*;
import java.util.stream.*;
import java.net.*;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import static Torello.Java.C.*;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

/**
 * Collects all <B>news-article {@code URL's}</B> from a news oriented web-site's main web-page
 * and from the list of 'sub-section' web-pages.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    // Static-only utility class: instances are never created.
    private ScrapeURLs() { }

    /**
     * This is a {@code static boolean} configuration field.  When this is set to {@code TRUE}, if
     * one of the {@code "Section URL's"} provided to this class is not valid, and generates a
     * {@code 404 FileNotFoundException}, or some other {@code HttpConnection} exception, those
     * exceptions will simply be logged, and quietly ignored.  The skipped section contributes an
     * <B>empty</B> {@code Vector} to the result, so that the returned list remains parallel to the
     * list of {@code Section URL's}.
     * 
     * <BR /><BR />When this {@code flag} is set to {@code FALSE}, any problems that can occur when
     * attempting to pick out News Article {@code URL's} from a {@code Section Web-Page} will cause
     * a {@code SectionURLException} to throw, and the {@code ScrapeURL's} process will halt.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">SIMPLY PUT:</SPAN></B> There are occasions when a
     * news web-site will remove a section such as "Commerce", "Sports", or "Travel".  If it is
     * better to just skip a section that suddenly goes missing, rather than halting the scrape,
     * keep this {@code flag} set to {@code TRUE}.
     * 
     * <BR /><BR /><B><SPAN STYLE="color: red;">ALSO:</SPAN></B> This is, indeed, a {@code public}
     * and {@code static flag} (field) which does mean that all processes ({@code Thread's}) using
     * {@code class ScrapeURLs} must share the same setting (simultaneously).  This particular
     * {@code flag} <I><B>CANNOT</B> be changed in a {@code Thread-Safe} manner.</I>
     */
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     *
     * @param ns The news-site whose section {@code URL's}, {@code URL}-filter and links-getter
     * are forwarded to the main {@code get(...)} method.
     *
     * @param log The log to which progress messages are appended.
     *
     * @return The value returned by {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     *
     * @throws IOException Declared by this signature, and retained so that existing callers
     * continue to compile.
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }

    /**
     * This method is used to retrieve <B>all</B> of the available article {@code URL} links found
     * on <B>all sections</B> of a newspaper website.
     *
     * @param sectionURLs This should be a vector of {@code URL's}, that has all of the
     * "Main News-Paper Page Sections."  Typical NewsPaper Sections are things like: Life, Sports,
     * Business, World, Economy, Arts, etc...  This parameter may not be null, or a
     * {@code NullPointerException} will throw.
     *
     * @param articleURLFilter If there is a standard pattern for a URL that must be avoided, then
     * this filter parameter should be used.  This parameter may be null, and if it is, it shall be
     * ignored.  This Java {@code URL-Predicate} (an instance of {@code Predicate<URL>}) should
     * return {@code TRUE} if a particular {@code URL} needs to be <B>kept</B>, not <B>filtered</B>.
     * When this {@code Predicate} evaluates to {@code FALSE} - <I>the {@code URL} will be
     * filtered</I>.
     *
     * <BR /><BR /><B>NOTE:</B> This behavior is identical to the Java Stream's method
     * {@code "filter(Predicate<>)".}
     *
     * <BR /><BR /><B>ALSO:</B> {@code URL's} that are filtered will neither be scraped, nor saved,
     * into the newspaper article result-set output file.
     *
     * @param linksGetter This method may be used to retrieve all links on a particular
     * section-page.  This parameter may be null.  If it is null, it will be ignored - <I>and all
     * HTML Anchor ({@code <A HREF=...>}) links will be considered "Newspaper Articles to be
     * scraped."</I>  Be careful about ignoring this parameter, because there may be many
     * extraneous non-news-article links on a particular Internet News WebSite or inside a 
     * Web-Page Section.
     *
     * @param log This receives the log information.  This parameter may not be null,
     * or a {@code NullPointerException} will throw.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return The "{@code Vector} of {@code Vector's}" that is returned is simply a list of
     * all newspaper anchor-link {@code URL's} found on each Newspaper Sub-Section {@code URL}
     * passed to the {@code 'sectionURLs'} parameter.  The returned "{@code Vector} of
     * {@code Vector's}" is parallel to the input-parameter {@code Vector<URL> Section-URL's}.
     * 
     * <BR /><BR />What this means is that the Newspaper-Article {@code URL}-Links scraped from
     * the page located at {@code sectionURLs.elementAt(0)} - will be stored in the
     * return-{@code Vector} at {@code ret.elementAt(0).}
     *
     * <BR /><BR />The article {@code URL's} scraped off of
     * page {@code URL} from {@code sectionURLs.elementAt(1)} will be stored in the 
     * return-{@code Vector} at {@code ret.elementAt(1)}.  <I><B>And so on, and so 
     * forth...</B></I>
     *
     * <BR /><BR />A section that was skipped (see {@link #SKIP_ON_SECTION_URL_EXCEPTION}) is
     * represented by an <B>empty</B> {@code Vector} at that section's index.
     * 
     * @throws SectionURLException If one of the provided {@code sectionURL's} (Life, Sports, 
     * Travel, etc...) is not valid, or not available on the page then this exception will throw.
     * Note, though, however there is a {@code flag} ({@link #SKIP_ON_SECTION_URL_EXCEPTION}) that
     * will force this method to simply "skip" a faulty or non-available {@code Section URL}, and
     * move on to the next news-article section.
     * 
     * <BR /><BR />By default, this {@code flag} is set to {@code TRUE}, meaning that this method
     * will skip news-paper sections that have been temporarily removed rather than causing the
     * method to exit.  This default behavior can be changed by setting the {@code flag}
     * {@code FALSE}.
     *
     * @throws NullPointerException If either {@code sectionURLs} or {@code log} is null.
     */
    public static Vector<Vector<String>> get(
        Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
        Appendable log
    )
    {
        // Fail fast with the documented NullPointerException.  Without this check, a null 'log'
        // would raise its NPE inside LOG_WRITE's try-block, where it is caught and turned into a
        // JVM exit.
        Objects.requireNonNull(sectionURLs, "sectionURLs");
        Objects.requireNonNull(log, "log");

        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" + 
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" + 
            RESET + '\n'
        );

        Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // It helps to run this, because web-pages can use a lot of strings
            System.gc();

            // Starting Scraping the Section for URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, Scrape & Parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);

                // If the 'LinksGet' instance is null, then select all URL's on the main-page /
                // section-page, and hope for the best.  If no 'LinksGet' instance was provided,
                // there will likely be many spurious / irrelevant links to non-article pages,
                // and even advertisement pages that are also included in this Stream<String>.
                //
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>, where
                // each 'String' in the 'Stream' is the HREF attribute of the <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else 
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }
            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");

                    // Keep the result parallel to 'sectionURLs': a skipped section still
                    // occupies its index, with an empty list of article URL's.
                    ret.add(new Vector<String>());

                    continue;
                }

                LOG_WRITE(
                    log,
                    BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                    RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                );

                throw new SectionURLException
                    ("Invalid Main Section URL: " + sectionURL.toString(), e);
            }

            Vector<String> sectionArticleURLs =
                collectArticleURLs(urlStream, sectionURL, articleURLFilter, log);

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }

        // Provide a simple count to the log output on how many URL's have been uncovered.
        // NOTE: This does not heed whether different sections contain non-distinct URL's.
        //       (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // <?> Prevents the "Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible Article URL list has: " + 
            BYELLOW + StringParse.zeroPad10e4(totalURLs) + RESET + ' ' +
            "url(s).\n\n"
        );

        return ret;
    }

    // Turns the raw HREF values found on one section page into the list of distinct, resolved,
    // filter-approved article URL's (as String's) for that section.
    private static Vector<String> collectArticleURLs(
        Stream<String> hrefs, URL sectionURL, URLFilter articleURLFilter, Appendable log
    )
    {
        return hrefs

            // If any TagNode's did not have HREF-Attributes, remove those null-values
            .filter ((String href) -> (href != null))

            // Perform a Standard String.trim() operation.
            .map    ((String href) -> href.trim())

            // Any HREF's that are "just white-space" are now removed.
            .filter ((String href) -> href.length() > 0)

            // This removes any HREF Attribute values that begin with
            // "mailto:" "tel:" "javascript:" "magnet:" etc...
            .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

            // Now, Resolve any "Partial URL References"
            .map ((String href) -> Links.resolve_KE(href, sectionURL))

            // If there were any exceptions while performing the Partial-URL Resolve-Operation,
            // then print an error message.  (It is the article link that failed to resolve here;
            // the section page itself has already been downloaded.)
            .peek ((Ret2<URL, MalformedURLException> r2) ->
            {
                if (r2.b != null) LOG_WRITE(
                    log,
                    "Article link was a malformed URL, and provided exception message:\n" +
                    r2.b.getMessage() + '\n'
                );
            })

            // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
            .map ((Ret2<URL, MalformedURLException> r2) -> r2.a)

            // If there was an exception, the URL Ret.a field would be null (remove nulls)
            .filter ((URL url) -> url != null)

            // NOTE: When this evaluates to TRUE - it should be kept
            // Java Stream's 'filter' method states that when the predicate evaluates to TRUE,
            // the stream element is KEPT / RETAINED.
            // 
            // Class URLFilter mimics the filter behavior of Streams.filter(...)

            .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))

            // Convert these to "Standard Strings"
            //      Case-Insensitive parts are set to LowerCase
            //      Case Sensitive Parts are left alone.

            .map ((URL url) -> URLs.urlToString(url))

            // Filter any duplicates -> This is the reason for the above case-sensitive parts
            // being separated.

            .distinct()

            // Verify that each String still parses as a URL.  There really should not be any
            // exceptions.  This is just an "extra-careful" step.  It is not needed.

            .filter ((String url) ->
                { try { new URL(url); return true; } catch (Exception e) { return false; } })

            // Convert the Stream to a Vector
            .collect(Collectors.toCollection(Vector::new));
    }

    // This is necessary because Java's 'java.lang.Appendable' permits an IOException.
    // Any exception thrown by the log is treated as fatal: it is reported on standard-output,
    // and the JVM is then terminated.
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
            { log.append(s); }

        catch (Exception e)
        {
            // Exceptions are allowed to carry a null message; substitute the class-name so that
            // this error report cannot itself fail on a null value.
            String message = e.getMessage();

            if (message == null) message = e.getClass().getName();

            System.out.println(
                "While trying to write to the log, an exception occurred\n" +
                "the java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(message, 4) + '\n' +
                "Unfortunately, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}