package Torello.HTML.Tools.NewsSite;

import java.util.*;
import java.io.*;
import java.util.stream.*;
import java.net.*;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import static Torello.Java.C.*;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

/**
 * Collects all <B>news-article {@code URL's}</B> from a news-oriented web-site's main web-page
 * and from its list of 'sub-section' web-pages.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    private ScrapeURLs() { }

    /**
     * This is a {@code static boolean} configuration field.  When this is set to {@code TRUE},
     * if one of the {@code "Section URL's"} provided to this class is not valid, and generates
     * a {@code 404 FileNotFoundException}, or some other {@code HttpConnection} exception,
     * those exceptions will simply be logged, and quietly ignored.
     *
     * <BR /><BR />When this {@code flag} is set to {@code FALSE}, any problem that occurs
     * while attempting to pick out News Article {@code URL's} from a {@code Section Web-Page}
     * will cause a {@code SectionURLException} to throw, and the {@code ScrapeURLs} process
     * will halt.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">SIMPLY PUT:</SPAN></B> There are occasions when
     * a news web-site will remove a section such as "Commerce", "Sports", or "Travel" - and
     * when or if one of these suddenly goes missing, it is better to skip that section than to
     * halt the entire scrape.  For that reason, keep this {@code flag} set to {@code TRUE}.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">ALSO:</SPAN></B> This is, indeed, a
     * {@code public} and {@code static flag} (field), which means that all processes
     * ({@code Thread's}) using {@code class ScrapeURLs} must share the same setting
     * (simultaneously).  This particular {@code flag} <I><B>CANNOT</B> be changed in a
     * {@code Thread-Safe} manner.</I>
     */
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }

    /**
     * This method is used to retrieve <B>all</B> of the available article {@code URL} links
     * found on <B>all sections</B> of a newspaper website.
     *
     * @param sectionURLs This should be a {@code Vector} of {@code URL's} containing all of
     * the "Main News-Paper Page Sections."  Typical NewsPaper Sections are things like: Life,
     * Sports, Business, World, Economy, Arts, etc...  This parameter may not be null, or a
     * {@code NullPointerException} will throw.
     *
     * @param articleURLFilter If there is a standard pattern for a {@code URL} that must be
     * avoided, then this filter parameter should be used.  This parameter may be null, and if
     * it is, it shall be ignored.  This Java {@code URL-Predicate} (an instance of
     * {@code Predicate<URL>}) should return {@code TRUE} if a particular {@code URL} needs to
     * be <B>kept</B>, not <B>filtered</B>.
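     *
     * <BR /><BR />For instance - as a sketch only, and assuming {@code URLFilter} may be
     * written as a lambda - a filter that keeps just those links whose path contains a
     * purely hypothetical article-marker would look like:
     *
     * <BR /><BR /><PRE>{@code URLFilter filter = (URL url) -> url.getPath().contains("/article/");}</PRE>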
     * <BR /><BR />When this {@code Predicate} evaluates to {@code FALSE} - <I>the {@code URL}
     * will be filtered</I>.
     *
     * <BR /><BR /><B>NOTE:</B> This behavior is identical to the Java {@code Stream} method
     * {@code filter(Predicate)}.
     *
     * <BR /><BR /><B>ALSO:</B> {@code URL's} that are filtered will neither be scraped nor
     * saved into the newspaper article result-set output file.
     *
     * @param linksGetter This instance may be used to retrieve all links on a particular
     * section-page.  This parameter may be null.  If it is null, it will be ignored - <I>and
     * all HTML Anchor ({@code <A HREF=...>}) links will be considered "Newspaper Articles to
     * be scraped."</I>  Be careful about ignoring this parameter, because there may be many
     * extraneous non-news-article links on a particular Internet News WebSite or inside a
     * Web-Page Section.
     *
     * @param log Log information is appended to this parameter as the scrape proceeds.  This
     * parameter may not be null, or a {@code NullPointerException} will throw.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return The "{@code Vector} of {@code Vector's}" that is returned is simply a list of
     * all newspaper anchor-link {@code URL's} found on each Newspaper Sub-Section {@code URL}
     * passed to the {@code 'sectionURLs'} parameter.  The returned "{@code Vector} of
     * {@code Vector's}" is parallel to the input-parameter {@code Vector<URL> Section-URL's}.
     *
     * <BR /><BR />What this means is that the Newspaper-Article {@code URL}-Links scraped from
     * the page located at {@code sectionURLs.elementAt(0)} - will be stored in the
     * return-{@code Vector} at {@code ret.elementAt(0)}.
     *
     * <BR /><BR />The article {@code URL's} scraped off of page {@code URL} from
     * {@code sectionURLs.elementAt(1)} will be stored in the return-{@code Vector} at
     * {@code ret.elementAt(1)}.  <I><B>And so on, and so forth...</B></I>
     *
     * @throws SectionURLException If one of the provided {@code sectionURL's} (Life, Sports,
     * Travel, etc...) is not valid, or is no longer available, then this exception will throw.
     * Note, however, that there is a {@code flag} ({@link #SKIP_ON_SECTION_URL_EXCEPTION})
     * that will force this method to simply "skip" a faulty or non-available
     * {@code Section URL}, and move on to the next news-article section.
     *
     * <BR /><BR />By default, this {@code flag} is set to {@code TRUE}, meaning that this
     * method will skip news-paper sections that have been temporarily removed rather than
     * causing the method to exit.  This default behavior can be changed by setting the
     * {@code flag} to {@code FALSE}.
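     *
     * <BR /><BR />Below is a minimal invocation sketch.  The section {@code URL's} used here
     * are hypothetical, and {@code System.out} (a {@code PrintStream}, which implements
     * {@code java.lang.Appendable}) serves as the log.  Assume a surrounding method that
     * declares {@code throws Exception}:
     *
     * <BR /><BR /><PRE>{@code
     * Vector<URL> sections = new Vector<>();
     * sections.add(new URL("https://news.example.com/sports/"));
     * sections.add(new URL("https://news.example.com/world/"));
     * 
     * // Null filter & null LinksGet: every <A HREF=...> anchor-link found is kept
     * Vector<Vector<String>> articleURLs = ScrapeURLs.get(sections, null, null, System.out);
     * 
     * // Parallel Vector's: articleURLs.elementAt(0) holds the links scraped from 'sports'
     * }</PRE>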
     */
    public static Vector<Vector<String>> get(
        Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
        Appendable log
    )
    {
        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // It helps to run this, because web-pages can allocate a lot of Strings
            System.gc();

            // Start scraping the section for URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, scrape & parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);

                // If the 'LinksGet' instance is null, then select all URL's on the main-page /
                // section-page, and pray for rain (hope for the best).  If no 'LinksGet'
                // instance was provided, there will likely be many spurious / irrelevant links
                // to non-article pages, and even advertisement pages, that are also included
                // in this Stream<String>.
                //
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>,
                // where each 'String' in the 'Stream' is the HREF attribute of the
                // <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }
            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");
                    continue;
                }
                else
                {
                    LOG_WRITE(
                        log,
                        BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                        RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                    );

                    throw new SectionURLException
                        ("Invalid Main Section URL: " + sectionURL.toString(), e);
                }
            }

            Vector<String> sectionArticleURLs = urlStream

                // If any TagNode's did not have HREF-Attributes, remove those null-values
                .filter ((String href) -> (href != null))

                // Perform a standard String.trim() operation.
                .map    ((String href) -> href.trim())

                // Any HREF's that are "just white-space" are now removed.
                .filter ((String href) -> href.length() > 0)

                // This removes any HREF Attribute-values that begin with
                // "mailto:" "tel:" "javascript:" "magnet:" etc...
                .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

                // Now, resolve any "Partial URL References"
                .map    ((String href) -> Links.resolve_KE(href, sectionURL))

                // If there were any exceptions while performing the Partial-URL
                // Resolve-Operation, then print an error message.
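                //
                // NOTE (Ret2 fields): 'r2.a' holds the resolved URL, and 'r2.b' holds any
                // MalformedURLException.  On a failed resolve, 'a' is null and 'b' is non-null.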
                .peek   ((Ret2<URL, MalformedURLException> r2) ->
                {
                    if (r2.b != null) LOG_WRITE(
                        log,
                        "Scraped HREF did not resolve to a valid URL, exception message:\n" +
                        r2.b.getMessage() + '\n'
                    );
                })

                // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
                .map    ((Ret2<URL, MalformedURLException> r2) -> r2.a)

                // If there was an exception, the Ret2.a field would be null (remove nulls)
                .filter ((URL url) -> url != null)

                // NOTE: When this evaluates to TRUE - the element should be kept.
                // Java Stream's 'filter' method states that when the predicate evaluates to
                // TRUE, the stream element is KEPT / RETAINED.
                //
                // Class URLFilter mimics the filter behavior of Stream.filter(...)

                .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))

                // Convert these to "Standard Strings":
                // case-insensitive parts are set to lower-case,
                // case-sensitive parts are left alone.

                .map    ((URL url) -> URLs.urlToString(url))

                // Remove duplicates.  (The case-normalization above is what allows URL's that
                // differ only in their case-insensitive parts to compare as equal here.)

                .distinct()

                // Sanity-check that each String still parses as a valid URL.  There really
                // should not be any exceptions here - this is just an "extra-careful" step,
                // and is not strictly needed.

                .filter ((String url) ->
                { try { new URL(url); return true; } catch (Exception e) { return false; } })

                // Convert the Stream to a Vector
                .collect(Collectors.toCollection(Vector::new));

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }

        // Provide a simple count to the log output of how many URL's have been uncovered.
        // NOTE: This does not check whether different sections contain non-distinct URL's.
        // (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // The <?> wild-card prevents "-Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible Article URL list has: " +
            BYELLOW + StringParse.zeroPad10e4(totalURLs) + RESET + ' ' +
            "url(s).\n\n"
        );

        return ret;
    }

    // This is necessary because Java's 'java.lang.Appendable.append' permits an IOException
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
        { log.append(s); }

        catch (Exception e)
        {
            System.out.println(
                "While trying to write to the log, an exception occurred.\n" +
                "The java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(e.getMessage(), 4) + '\n' +
                "Unfortunately, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}
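
/*
 * Usage note: java.lang.StringBuilder implements Appendable, and StringBuilder.append never
 * throws an IOException - so it makes a log-target that cannot trip the fatal-exit branch in
 * LOG_WRITE above.  A minimal sketch, assuming a (hypothetical) NewsSite instance 'site', and
 * a surrounding method that declares 'throws IOException':
 *
 *     StringBuilder log = new StringBuilder();
 *     Vector<Vector<String>> articleURLs = ScrapeURLs.get(site, log);
 *     System.out.print(log);
 */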