Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.HTML.NodeSearch.*;
005import Torello.Java.*;
006
007import Torello.Java.Additional.Ret4;
008
009import static Torello.Java.C.*;
010
011import java.util.*;
012import java.io.*;
013
014import java.net.URL;
015import java.util.concurrent.TimeUnit;
016
017/**
018 * This class runs the primary iteration-loop for downloading news-articles using a list of
019 * article-{@code URL's}.
020 * 
021 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_ARTICLES>
022 */
023@Torello.JavaDoc.StaticFunctional
024public class ScrapeArticles
025{
026    private ScrapeArticles() { }
027
028    private static final String STARS =
029        "*********************************************" +
030        "********************************************\n";
031
032    /**
033     * This is used to do the downloading of newspaper articles.
034     *
035     * @param articleReceiver This is an instance of {@code ScrapedArticleReceiver}.  Whenever an
036     * {@code Article} has successfully downloaded, it will be passed to this 'receiver' class.  
037     * There is a pre-written, standard {@code ScrapedArticleReceiver} that writes to a directory
038     * on the file-system as {@code Article's} are downloaded.  If there is a need to transmit
039     * downloaded {@code Article's} elsewhere, implement that  {@code interface}, and provide an
040     * instance of it to this parameter.
041     *
042     * @param articleURLs this is a parameter that should have been generated by a call to method:
043     * {@code ScrapeURLs.getArticleURLs(...)}
044     *
045     * @param articleGetter This is basically a "Post-Processor" for HTML Web-based newspaper 
046     * articles. This parameter cannot be null.  It is just a simple, one-line, lambda-predicate
047     * which needs to be implemented by the programmer.  Internet news websites (such as: 
048     * {@code news.yahoo.com, cnn.com}, and {@code gov.cn}) have News-Articles on pages that 
049     * contain a lot of extraneous and advertising links and content.  This parameter needs to
050     * extract the {@code Article}-body content from the rest of the page.  <I>This is usually 
051     * very trivial, but it is also mandatory.</I>  Read about the  {@code class ArticleGet} for
052     * more information about extracting the news-content from a Newspaper {@code Article}
053     * web-page.
054     *
055     * @param skipArticlesWithoutPhotos This may be {@code TRUE}, and if it is - articles that
056     * contain only textual content will be skipped.  This can be useful for foreign-news sources
057     * where the reader is usually working-harder to understand the content in the first place.
058     * This class is primarily used with foreign-news content websites.  As such, staring at 
059     * pages of Mandarin Chinese or Spanish is usually a lot easier if there is at least one 
060     * photo on the page.  This parameter allows users to skip highly dense articles that do not
061     * contain at least one picture.
062     *
063     * @param bannerAndAdFinder This parameter may be null, but if it is not, it will be used to
064     * skip banner-advertisement images.  This parameter, in reality, does very little.  It
065     * will not actually be used to eliminated advertising images - <I>but rather only to identify
066     * when an image is a banner, advertisement, or spurious picture</I>.  Since this is a news
067     * web-site scraping Java Package, there is a part that allows a user to require that only news
068     * paper articles that contain a photo be downloaded - and the real purpose of including the
069     * {@code 'bannerAndAdFinder'} is to allow the scrape mechanism to 'skip' articles whose only
070     * photos are advertisements.
071     * 
072     * <BR /><BR /><B>NOTE:</B> Again, the primary impetus for developing these tools was for 
073     * scraping and translating news articles from foreign countries like Spain, China, and parts
074     * of South America.  It could be used for any news-source desired.  When reading foreign
075     * language text - it helps "a little bit more" to see a picture.  This parameter is solely 
076     * used for that purpose.
077     * 
078     * <BR /><BR /><B>PRODUCT ADVERTISEMENTS &amp; FACEBOOK / TWITTER LINKS:</B> Removing actual
079     * links about "pinning to Reddit.com" or "Tweeting" articles can be done using either:
080     *
081     * <BR /><BR /><UL CLASS=JDUL>
082     * <LI> {@link ArticleGet} - Writing an instance of {@code ArticleGet} that <B>NOT ONLY</B> 
083     *      extracts the body of a newspaper-article, <B>BUT ALSO</B> performs HTML cleanup using
084     *      the {@code 'Remove'} method of the NodeSearch Package.
085     * </LI>
086     * <LI> {@link HTMLModifier} - Writing a "cleaner" version of the {@code HTMLModifier} lambda
087     *      expression / {@code Function Interface} can also use the NodeSearch classes for
088     *      removing annoying commercials - or buttons about "Sharing a link on Facebook."  The
089     *      class {@link ToHTML} provides a window for accepting an instance of
090     *      {@code HTMLModifier} when converting the generated serialized-data HTML
091     *      {@code Vector's} into {@code '.html' index} files.
092     * </LI>
093     * </UL>
094     *
095     * @param keepOriginalPageHTML When this is {@code TRUE}, the original page html will be stored
096     * in the result set.  When this is {@code FALSE} null shall be stored in place of the original
097     * page data.
098     *
099     * <BR /><BR /><B>NOTE:</B> The original page HTML is the source HTML that is fed into the
100     * {@code ArticleGet} lambda.  It contains the "pre-processed HTML."
101     *
102     * @param pause If there are many / numerous articles to download, pass an instance of
103     * {@code class Pause}, and intermediate progress can be saved, and reloaded at a later time.
104     *
105     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
106     * As articles are downloaded, notices shall be posted to this {@code 'log'} by this method.
107     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
108     *
109     * @return A {@code Vector} that is <B><I>exactly</B></I> parallel to the input
110     * {@code Vector<Vector<String>> articleURLs} will be returned.  Each element of each of the
111     * sub-{@code Vector's} in this two-dimensional {@code Vector} will have an instance of the
112     * enumerated-type {@code 'DownloadResult'}.  The constant-value in {@code 'DownloadResult'}
113     * will identify whether or not the {@code Article} pointed to by the {@code URL} at that
114     * {@code Vector}-location successfully downloaded.
115     * <BR /><BR />If the download failed, then the value of the {@code enum 'DownloadResult'} 
116     * will be able to identify the error that occurred when attempting to scrape a particular
117     * news-story {@code URL} 
118     *
119     * @throws PauseException If there is an error when attempting to save the download state.
120     *
121     * @throws ReceiveException If there are any problems with the {@code ScrapedArticleReceiver}
122     * <BR /><BR /><B>NOTE:</B> A {@code ReceiveException} implies that the user's code has failed
123     * to properly handle or save an instance of {@code Article} that has downloaded, successfully,
124     * by this {@code class ScrapeArticles}.  A {@code ReceiveException} will halt the download
125     * process immediately, and download state will be saved if the user has provided a reference
126     * to the {@code Pause} parameter.
127     *
128     * <BR /><BR /><B>NOTE:</B> Other internally caused download-exceptions will be handled and
129     * logged (<I>without halting the entire download-process</I>) - and downloading will continue.  
130     * A note about the internally-produced exception will be printed to the log, and an 
131     * appropriate instance of {@code enum DownloadResult} will be put in the return
132     * {@code Vector}.
133     *
134     * @throws IOException This exception is required for any method that uses Java's
135     * {@code interface java.lang.Appendable}.  Here, the {@code 'Appendable'} is the log, and if
136     * writing to this user provided {@code 'log'} produces an exception, then download progress
137     * will halt immediately, and download state will be saved if the user has provided a reference
138     * to the {@code Pause} parameter.
139     */
140    public static Vector<Vector<DownloadResult>> download(   
141        ScrapedArticleReceiver  articleReceiver,
142        Vector<Vector<String>>  articleURLs,
143        ArticleGet              articleGetter,
144        boolean                 skipArticlesWithoutPhotos,
145        StrFilter               bannerAndAdFinder,   
146        boolean                 keepOriginalPageHTML,
147        Pause                   pause,
148        Appendable              log
149    )
150        throws PauseException, ReceiveException, IOException
151    {
152        log.append(
153            "\n" + BRED + STARS + STARS +
154            RESET + " Downloading Articles" + BRED + "\n" +
155            STARS + STARS + RESET + '\n'
156        );
157
158        // The loop variables, and the return-result Vector.
159        int                             outerCounter    = 0;
160        int                             innerCounter    = 0;
161        int                             successCounter  = 0;
162        boolean                         firstIteration  = true;
163        Vector<Vector<DownloadResult>>  ret             = null;
164        URL                             url             = null;
165        Runtime                         rt              = Runtime.getRuntime();
166
167        // If the user has passed an instance of 'pause' then it should be loaded from disk.
168        if (pause != null)
169        {
170            Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> r = pause.loadState();
171
172            ret             = r.a;
173            outerCounter    = r.b.intValue();
174            innerCounter    = r.c.intValue();
175            successCounter  = r.d.intValue();
176        }
177
178        // If the user did not provide a "Pause" mechanism, **OR** the "Pause Mechanism" asserts
179        // that the download process is starting from the beginning of the article-URL Vector,
180        // THEN a *new vector* should be built.
181        if (    (pause == null)
182            ||  ((outerCounter == 0) && (innerCounter == 0) && (successCounter == 0))
183        )
184        {
185            // Need to instantiate a brand new return vector.  The downloader is starting over
186            // at the beginning of the Article URL list.
187
188            ret = new Vector<>(articleURLs.size());
189
190            // Initializes the capacity (sizes) of the two-dimensional "Return Vector."
191            //
192            // NOTE: The return Vector is exactly parallel to the input "articleURLs"
193            //       two-dimensional input Vector.
194
195            for (int i=0; i < articleURLs.size(); i++) 
196                ret.add(new Vector<DownloadResult>(articleURLs.elementAt(i).size()));
197        }
198
199        for (; outerCounter < articleURLs.size(); outerCounter++)
200        {
201            // System.out.println("outerCounter=" + outerCounter + ", innerCounter=" +
202            //      innerCounter + ", articleURLs.size()=" + articleURLs.size());
203
204            // System.out.println("articleURLs.elementAt(" + outerCounter + ").size()=" +
205            //      articleURLs.elementAt(outerCounter).size());
206
207            for (   innerCounter = (firstIteration ? innerCounter : 0);
208                    innerCounter < articleURLs.elementAt(outerCounter).size();
209                    innerCounter++
210                )
211
212                try
213                {
214                    firstIteration = false;
215                    String urlStr = articleURLs.elementAt(outerCounter).elementAt(innerCounter);
216
217                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
218                    // Instantiate the URL object from the URLStr String.
219                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
220
221                    // Should never happen, because each URL will have been alredy tested 
222                    // and instantiated in the previous method.
223
224                    try
225                        { url = new URL(urlStr); }
226
227                    catch (Exception e)
228                    {
229                        log.append
230                            ("Could not instantiate URL-String into URL for [" + urlStr + "].\n");
231
232                        ret.elementAt(outerCounter).add(DownloadResult.BAD_ARTICLE_URL);
233                        continue;
234                    }
235
236
237                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
238                    // Run the Garbage Collector, Print Article URL and Number to log.
239                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
240
241                    rt.gc();
242                    String              freeMem         = StringParse.commas(rt.freeMemory());
243                    String              totalMem        = StringParse.commas(rt.totalMemory());
244
245                    log.append(
246                        "\nVisiting URL: [" +
247                        YELLOW +  StringParse.zeroPad10e4(outerCounter) + RESET + 
248                        " of " + StringParse.zeroPad10e4(articleURLs.size()) + ", " +
249                        YELLOW +  StringParse.zeroPad10e4(innerCounter) + RESET + 
250                        " of " + StringParse.zeroPad10e4
251                            (articleURLs.elementAt(outerCounter).size()) + "] " +
252                        CYAN         + " - "  + url                       + RESET + '\n' +
253                        "Available Memory: "    + YELLOW +  freeMem       + RESET + '\t' +
254                        "Total Memory: "        + YELLOW +  totalMem      + RESET + '\n'
255                    );
256
257
258                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
259                    // Scrape the web-page
260                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
261
262                    int                 retryCount      = 0;
263                    Vector<HTMLNode>    page            = null;
264
265                    while ((page == null) && (retryCount < 5))
266
267                        try
268                            { page = HTMLPageMWT.getPageTokens(15, TimeUnit.SECONDS, url, false); }
269    
270                        catch (Exception e)
271                        {
272                            log.append(HTTPCodes.convertMessageVerbose(e, url, 1) + '\n');
273                            retryCount++;
274                        }
275
276
277                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
278                    // Verify the results of scraping the web-page
279                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
280
281                    if (page == null)
282                    {
283                        log.append(
284                            BRED + "\tArticle could not download, max 5 retry counts." +
285                            RESET + '\n'
286                        );
287
288                        ret.elementAt(outerCounter).add(DownloadResult.COULD_NOT_DOWNLOAD);
289                        continue;
290                    }
291
292                    if (page.size() == 0)
293                    {
294                        log.append(
295                            BRED + "\tArticle was retrieved, but page-vector was empty" +
296                            RESET + '\n'
297                        );
298
299                        ret.elementAt(outerCounter).add(DownloadResult.EMPTY_PAGE_VECTOR);
300                        continue;
301                    }
302
303                    log.append
304                        ("\tPage contains (" + YELLOW + page.size() + RESET + ") HTMLNodes.\n");
305
306
307                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
308                    // Retrieve the <TITLE> element (as a String) from the page - if it has one.
309                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
310
311                    String title = Util.textNodesString(TagNodeGetInclusive.first(page, "title"));
312
313                    if (title.length() > 0)
314                        log.append
315                            ("\tPage <TITLE> element is: " + YELLOW + title + RESET + '\n');
316
317                    else
318                        log.append("\tPage has no <TITLE> element, or it was empty.\n");
319
320
321                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
322                    // Use the Article-Getter to get the Article-Body.  Watch for Exceptions.
323                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
324
325                    Vector<HTMLNode> article = null;
326
327                    // The function-pointer (FunctionInterface) 'articleGetter' is supposed to 
328                    // locate and extract the Article's HTML from the surrounding web-page, which
329                    // is usually fully-loaded with advertisements, and "See This Also" links.
330                    //
331                    // All news-websites I have seen wrap the article itself in an HTML <MAIN>
332                    // <ARTICLE>, <SECTION role='article'> or a <DIV CLASS='main'> tag
333                    // that is very easy to find.  Also, these tags differ from site-to-site, each
334                    // site will use the same tag for all of its articles.
335                    //
336                    // (But you have to look at the HTML first)
337
338                    try
339                        { article = articleGetter.apply(url, page); }
340
341                    catch (ArticleGetException e)
342                    {
343                        log.append(
344                            BRED + "\tArticleGet.apply(...) failed: " + e.getMessage() +
345                            RESET + "\nException Cause Chain:\n" + EXCC.toString(e) + '\n'
346                        );
347
348                        ret.elementAt(outerCounter).add(DownloadResult.ARTICLE_GET_EXCEPTION);
349                        continue;
350                    }
351
352
353                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
354                    // Verify the results of article get, and choose the right DownloadResult
355                    // Enumerated-Constant if the download failed
356                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
357
358                    if (article == null)
359                    {
360                        log.append(
361                            BRED + "\tContent-body not found by ArticleGet.apply(...)\n" +
362                            RESET
363                        );
364
365                        ret.elementAt(outerCounter).add(DownloadResult.ARTICLE_GET_RETURNED_NULL);
366                        continue;
367                    }
368
369                    if (article.size() == 0)
370                    {
371                        log.append(
372                            BRED + "\tContent-body not found by ArticleGet.apply(...)\n" +
373                            RESET
374                        );
375
376                        ret.elementAt(outerCounter)
377                            .add(DownloadResult.ARTICLE_GET_RETURNED_EMPTY_VECTOR);
378                        continue;
379                    }
380
381                    log.append(
382                        "\tArticle body contains (" + YELLOW + article.size() + RESET +
383                        ") HTMLNodes.\n"
384                    );
385
386
387                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
388                    // Retrieve the positions of the images
389                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
390
391                    // The Vector-index location of all the images inside the article-body
392                    int[] imagePosArr = InnerTagFind.all(article, "img", "src",
393                        (String src) -> ! StrCmpr.startsWithXOR_CI(src.trim(), "data:"));
394
395                    // A list of all the image-URL's that were extracted from the article-body
396                    // using the integer-array aquired in the previous line.
397                    Vector<URL> imageURLs = Links.resolveSRCs(article, imagePosArr, url);
398
399                    if (skipArticlesWithoutPhotos && (imageURLs.size() == 0))
400                    {
401                        log.append(
402                            BRED + "\tArticle content contained 0 HTML IMG elements" + RESET +
403                            '\n'
404                        );
405
406                        ret.elementAt(outerCounter).add(DownloadResult.NO_IMAGES_FOUND);
407                        continue;
408                    }
409
410                    log.append(
411                        "\tArticle contains (" + YELLOW + imageURLs.size() + RESET + ") " +
412                        "image TagNodes.\n"
413                    );
414
415
416                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
417                    // Check the banner-situation.  Count all images, and less that number by the
418                    // number of "banner-images"
419                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
420
421                    // IMPORTANT NOTE: THIS ISN'T ALWAYS USEFUL OR USEABLE...  IT IS **SOMETIMES**
422                    // USEFUL
423
424                    int imageCount = imageURLs.size();
425
426                    if (bannerAndAdFinder != null)
427
428                        for (int pos : imagePosArr)
429
430                            if (bannerAndAdFinder
431                                .test(((TagNode) article.elementAt(pos)).AV("src"))
432                            )
433                                imageCount--;
434
435                    if (skipArticlesWithoutPhotos && (imageCount == 0))
436                    {
437                        log.append(
438                            BRED + "\tAll images inside article were banner images" +
439                            RESET + '\n'
440                        );
441
442                        ret.elementAt(outerCounter)
443                            .add(DownloadResult.NO_IMAGES_FOUND_ONLY_BANNERS);
444
445                        continue;
446                    }
447
448                    if (bannerAndAdFinder != null)
449
450                        log.append(
451                            "\tArticle contains (" + YELLOW + imageCount + RESET + ") " +
452                            "non-banner image TagNodes.\n"
453                        );
454
455
456                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
457                    // Write the results to the output file
458                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
459
460                    Article articleResult = new Article(
461                        url, title, (keepOriginalPageHTML ? page : null), article, imageURLs,
462                        imagePosArr
463                    );
464
465                    // The article was successfully downloaded and parsed.  Send it to the
466                    // "Receiver" and add DownloadResult to the return vector.
467
468                    log.append(
469                        GREEN + "ARTICLE LOADED." + RESET +
470                        "  Sending to ScrapedArticleReceiver.\n"
471                    );
472
473                    articleReceiver.receive(articleResult, outerCounter, innerCounter);
474                    ret.elementAt(outerCounter).add(DownloadResult.SUCCESS);
475
476                    successCounter++;
477
478                }
479                catch (ReceiveException re)
480                {
481                    // NOTE: If there was a "ReceiveException" then article-downloading must be
482                    //       halted immediately.  A ReceiveException implies that the user did not
483                    //       properly handle the downloaded Article, and the user's code would have
484                    //       to be debugged.
485
486                    log.append(
487                        "There was an error when attempting to pass the downloaded article to " +
488                        "the ArticleReceiver.  Unrecoverable.  Saving download state, and " +
489                        "halting download.\n"
490                    );
491
492                    // Make sure to save the internal download state                        
493                    if (pause != null)
494                        pause.saveState(ret, outerCounter, innerCounter, successCounter);
495
496                    // Make sure to stop the download process now.  If the article "Receiver"
497                    // failed to save or store a received-article, there is NO POINT IN CONTINUING
498                    // THE DOWNLOADER.
499                    //
500                    // NOTE: This will cause the method to exit with error, make sure to stop the
501                    //       "MWT Thread" Remember, this is just a simple "Monitor Thread" that 
502                    //       prevents a download from hanging.
503
504                    HTMLPageMWT.shutdownMWTThreads();
505
506                    throw re;
507                }
508                catch (IOException ioe)
509                {
510                    // This exception occurs if writing the "Appendable" (the log) fails.  If this
511                    // happens, download should halt immediately, and the internal-state should be
512                    // saved to the 'pause' variable.
513
514                    if (pause != null)
515                        pause.saveState(ret, outerCounter, innerCounter, successCounter);
516
517                    // Need to stop the download process.  IOException could ONLY BE the result of
518                    // the "Appendable.append" method.  None of the other stuff throws IOException.
519                    //
520                    // ALSO: If the "Appendable" never fails (which is 99% likely not to happen),
521                    // This catch-statement will never actually execute.  However, if Appendable
522                    // did, in fact, fail to write - then downloading cannot continue;
523                    //
524                    // NOTE: This will cause the method to exit with error, make sure to stop the
525                    //       HTMLPage's "MWT Thread" (It is a simple "Monitor Thread") that 
526                    //       can be used to prevent the download from hanging.
527                    //       HOWEVER, it will also cause the JVM to 'hang' this thread exits
528                    //       without shutting down the monitor-thread!
529
530                    HTMLPageMWT.shutdownMWTThreads();
531
532                    throw ioe;
533                }
534                catch (Exception e)
535                {
536                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
537                    // Handle "Unknown Exception" case.
538                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
539  
540                    log.append(
541                        "There was an unknown Exception:\n" + EXCC.toString(e) +
542                        "\nSkipping URL: " + url + '\n'
543                    );
544
545                    ret.elementAt(outerCounter).add(DownloadResult.UNKNOWN_EXCEPTION);
546                }
547                finally
548                {
549                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
550                    // Write the current "READ STATE" information (two integers)
551                    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
552
553                    // This makes sure that the download-progress is not lost when large numbers
554                    // of articles are being processed.  Restart the download, and the loop
555                    // variables will automatically be initialized to where they were before the
556                    // JVM exited.  (Pretty Useful)
557
558                    if (pause != null)
559                        pause.saveState(ret, outerCounter, innerCounter, successCounter);
560                }
561        }
562
563        log.append(
564            BRED + STARS + RESET +
565            "Traversing Site Completed.\n" +
566            "Loaded a total of (" + successCounter + ") articles.\n"
567        );
568
569        // Returns the two-dimensional "Download Result" Vector
570        // Make sure to stop the "Max Wait Time Threads"
571
572        HTMLPageMWT.shutdownMWTThreads();
573
574        return ret;
575    }
576
577}