package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;
import Torello.Languages.FNA.*;

import static Torello.Java.C.*;

/**
 * A simple Foreign News Article Scraper.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    private ForeignNewsArticle() { }

    /**
     * This is the HTML page header that is prepended to the output page.
     */
    public static final String HEADER =
        "<HTML>\n" +
        HTMLHeader.metaTag + "\n" +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n" + "</SCRIPT>\n" +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>\n" +
        "<BODY>\n" +
        HTMLHeader.popUpDIV + "\n" +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.
     * All that you need to do is provide the main "Article-Body" of the article and some
     * information, and <B><I>the calls to the Google Cloud Server Translate API</I></B> will
     * be handled by the code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes
     * calls to the GCSTAPI, an acronym for the Google Cloud Server Translate API. This server
     * expects you to pay Google for the services that it provides. The translations are not
     * free, but they are not too expensive either. <B><I>You must be sure to set the
     * {@code class GCSTAPI -> String key} field</I></B> in order for the GCS Translate API
     * queries to succeed.
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     *
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article photos, numbered in the order they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
     *
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page. Read more about cleaning an HTML news article in the class ArticleGet.
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally, retrieving the "Article Body" from a news-article web-page is a simple
     * // two-step process.
     * //
     * // Step 1: Look at the web-page in your browser and press your browser's "View Source"
     * //         button.  Identify the HTML divider element, which looks something to the
     * //         effect of <DIV CLASS='article_body'> ...
     * //         or maybe <DIV CLASS='page_content'>
     * //         You will have to find the relevant divider, or article element, once - and
     * //         only once - per website.
     * //
     * // Step 2: Grab that content with a simple call to the Inclusive-Get methods in
     * //         NodeSearch.
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *     TextComparitor.C, "page-content");
     *     // Use whatever tag you have found via the "View Source" button of your browser.
     *     // You only need to find this tag once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually.  The 'title' could be
     * // stored in any number of ways, depending on which site is being viewed.  The title
     * // location is usually "consistently the same" as long as you're on the same website.
     *
     * String title = "?";      // you must search the page to retrieve the title
     * LC articleLC = LC.es;    // Select the (spoken) language used in the article.
     *                          // This could be LC.vi (Vietnamese), LC.es (Spanish), etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *     (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along with
     * // any photos that were found on the page, already downloaded so they may be locally
     * // included on the output page.
     * }</DIV>
     *
     * @param url This article's URL to be scraped. This is used only for including a link to
     * the article's original page in the output 'index.html' file.
     *
     * @param title This is needed because obtaining the title can be done in myriad ways. If
     * it is kept as an "external option," this provides more leeway to the coder/programmer.
     *
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     *
     * @param log This logs progress to terminal-out. Null may be passed, in which case output
     * will not be displayed. Any implementation of {@code java.lang.Appendable} will
     * suffice. Note that the {@code Appendable} interface requires heeding
     * {@code IOException}'s for its {@code append(...)} methods.
     *
     * @param targetDirectory This is the directory where the image-files and 'index.html'
     * file will be stored.
     *
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     *
     * <BR /><BR /><UL CLASS=JDUL>
     *
     * <LI> {@code ret3.a (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is English.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of filenames, one for each image that was
     *      present on the original news or article page, and therefore downloaded.
     * </LI>
     *
     * </UL>
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title, LC srcLang,
            Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
        if (!
            targetDirectory.endsWith(File.separator))
            targetDirectory += File.separator;

        Vector<HTMLNode> article = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder    out     = new StringBuilder();
        String           urlStr  = URLs.urlToString(url);
        String           outFile = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title + '\n');

        // Start removing extraneous nodes.  First <STYLE> ... </STYLE>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML
        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT> ... </SCRIPT>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML
        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </SCRIPT> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

        Util.trimTextNodes(article, true);

        // Grab and save the images.  Keep the image-filenames, as downloaded, in a vector.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results
        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() +
            "\" TARGET=\"_blank\">\n" + urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary
        // tables in English & the article's source-language.
        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // Generate the data-DIV's for the JavaScript.
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");

        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}
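// A minimal end-to-end usage sketch, kept in a comment so this file still compiles on its
// own.  The URL, the CSS-Class "article-body", and the output-directory "outdir/" below are
// hypothetical placeholders - each target website requires its own divider-search - and the
// GCSTAPI.key field must hold a valid (paid) Google Cloud API Key before any translation
// query can succeed.  System.out qualifies as the 'log' argument because PrintStream
// implements java.lang.Appendable.
//
//     GCSTAPI.key = "YOUR-GOOGLE-CLOUD-API-KEY";
//
//     URL              url  = new URL("https://some.foreign-news.site/article.html");
//     Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
//     Vector<HTMLNode> body = InnerTagGetInclusive.first
//         (page, "div", "class", TextComparitor.C, "article-body");
//
//     Ret3<Vector<String>, Vector<String>, String[]> r = ForeignNewsArticle.processArticle
//         (body, url, "Some Article Title", LC.es, System.out, "outdir/");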