package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;
import Torello.Languages.FNA.*;

import static Torello.Java.C.*;
/**
 * A simple Foreign News Article Scraper.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    private ForeignNewsArticle() { }
    /**
     * This is the HTML page header that is written at the top of the output page.
     */
    public static final String HEADER =  
        "<HTML>\n"  +
        HTMLHeader.metaTag + "\n"   +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n" + "</SCRIPT>" + "\n"   +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>" + "\n"    +
        "<BODY>" + "\n" + HTMLHeader.popUpDIV + "\n"    +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.  All that you
     * need to do is provide the main "Article-Body" of the article, and a little ancillary
     * information - <I><B>the calls to the Google Cloud Server Translate API</B></I> will be
     * handled by this code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes calls
     * to the GCSTAPI, which is an acronym meaning the Google Cloud Server Translate API.  This
     * server expects you to pay Google for the services that it provides.  The translations are
     * not free - but they are not too expensive either.  <B><I>You must be sure to set the
     * {@code class GCSTAPI -> String key} field</I></B> in order for the Google Cloud Server
     * Translate API queries to succeed.
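     *
     * <BR /><BR />A minimal sketch of setting that field (the key-string below is hypothetical -
     * use the API-Key issued by your own Google Cloud account):
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Assign your Google Cloud API-Key before any translation calls are made
     * GCSTAPI.key = "AIza...your-api-key...";
     * }</DIV>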
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     * 
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article Photos, stored by number as they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
     * 
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page.  Read more about cleaning an HTML news article in the class ArticleGet.
     * 
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally retrieving the "Article Body" from a news-article web-page is a 'sort-of'
     * // simple two-step process.
     * //
     * // Step 1:  You must look at the web-page in your browser and press your browser's
     * //          "View Source" Button.  Identify the HTML Divider Element that looks something
     * //          to the effect of <DIV CLASS='article_body'> ... or maybe <DIV CLASS='page_content'>
     * //          You will have to find the relevant divider, or article element, once, and only
     * //          once, per website.
     * //
     * // Step 2:  Grab that content with a simple call to the Inclusive-Get methods in NodeSearch
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *                                  TextComparitor.C, "page-content");
     *                                  // use whatever tag you have found via the "View Source"
     *                                  // Button on your browser.  You only need to find this tag
     *                                  // once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually.
     * // Hopefully it is obvious that the 'title' could be stored in any number of ways
     * // depending on which site is being viewed.  The title location is usually "consistently
     * // the same" as long as you're on the same website.
     *
     * String title = "?";    // you must search the page to retrieve the title
     * LC articleLC = LC.es;  // Select the (spoken) language used in the article.
     *                        // This could be LC.vi (Vietnamese), LC.es (Spanish) etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *         (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along
     * // with any photos that were found on the page already downloaded so they may be
     * // locally included on the output page.
     * }</DIV>
     *
     * @param url  This article's URL to be scraped.  This is used, only, for including a link to
     * the article's original page on the output index.html file.
     * 
     * @param title This is needed because obtaining the title can be done in myriad ways.  If it
     * is kept as an "external option" - this provides more leeway to the coder/programmer.
     * 
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     * 
     * @param log This logs progress to terminal out.  Null may be passed, in which case output
     * will not be displayed.  Any implementation of {@code java.lang.Appendable} will suffice.
     * Make note that the {@code Appendable} interface allows / requires handling
     * {@code IOException}s for its {@code append(...)} methods.
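     *
     * <BR /><BR />A minimal sketch (any {@code Appendable} will do - a
     * {@code java.lang.StringBuilder} is simply one convenient choice):
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * StringBuilder log = new StringBuilder();
     * processArticle(articleBody, url, title, LC.es, log, "outdir/");
     * System.out.println(log);   // review the accumulated progress-output afterwards
     * }</DIV>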
     * 
     * @param targetDirectory This is the directory where the image-files and 'index.html' file
     * will be stored.
     * 
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * 
     * <LI> {@code ret3.a (Vector<String>)} 
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is English.
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of filenames, one for each image that was 
     *      present on the original news or article page, and therefore downloaded.
     *      </LI>
     * 
     * </UL>
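     *
     * <BR /><BR />A minimal sketch of unpacking that instance, using the field-names described
     * above:
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * Vector<String>  originalSentences   = response.a;  // source-language text
     * Vector<String>  translatedSentences = response.b;  // English translations
     * String[]        imageFileNames      = response.c;  // downloaded image file-names
     * }</DIV>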
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title,
            LC srcLang, Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
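        // Normalize the directory-name so that file-names may be appended to it directly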
        if (! targetDirectory.endsWith(File.separator)) targetDirectory += File.separator;

        Vector<HTMLNode>    article     = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder       out         = new StringBuilder();
        String              urlStr      = URLs.urlToString(url);
        String              outFile     = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title +  '\n');

        // Start removing extraneous nodes.  First <STYLE>...</STYLE>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT>...</SCRIPT>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </SCRIPT> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

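        // Trim the white-space surrounding the remaining text-nodes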
        Util.trimTextNodes(article, true);

        // Grab and save the images.  The file-names of the downloaded images are available in
        // the returned Results object.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results

        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
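        // Note: the second <H2> below holds the title translated into English by the GCSTAPI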
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() + "\" TARGET=\"_blank\">\n" +
            urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary tables
        // in the article's original language & English

        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // Generate the data-<DIV>'s used by the JavaScript
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");

        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}