package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;
import Torello.Languages.FNA.*;

import static Torello.Java.C.*;

/**
 * A simple Foreign News Article Scraper.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    private ForeignNewsArticle() { }

    /**
     * This is the HTML page header that is written to the top of the output page.
     */
    public static final String HEADER =
        "<HTML>\n" +
        HTMLHeader.metaTag + "\n" +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n</SCRIPT>\n" +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>\n" +
        "<BODY>\n" + HTMLHeader.popUpDIV + "\n" +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.  All that you
     * need to do is provide the main "Article-Body" of the article, and some information -
     * <B><I>the calls to the Google Cloud Server Translate API</I></B> will be handled by the
     * code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes calls
     * to the GCSTAPI, which is an acronym meaning the Google Cloud Server Translate API.  This
     * server expects you to pay Google for the services that it provides.  The translations are
     * not free - but they are not too expensive either.  <B><I>You must be sure to set the
     * {@code class GCSTAPI -> String key} field</I></B> in order for the GCS Translate API
     * queries to succeed.
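     *
     * <BR /><BR />A minimal sketch of assigning that field - the key-{@code String} below is
     * merely a placeholder for your own Google Cloud API Key:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Assign this once, before any translation calls are made
     * GCSTAPI.key = "YOUR-GOOGLE-CLOUD-API-KEY-HERE";
     * }</DIV>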
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     * 
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article Photos, stored by number as they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
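     *
     * <BR />For instance, a run against an article containing three photos might leave a
     * directory like the one below.  The image file-names and extensions here are illustrative
     * only - they depend on the images actually found on the page:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * outdir/
     * |--- 1.jpg
     * |--- 2.jpg
     * |--- 3.jpg
     * '--- index.html
     * }</DIV>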
     * 
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page.  Read more about cleaning an HTML news article in the class {@code ArticleGet}.
     * 
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally, retrieving the "Article Body" from a news-article web-page is a 'sort-of' simple
     * // two-step process.
     * //
     * // Step 1:  You must look at the web-page in your browser and press your browser's "View Content"
     * //          Button.  Identify the HTML Divider Element that looks something to the effect of
     * //          <DIV CLASS='article_body'> ... or maybe <DIV CLASS='page_content'>
     * //          You will have to find the relevant divider, or article element, once and
     * //          only once, per website.
     * //
     * // Step 2: Grab that content with a simple call to the Inclusive-Get methods in NodeSearch
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *                                  TextComparitor.C, "page-content");
     *                                  // use whatever tag you have found via the "View Content"
     *                                  // Button on your browser.  You only need to find this tag
     *                                  // once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually.
     * // Hopefully it is obvious that the 'title' could be stored in any number of ways,
     * // depending on which site is being viewed.  The title's location is usually consistent
     * // as long as you're on the same website.
     *
     * String title = "?";    // you must search the page to retrieve the title
     * LC articleLC = LC.es;  // Select the (spoken) language used in the article.
     *                        // This could be LC.vi (Vietnamese), LC.es (Spanish) etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *         (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along with
     * // any photos that were found on the page - already downloaded, so that they may be
     * // included locally on the output page.
     * }</DIV>
     *
     * @param url This article's URL to be scraped.  This is used only for including a link to
     * the article's original page on the output {@code index.html} file.
     * 
     * @param title This is needed because obtaining the title can be done in myriad ways.  If it
     * is kept as an "external option" - this provides more leeway to the coder/programmer.
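     *
     * <BR /><BR />One hypothetical sketch of retrieving a title - re-using the 'articlePage'
     * vector from the example above, and assuming the page keeps its title inside the first
     * {@code <H1>} element, which will vary from site to site:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Grab the first <H1> ... </H1> block, then concatenate its text-nodes
     * Vector<HTMLNode> h1 = TagNodeGetInclusive.first(articlePage, "h1");
     *
     * StringBuilder sb = new StringBuilder();
     * for (HTMLNode n : h1) if (n instanceof TextNode) sb.append(n.str);
     * String title = sb.toString().trim();
     * }</DIV>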
     * 
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     * 
     * @param log This logs progress to terminal-out.  Null may be passed, in which case output
     * will not be displayed.  Any implementation of {@code java.lang.Appendable} will suffice.
     * Make note that the {@code Appendable} interface requires heeding {@code IOException's}
     * thrown by its {@code append(...)} methods.
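     *
     * <BR /><BR />For example, {@code System.out} is a {@code java.io.PrintStream}, which
     * already implements {@code java.lang.Appendable}, and may be passed directly:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Progress messages print to the terminal; pass null to suppress them instead
     * processArticle(articleBody, url, title, LC.es, System.out, "outdir/");
     * }</DIV>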
     * 
     * @param targetDirectory This is the directory where the image-files and 'index.html' file
     * will be stored.
     * 
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * 
     * <LI> {@code ret3.a (Vector<String>)} 
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is English.
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of filenames, one for each image that was 
     *      present on the original news or article page, and therefore downloaded.
     *      </LI>
     * 
     * </UL>
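     *
     * <BR />A short sketch of consuming the returned tuple - this assumes {@code ret3.a} and
     * {@code ret3.b} are parallel, sentence-for-sentence:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * Ret3<Vector<String>, Vector<String>, String[]> ret3 = ForeignNewsArticle.processArticle
     *     (articleBody, url, title, LC.es, System.out, "outdir/");
     *
     * // Print each original-language sentence beside its English translation
     * for (int i = 0; i < ret3.a.size(); i++)
     *     System.out.println(ret3.a.elementAt(i) + " => " + ret3.b.elementAt(i));
     * }</DIV>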
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title,
            LC srcLang, Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
        if (! targetDirectory.endsWith(File.separator)) targetDirectory += File.separator;

        Vector<HTMLNode>    article     = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder       out         = new StringBuilder();
        String              urlStr      = URLs.urlToString(url);
        String              outFile     = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title +  '\n');

        // Start removing extraneous nodes.  First <STYLE>...</STYLE>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT>...</SCRIPT>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </STYLE> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

        Util.trimTextNodes(article, true);

        // Grab and save the images.  The downloaded image file-names are returned by the
        // ImageScraper below.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results

        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() + "\" TARGET=\"_blank\">\n" +
            urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary tables
        // in both the source-language and English

        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // Generate the Data-DIV's used by the page's JavaScript
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");
    
        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}