package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;
import Torello.Languages.FNA.*;

import static Torello.Java.C.*;

/**
 * A simple Foreign News Article Scraper.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    // Static-functional utility class: no instances.
    private ForeignNewsArticle() { }

    /**
     * This is the HTML page header that is appended to the output page.
     */
    public static final String HEADER =
        "<HTML>\n" +
        HTMLHeader.metaTag + "\n" +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n" + "</SCRIPT>" + "\n" +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>" + "\n" +
        "<BODY>" + "\n" + HTMLHeader.popUpDIV + "\n" +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.  All that you
     * need to do is provide the main "Article-Body" of the article, and some information -
     * <I><B>and calls to Google Cloud Server Translate API</B></I> will be handled by the code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes calls
     * to the GCSTAPI, which is an acronym meaning the Google Cloud Server Translate API.  This
     * server expects you to pay Google for the services that it provides.  The translations are not
     * free - but they are not too expensive either.  <B><I>You must be sure to set the
     * {@code class GCSTAPI -> String key} field </I></B> in order for the GCS Translate API
     * Queries to succeed.
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     *
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article Photos, stored by number as they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
     *
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page.  Read more about cleaning an HTML news article in the class ArticleGet.
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally retrieving the "Article Body" from a news-article web-page is a 'sort-of' simple
     * // two-step process.
     * //
     * // Step 1: You must look at the web-page in your browser and press your browser's "View Content"
     * //         Button.  Identify the HTML Divider Element that looks something to the effect of
     * //         <DIV CLASS='article_body'> ... or maybe <DIV CLASS='page_content'>
     * //         You will have to find the relevant divider, or article element once, and only once,
     * //         per website
     * //
     * // Step 2: Grab that content with a simple call to the Inclusive-Get methods in NodeSearch
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *      TextComparitor.C, "page-content");
     *      // use whatever tag you have found via the "View Content"
     *      // Button on your browser.  You only need to find this tag
     *      // once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually as well.
     * // Hopefully it is obvious that the 'title' could be stored in any number of ways
     * // depending on which site is being viewed.  The title location is usually "consistently
     * // the same" as long as you're on the same website.
     *
     * String title = "?"; // you must search the page to retrieve the title
     * LC articleLC = LC.es; // Select the (spoken) language used in the article.
     *                       // This could be LC.vi (Vietnamese), LC.es (Spanish) etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *      (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along
     * // with any photos that were found on the page already downloaded so they may be
     * // locally included on the output page.
     * }</DIV>
     *
     * @param url This article's URL to be scraped.  This is used, only, for including a link to
     * the articles original page on the output index.html file.
     *
     * @param title This is needed because obtaining the title can be done in myriad ways.  If it
     * is kept as an "external option" - this provides more leeway to the coder/programmer.
     *
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     *
     * @param log This logs progress to terminal out.  Null may be passed, in which case output
     * will not be displayed.  Any implementation of {@code java.lang.Appendable} will
     * suffice.  Make note that the 'Appendable' interface allows / requires heeding
     * IOException's for it's 'append(...)' methods.
     *
     * @param targetDirectory This is the directory where the image-files and 'index.html' file
     * will be stored.
     *
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     *
     * <BR /><BR /><UL CLASS=JDUL>
     *
     * <LI> {@code ret3.a (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     *      </LI>
     *
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is english.
     *      <BR /><BR />
     *      </LI>
     *
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of filenames, one for each image that was
     *      present on the original news or article page, and therefore downloaded.
     *      </LI>
     *
     * </UL>
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title,
            LC srcLang, Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
        if (! targetDirectory.endsWith(File.separator)) targetDirectory += File.separator;

        // Work on a copy so the caller's vector is left untouched.
        Vector<HTMLNode> article        = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder    out            = new StringBuilder();
        int[]            divNodes       = null;
        String           divElemStr     = null;
        Vector<String>   imageFileNames = null;
        String           urlStr         = URLs.urlToString(url);
        String           outFile        = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title + '\n');

        // Start removing extraneous nodes.  First <STYLE>...</STYLE>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT>...</SCRIPT>
        // REASONS: 1) Clean Up   2) Cannot Use 'in isolation'   3) Makes Readable HTML

        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </SCRIPT> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

        Util.trimTextNodes(article, true);

        // Grab and save the images.  Keep the image-filenames as they were downloaded in a vector.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results

        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
        // Both the original title and its English translation (via the GCSTAPI) are included.
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() + "\" TARGET=\"_blank\">\n" +
            urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary tables
        // with English & Spanish

        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // generate the data-div's for the JS
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");

        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}