001package Torello.Languages; 002 003import java.util.*; 004import java.io.*; 005 006import Torello.HTML.*; 007import Torello.HTML.NodeSearch.*; 008import Torello.Java.*; 009 010import Torello.Java.Additional.URLs; 011 012/** 013 * Translate (普通话, Simplified & Traditional Chinese) Characters. 014 * 015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=T> 016 */ 017@Torello.JavaDoc.StaticFunctional 018public class Translate 019{ 020 private Translate() { } 021 022 public static void article( 023 Vector<String> simpSentencesIN, 024 Vector<Vector<String>> sentencesOUT, 025 Vector<Vector<Vector<String>>> wordTablesOUT, 026 Vector<String> DOUTArr, 027 Vector<Boolean> DOUTErrorBoolArr 028 ) 029 throws IOException 030 { 031 for (String simpSentence : simpSentencesIN) 032 { 033 Vector<String> sentences = new Vector<String>(); 034 Vector<Vector<String>> wordTable = new Vector<Vector<String>>(); 035 StringBuilder DOUT = new StringBuilder(); 036 boolean error = block(simpSentence, sentences, wordTable, DOUT); 037 038 sentencesOUT.add(sentences); 039 wordTablesOUT.add(wordTable); 040 DOUTArr.add(DOUT.toString()); 041 DOUTErrorBoolArr.add(Boolean.valueOf(error)); 042 } 043 } 044 045 /** 046 * 047 * @throws IOException The interface java.lang.Appendable mandates that the IOException must be 048 * treated as a checked exception for all output operations. Therefore IOException is a 049 * required exception in this method' throws clause. 050 */ 051 public static boolean block( 052 String simpSentenceIN, 053 Vector<String> sentencesOUT, 054 Vector<Vector<String>> wordTableOUT, 055 Appendable DOUT 056 ) 057 throws IOException 058 { 059 String[] gtScrape = sentenceZH(simpSentenceIN); 060 String pronSentence = gtScrape[0]; 061 String englSentence = gtScrape[1]; 062 063 sentencesOUT.removeAllElements(); 064 sentencesOUT.add(simpSentenceIN); 065 sentencesOUT.add(pronSentence); 066 sentencesOUT.add(englSentence); 067 068 Vector<String> simpWords = new Vector<String>(); 069 Vector<String> pronWords = new Vector<String>(); 070 boolean errorParse = PinYinParse.parse 071 (DOUT, simpSentenceIN, pronSentence, simpWords, pronWords); 072 073 if (pronWords.size() != simpWords.size()) throw new IllegalStateException( 074 "The pronunciation and the character vector's should be the exact same length.\n" + 075 "pronWords.size() = " + pronWords.size() + " and simpWords.size() = " + 076 simpWords.size() 077 ); 078 079 int len = pronWords.size(); 080 for (int i=0; i < len; i++) 081 { 082 Vector<String> vocabEntryRow = new Vector<String>(); 083 084 vocabEntryRow.add(simpWords.elementAt(i)); 085 vocabEntryRow.add(pronWords.elementAt(i)); 086 vocabEntryRow.add(""); //Dictionary.lookupTrad(simp, pron)); 087 vocabEntryRow.add(""); //Dictionary.lookupEngl(simp, pron)); 088 089 wordTableOUT.add(vocabEntryRow); 090 } 091 return errorParse; 092 } 093 094 095 // ********************************************************************************************************* 096 // This is from the original file named "GTScrapeCN" 097 // ********************************************************************************************************* 098 099 /** 100 * This receives as input a sentence in simplified Mandarin Chinese. If it finds a period in 101 * it, it breaks the sentence up into smaller bricks based around the period. It queries 102 * Google Translate using this sentence. 103 * 104 * @param chinese Any sentence, paragraph, phrase or word in Simplified-Mandarin 105 * 106 * @return Two separate Strings returned in a String array - two elements long. 107 * 108 * <OL CLASS=JDOL> 109 * <LI><CODE>ret[0]</CODE> - The pronunciation (罗马拼音) String scraped from a call to Google 110 * Translate</LI> 111 * <LI><CODE>ret[0]</CODE> - The English - also scraped from a call to 112 * <CODE>http://translate.google.com</CODE></LI> 113 * </OL> 114 */ 115 public static String[] sentenceZH(String chinese) 116 { 117 if (chinese.indexOf('\n') != -1) throw new IllegalArgumentException("CHINESE:\t" + chinese + "\nContains a newline!"); 118 119 String[] cArr = chinese.trim().split("。"); 120 StringBuilder completePron = new StringBuilder(); 121 StringBuilder completeEngl = new StringBuilder(); 122 123 for (int i=0; i < cArr.length; i++) 124 { 125 // Prepare the queries and scrape http://translate.google.com/query web-page. 126 Vector<HTMLNode> page = null; 127 int retryCount = 0; 128 129 while ((page == null) && (retryCount < 6)) 130 try { 131 String chineseQ = URLs.toProperURLV2(cArr[i] + "。"); 132 BufferedReader br = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseQ); 133 page = HTMLPage.getPageTokens(br, false); 134 } catch (Exception e) { 135 retryCount++; 136 System.out.println("RETRY-SCRAPE Google Translate:\n" + "Attempt #" + retryCount + "\n" + e.getMessage()); 137 } 138 139 // Get Chinese PinYin as Sentence 140 StringBuilder pron = new StringBuilder(); 141 Vector<HTMLNode> partial = InnerTagGetInclusive.first(page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit"); 142 143 Util.Remove.allTagNodes(partial); 144 for (HTMLNode n : partial) pron.append(((TextNode) n).str); 145 completePron.append(Escape.replace(pron.toString()).trim() + " "); 146 147 148 // Get English from Translate Website as a Sentence 149 StringBuilder engl = new StringBuilder(); 150 partial = InnerTagGetInclusive.first(page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box"); 151 Util.Remove.allTagNodes(partial); 152 for (HTMLNode n : partial) engl.append(((TextNode) n).str); 153 completeEngl.append(Escape.replace(engl.toString().replaceAll("\\\\u200b", "")).trim() + " "); 154 } 155 String [] retArr = { completePron.toString(), completeEngl.toString() }; 156 return retArr; 157 } 158 159 /** 160 * Retrieves the PinYin pronunciation from Google Translate Servers for a single Chinese Word. 161 * 162 * @param chineseWord Any single word in simplified Mandarin Chinese 163 * 164 * @return The Pinyin Pronunciation of that word, stripped by Google Translate Servers. 165 */ 166 public static String getPinYin(String chineseWord) throws IOException 167 { 168 BufferedReader br = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseWord + "&source=zh-CN"); 169 Vector<HTMLNode> page = HTMLPage.getPageTokens(br, false); 170 Vector<HTMLNode> partial = InnerTagGetInclusive.first(page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit"); 171 String pron = ""; 172 173 Util.Remove.allTagNodes(partial); 174 for (HTMLNode n : partial) pron += ((TextNode) n).str; 175 return Escape.replace(pron); 176 } 177 178 /** 179 * Retrieves the Google Translate (English) Textbox-defintion for a particular Mandarin Chinese 180 * Word. <BR /><B>NOTE:</B> This is not the information under the primary/main 181 * translation-text-box, this is the translation-text-box word itself. 182 * 183 * @param chineseWord Any single word in simplified Mandarin Chinese 184 * 185 * @return The Google Translate Server's best attempt at a Translation. 186 */ 187 public static String getEnglish(String chineseWord) throws IOException 188 { 189 BufferedReader br = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseWord); 190 Vector<HTMLNode> page = HTMLPage.getPageTokens(br, false, null, "matches.txt", null); 191 Vector<HTMLNode> partial = InnerTagGetInclusive.first(page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box"); 192 String engl = ""; 193 194 Util.Remove.allTagNodes(partial); 195 for (HTMLNode n : partial) engl += ((TextNode) n).str; 196 return Escape.replace(engl); 197 } 198}