Source code

001package Torello.Languages;
002
003import java.util.*;
004import java.io.*;
005
006import Torello.HTML.*;
007import Torello.HTML.NodeSearch.*;
008import Torello.Java.*;
009
010import Torello.Java.Additional.URLs;
011
012/**
013 * Translate (普通话, Simplified &amp; Traditional Chinese) Characters.
014 * 
015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=T>
016 */
017@Torello.JavaDoc.StaticFunctional
018public class Translate
019{
    /** Private and empty: every method of this class is {@code static}, so no instance is needed. */
    private Translate() { }
021
022    public static void article(
023            Vector<String>                  simpSentencesIN,
024            Vector<Vector<String>>          sentencesOUT,
025            Vector<Vector<Vector<String>>>  wordTablesOUT,
026            Vector<String>                  DOUTArr,
027            Vector<Boolean>                 DOUTErrorBoolArr
028        )
029        throws IOException
030    {
031        for (String simpSentence : simpSentencesIN)
032        {
033            Vector<String>          sentences   = new Vector<String>();
034            Vector<Vector<String>>  wordTable   = new Vector<Vector<String>>();
035            StringBuilder           DOUT        = new StringBuilder();
036            boolean                 error       = block(simpSentence, sentences, wordTable, DOUT);
037
038            sentencesOUT.add(sentences);
039            wordTablesOUT.add(wordTable);
040            DOUTArr.add(DOUT.toString());
041            DOUTErrorBoolArr.add(Boolean.valueOf(error));
042        }
043    }
044
045    /**
046     * 
047     * @throws IOException The interface java.lang.Appendable mandates that the IOException must be
048     * treated as a checked exception for all output operations.  Therefore IOException is a
049     * required exception in this method' throws clause.
050     */
051    public static boolean block(
052            String                  simpSentenceIN,
053            Vector<String>          sentencesOUT,
054            Vector<Vector<String>>  wordTableOUT,
055            Appendable              DOUT
056        )
057        throws IOException
058    {
059        String[]    gtScrape        = sentenceZH(simpSentenceIN);
060        String      pronSentence    = gtScrape[0];
061        String      englSentence    = gtScrape[1];
062
063        sentencesOUT.removeAllElements();
064        sentencesOUT.add(simpSentenceIN);
065        sentencesOUT.add(pronSentence);
066        sentencesOUT.add(englSentence);
067
068        Vector<String>  simpWords   = new Vector<String>();
069        Vector<String>  pronWords   = new Vector<String>();
070        boolean         errorParse  = PinYinParse.parse
071                            (DOUT, simpSentenceIN, pronSentence, simpWords, pronWords);
072
073        if (pronWords.size() != simpWords.size()) throw new IllegalStateException(
074            "The pronunciation and the character vector's should be the exact same length.\n" +
075            "pronWords.size() = " + pronWords.size() + " and simpWords.size() = " + 
076            simpWords.size()
077        );
078
079        int len = pronWords.size();
080        for (int i=0; i < len; i++)
081        {
082            Vector<String> vocabEntryRow = new Vector<String>();
083
084            vocabEntryRow.add(simpWords.elementAt(i));
085            vocabEntryRow.add(pronWords.elementAt(i));
086            vocabEntryRow.add(""); //Dictionary.lookupTrad(simp, pron));
087            vocabEntryRow.add(""); //Dictionary.lookupEngl(simp, pron));
088
089            wordTableOUT.add(vocabEntryRow);
090        }
091        return errorParse;
092    }
093
094
095    // *********************************************************************************************************
096    // This is from the original file named "GTScrapeCN"
097    // *********************************************************************************************************
098
099    /**
100     * This receives as input a sentence in simplified Mandarin Chinese.  If it finds a period in
101     * it, it breaks the sentence up into smaller bricks based around the period.  It queries
102     * Google Translate using this sentence.
103     * 
104     * @param chinese Any sentence, paragraph, phrase or word in Simplified-Mandarin
105     * 
106     * @return Two separate Strings returned in a String array - two elements long.
107     * 
108     * <OL CLASS=JDOL>
109     * <LI><CODE>ret[0]</CODE> - The pronunciation (罗马拼音) String scraped from a call to Google
110     * Translate</LI>
111     * <LI><CODE>ret[0]</CODE> - The English - also scraped from a call to
112     * <CODE>http://translate.google.com</CODE></LI>
113     * </OL>
114     */
115    public static String[] sentenceZH(String chinese)
116    {
117        if (chinese.indexOf('\n') != -1) throw new IllegalArgumentException("CHINESE:\t" + chinese + "\nContains a newline!");
118
119        String[]        cArr            = chinese.trim().split("。");
120        StringBuilder   completePron    = new StringBuilder();
121        StringBuilder   completeEngl    = new StringBuilder();
122
123        for (int i=0; i < cArr.length; i++)
124        {
125            // Prepare the queries and scrape http://translate.google.com/query web-page.
126            Vector<HTMLNode>    page        = null;
127            int                 retryCount  = 0;
128
129            while ((page == null) && (retryCount < 6))
130                try {
131                    String          chineseQ    = URLs.toProperURLV2(cArr[i] + "。");
132                    BufferedReader  br          = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseQ);
133                    page                        = HTMLPage.getPageTokens(br, false);
134                } catch (Exception e) {
135                    retryCount++;
136                    System.out.println("RETRY-SCRAPE Google Translate:\n" + "Attempt #" + retryCount + "\n" + e.getMessage());
137                }
138
139            // Get Chinese PinYin as Sentence
140            StringBuilder       pron    = new StringBuilder();
141            Vector<HTMLNode>    partial = InnerTagGetInclusive.first(page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit");
142
143            Util.Remove.allTagNodes(partial);
144            for (HTMLNode n : partial) pron.append(((TextNode) n).str);
145            completePron.append(Escape.replace(pron.toString()).trim() + "  ");
146
147
148            // Get English from Translate Website as a Sentence
149            StringBuilder   engl    = new StringBuilder();
150            partial                 = InnerTagGetInclusive.first(page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box");
151            Util.Remove.allTagNodes(partial);
152            for (HTMLNode n : partial) engl.append(((TextNode) n).str);
153            completeEngl.append(Escape.replace(engl.toString().replaceAll("\\\\u200b", "")).trim() + "  ");
154        }
155        String [] retArr = { completePron.toString(), completeEngl.toString() };
156        return retArr;
157    }
158
159    /**
160     * Retrieves the PinYin pronunciation from Google Translate Servers for a single Chinese Word.
161     * 
162     * @param chineseWord Any single word in simplified Mandarin Chinese
163     * 
164     * @return The Pinyin Pronunciation of that word, stripped by Google Translate Servers.
165     */
166    public static String getPinYin(String chineseWord) throws IOException
167    {
168        BufferedReader      br      = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseWord + "&source=zh-CN");
169        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(br, false);
170        Vector<HTMLNode>    partial = InnerTagGetInclusive.first(page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit");
171        String              pron    = "";
172
173        Util.Remove.allTagNodes(partial);
174        for (HTMLNode n : partial) pron += ((TextNode) n).str;
175        return Escape.replace(pron);
176    }
177
178    /**
179     * Retrieves the Google Translate (English) Textbox-defintion for a particular Mandarin Chinese
180     * Word. <BR /><B>NOTE:</B> This is not the information under the primary/main
181     * translation-text-box, this is the translation-text-box word itself.  
182     * 
183     * @param chineseWord Any single word in simplified Mandarin Chinese
184     * 
185     * @return The Google Translate Server's best attempt at a Translation.
186     */
187    public static String getEnglish(String chineseWord) throws IOException
188    {
189        BufferedReader      br      = Scrape.openConn_iso_8859_1("https://translate.google.com/?q=" + chineseWord);
190        Vector<HTMLNode>    page    = HTMLPage.getPageTokens(br, false, null, "matches.txt", null);
191        Vector<HTMLNode>    partial = InnerTagGetInclusive.first(page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box");
192        String              engl    = "";
193
194        Util.Remove.allTagNodes(partial);
195        for (HTMLNode n : partial) engl += ((TextNode) n).str;
196        return Escape.replace(engl);
197    }
198}