1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
package Torello.Languages;

import java.util.*;
import java.io.*;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import Torello.Java.Additional.URLs;

/**
 * Translate (普通话, Simplified & Traditional Chinese) Characters.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=T>
 */
@Torello.JavaDoc.StaticFunctional
public class Translate
{
    private Translate() { }

    // Base of every Google Translate query issued by this class; the text to translate is
    // appended directly after the "q=" parameter.
    private static final String GT_QUERY_URL = "https://translate.google.com/?q=";

    // Value of the "id" attribute of the <div> from which the pronunciation text is read.
    private static final String PRON_ELEMENT_ID = "src-translit";

    // Value of the "id" attribute of the <span> from which the English text is read.
    private static final String ENGL_ELEMENT_ID = "result_box";

    // Number of times sentenceZH tries to download a page before giving up.
    private static final int MAX_SCRAPE_ATTEMPTS = 6;

    /**
     * Runs {@link #block(String, Vector, Vector, Appendable)} on every sentence of the input, in
     * order, and appends exactly one entry per input sentence to each of the four output vectors.
     * 
     * @param simpSentencesIN The sentences to process.  Each one is passed, unchanged, to
     * {@code block}.
     * 
     * @param sentencesOUT Receives, for each input sentence, the three-element sentence vector
     * that {@code block} filled in.
     * 
     * @param wordTablesOUT Receives, for each input sentence, the word table that {@code block}
     * filled in.
     * 
     * @param DOUTArr Receives, for each input sentence, the text that {@code block} wrote to its
     * {@code Appendable} parameter.
     * 
     * @param DOUTErrorBoolArr Receives, for each input sentence, the {@code boolean} that
     * {@code block} returned.
     * 
     * @throws IOException Propagated from {@code block}.
     */
    public static void article(
            Vector<String>                  simpSentencesIN,
            Vector<Vector<String>>          sentencesOUT,
            Vector<Vector<Vector<String>>>  wordTablesOUT,
            Vector<String>                  DOUTArr,
            Vector<Boolean>                 DOUTErrorBoolArr
        )
        throws IOException
    {
        for (String simpSentence : simpSentencesIN)
        {
            // Fresh containers for every sentence, so that the results are never shared
            // between two entries of the output vectors.
            Vector<String>          sentences   = new Vector<String>();
            Vector<Vector<String>>  wordTable   = new Vector<Vector<String>>();
            StringBuilder           DOUT        = new StringBuilder();
            boolean                 error       = block(simpSentence, sentences, wordTable, DOUT);

            sentencesOUT.add(sentences);
            wordTablesOUT.add(wordTable);
            DOUTArr.add(DOUT.toString());
            DOUTErrorBoolArr.add(Boolean.valueOf(error));
        }
    }

    /**
     * Processes a single sentence: scrapes its pronunciation and English text using
     * {@link #sentenceZH(String)}, splits it into words using {@code PinYinParse.parse}, and
     * stores the results in the two output vectors.
     * 
     * @param simpSentenceIN The sentence to process.  It is passed to {@code sentenceZH}, which
     * rejects any {@code String} that contains a newline.
     * 
     * @param sentencesOUT This vector is <B>cleared</B>, and then filled with exactly three
     * elements: the input sentence, the scraped pronunciation, and the scraped English.
     * 
     * @param wordTableOUT One four-element row is <B>appended</B> for each word that the parser
     * produced: the word, its pronunciation, and two empty {@code String's}.  (The two dictionary
     * look-ups that used to fill the last two columns are commented out.)  Unlike
     * {@code sentencesOUT}, this vector is not cleared first.
     * 
     * @param DOUT Passed directly to {@code PinYinParse.parse}.  Presumably the parser writes its
     * diagnostic output here - this should be confirmed against {@code PinYinParse}.
     * 
     * @return The value returned by {@code PinYinParse.parse}.  Judging by how it is used in
     * {@code article}, this is presumably an error-flag - this should be confirmed against
     * {@code PinYinParse}.
     * 
     * @throws IOException The interface java.lang.Appendable mandates that the IOException must be
     * treated as a checked exception for all output operations.  Therefore IOException is a
     * required exception in this method's throws clause.
     * 
     * @throws IllegalStateException If the parser produced a different number of words than
     * pronunciations.
     */
    public static boolean block(
            String                  simpSentenceIN,
            Vector<String>          sentencesOUT,
            Vector<Vector<String>>  wordTableOUT,
            Appendable              DOUT
        )
        throws IOException
    {
        String[]    gtScrape        = sentenceZH(simpSentenceIN);
        String      pronSentence    = gtScrape[0];
        String      englSentence    = gtScrape[1];

        sentencesOUT.removeAllElements();
        sentencesOUT.add(simpSentenceIN);
        sentencesOUT.add(pronSentence);
        sentencesOUT.add(englSentence);

        Vector<String>  simpWords   = new Vector<String>();
        Vector<String>  pronWords   = new Vector<String>();
        boolean         errorParse  = PinYinParse.parse
                            (DOUT, simpSentenceIN, pronSentence, simpWords, pronWords);

        if (pronWords.size() != simpWords.size()) throw new IllegalStateException(
            "The pronunciation and the character vector's should be the exact same length.\n" +
            "pronWords.size() = " + pronWords.size() + " and simpWords.size() = " + 
            simpWords.size()
        );

        // NOTE(review): sentencesOUT is cleared above, but wordTableOUT is only appended to.
        // Confirm whether callers that re-use a word table rely on this.
        int len = pronWords.size();
        for (int i=0; i < len; i++)
        {
            Vector<String> vocabEntryRow = new Vector<String>();

            vocabEntryRow.add(simpWords.elementAt(i));
            vocabEntryRow.add(pronWords.elementAt(i));
            vocabEntryRow.add(""); //Dictionary.lookupTrad(simp, pron));
            vocabEntryRow.add(""); //Dictionary.lookupEngl(simp, pron));

            wordTableOUT.add(vocabEntryRow);
        }
        return errorParse;
    }


    // *********************************************************************************************************
    // This is from the original file named "GTScrapeCN"
    // *********************************************************************************************************

    /**
     * This receives as input a sentence in simplified Mandarin Chinese.  The input is trimmed and
     * split around each Chinese full-stop (。), and Google Translate is queried once for each of
     * the resulting pieces (with the full-stop re-attached).  The scraped results of all pieces
     * are concatenated, each one followed by two spaces.
     * 
     * <BR /><BR />Each page download is attempted up to six times.  Every failed attempt is
     * reported on {@code System.out}.
     * 
     * @param chinese Any sentence, paragraph, phrase or word in Simplified-Mandarin.  It may not
     * contain a newline character.
     * 
     * @return Two separate Strings returned in a String array - two elements long.
     * 
     * <OL CLASS=JDOL>
     * <LI><CODE>ret[0]</CODE> - The pronunciation (罗马拼音) String scraped from a call to Google
     * Translate</LI>
     * <LI><CODE>ret[1]</CODE> - The English - also scraped from a call to
     * <CODE>http://translate.google.com</CODE></LI>
     * </OL>
     * 
     * @throws IllegalArgumentException If {@code chinese} contains a newline character.
     * 
     * @throws IllegalStateException If the page for one of the pieces could not be downloaded
     * after all attempts.  The last failure is attached as the cause.
     */
    public static String[] sentenceZH(String chinese)
    {
        if (chinese.indexOf('\n') != -1) throw new IllegalArgumentException("CHINESE:\t" + chinese + "\nContains a newline!");

        String[]        cArr            = chinese.trim().split("。");
        StringBuilder   completePron    = new StringBuilder();
        StringBuilder   completeEngl    = new StringBuilder();

        for (String piece : cArr)
        {
            // Prepare the query and scrape the Google Translate web-page for this piece.
            Vector<HTMLNode> page = scrapeWithRetry(piece + "。");

            // Get Chinese PinYin as Sentence
            String pron = elementText(page, "div", PRON_ELEMENT_ID);
            completePron.append(Escape.replace(pron).trim()).append("  ");

            // Get English from Translate Website as a Sentence.
            //
            // NOTE(review): this regular-expression matches the six-character text made of a
            // backslash followed by 'u200b' - not the zero-width-space character itself.
            // Confirm that this is what the scraped page actually contains.
            String engl = elementText(page, "span", ENGL_ELEMENT_ID).replaceAll("\\\\u200b", "");
            completeEngl.append(Escape.replace(engl).trim()).append("  ");
        }

        return new String[] { completePron.toString(), completeEngl.toString() };
    }

    // Downloads and tokenizes the Google Translate page for 'chineseText', retrying on failure.
    // Never returns null: once every attempt has failed, an IllegalStateException is thrown.
    private static Vector<HTMLNode> scrapeWithRetry(String chineseText)
    {
        Exception lastFailure = null;

        for (int attempt = 1; attempt <= MAX_SCRAPE_ATTEMPTS; attempt++)
        {
            Vector<HTMLNode> page = null;

            // The URL-escaping stays inside the 'try', so that a failure there is counted as a
            // failed attempt, exactly as a failed download is.
            try (BufferedReader br = Scrape.openConn_iso_8859_1
                    (GT_QUERY_URL + URLs.toProperURLV2(chineseText)))
            {
                page = HTMLPage.getPageTokens(br, false);

                // A null page is a failed attempt too.  Without this, a null that arrives
                // without an exception would never be counted, and the loop could not end.
                if (page == null) lastFailure = new IOException("No page tokens were returned.");
            }
            catch (Exception e) { lastFailure = e; }

            // Checked outside of the 'try' so that a page which was read successfully is still
            // used, even when closing the connection afterwards failed.
            if (page != null) return page;

            System.out.println("RETRY-SCRAPE Google Translate:\n" + "Attempt #" + attempt + "\n" + lastFailure.getMessage());
        }

        throw new IllegalStateException(
            "Google Translate could not be scraped after " + MAX_SCRAPE_ATTEMPTS +
            " attempts for the text: " + chineseText,
            lastFailure
        );
    }

    // Finds the first 'tag' element of 'page' whose "id" attribute equals 'idValue' (ignoring
    // case), removes the tag-nodes from it, and returns the concatenated text that is left.
    private static String elementText(Vector<HTMLNode> page, String tag, String idValue)
    {
        Vector<HTMLNode> partial = InnerTagGetInclusive.first
            (page, tag, "id", TextComparitor.EQ_CASE_INSENSITIVE, idValue);

        Util.Remove.allTagNodes(partial);

        StringBuilder text = new StringBuilder();

        // Only text-nodes contribute.  Any other kind of node that is still present after the
        // tag-nodes were removed is skipped, rather than failing the cast.
        for (HTMLNode n : partial)
            if (n instanceof TextNode)
                text.append(((TextNode) n).str);

        return text.toString();
    }

    /**
     * Retrieves the PinYin pronunciation from Google Translate Servers for a single Chinese Word.
     * 
     * @param chineseWord Any single word in simplified Mandarin Chinese.  It is appended to the
     * query-URL exactly as given.
     * 
     * @return The Pinyin Pronunciation of that word, scraped from Google Translate Servers.
     * 
     * @throws IOException If opening the connection, reading the page or closing the connection
     * fails.
     */
    public static String getPinYin(String chineseWord) throws IOException
    {
        // NOTE(review): unlike sentenceZH, the word is not URL-escaped here - confirm whether
        // that is intended.
        Vector<HTMLNode> page;

        try (BufferedReader br = Scrape.openConn_iso_8859_1
                (GT_QUERY_URL + chineseWord + "&source=zh-CN"))
        {
            page = HTMLPage.getPageTokens(br, false);
        }

        return Escape.replace(elementText(page, "div", PRON_ELEMENT_ID));
    }

    /**
     * Retrieves the Google Translate (English) Textbox-definition for a particular Mandarin
     * Chinese Word. <BR /><B>NOTE:</B> This is not the information under the primary/main
     * translation-text-box, this is the translation-text-box word itself.  
     * 
     * @param chineseWord Any single word in simplified Mandarin Chinese.  It is appended to the
     * query-URL exactly as given.
     * 
     * @return The Google Translate Server's best attempt at a Translation.
     * 
     * @throws IOException If opening the connection, reading the page or closing the connection
     * fails.
     */
    public static String getEnglish(String chineseWord) throws IOException
    {
        // NOTE(review): unlike sentenceZH, the word is not URL-escaped here - confirm whether
        // that is intended.
        Vector<HTMLNode> page;

        try (BufferedReader br = Scrape.openConn_iso_8859_1(GT_QUERY_URL + chineseWord))
        {
            // NOTE(review): "matches.txt" looks like a left-over debugging output file; the
            // sibling method getPinYin uses the two-argument overload.  Confirm, then remove.
            page = HTMLPage.getPageTokens(br, false, null, "matches.txt", null);
        }

        return Escape.replace(elementText(page, "span", ENGL_ELEMENT_ID));
    }
}