1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
package Torello.Languages;

import java.io.*;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.*;
import java.util.stream.IntStream;

import Torello.HTML.*;
import Torello.Java.*;

/**
 * Pin1Yin1 (罗马拼音, i.e. Romanized Pinyin - Online Internet Translation Service).
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=P1Y1>
 *
 * <BR /><BR />Instances are created through {@link #getResponse(String, boolean)}, which sends
 * the query to {@code pin1yin1.com}, parses the reply with three pre-compiled regular
 * expressions, and stores the results in the {@code public final} fields of this class.
 */
public class Pin1Yin1
{
    /** PinYin {@code URL} */
    public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c=";

    /** Query used by {@link #main(String[])} when no command-line argument is supplied. */
    private static final String sampleText = "中国政府网_中央人民政府门户网站";

    /** {@code pin1yin1.com} JSON parse */
    private static final Pattern P1, P2, P3;

    static
    { 
        // If boot-strapping a data-file, put these four lines in comments.
        Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP
            (Pin1Yin1.class, "data-files/Pin1Yin1.sdat");

        P1 = v.elementAt(0); // The JSON reply
        P2 = v.elementAt(1); // Words wrapped in quotation marks
        P3 = v.elementAt(2); // Integer array
    }

    /**
     * This is the original query-string - <I>in Simplified Chinese</I>.
     *
     * <BR /><BR /><B>NOTE:</B> The accuracy of the traditional/simplified conversion is not
     * known, and it is probably not exact.
     */
    public final String simpSENTENCE;

    /** This is the original query-string in Traditional Chinese */
    public final String tradSENTENCE;

    /**
     * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries"
     */
    public final String[] simp;

    /** The Pronunciation of each element in the "Word Boundaries" version. */
    public final String[] pron;

    /**
     * The traditional conversion - In this version, also, characters are grouped by
     * "Word Boundaries"
     */
    public final String[] trad;

    /**
     * These are the English Translations of each word; some Chinese Words are translated into
     * multiple English words.
     */
    public final String[] engl;

    /**
     * This is a 1-to-1 mapping between the Chinese characters in the query, and their Mandarin
     * pronunciation.  Specifically, each element in this array contains a single ('one') syllable
     * of pinyin romanization - a.k.a. one Chinese character's pronunciation.  The array
     * {@code final String[] pron} contains as many syllables as the "Grouped By" Chinese Word
     * does.
     */
    public final String[] pronSINGLE_WORD;

    /**
     * This is just an array of integers that was actually returned from the server.  It identifies
     * the number of Chinese Characters that are in each Chinese Word.
     */
    public final int[] wordGroups;

    /**
     * Unless specifically requested, this variable is null. It is the JSON as a String returned
     * from the server
     */
    public final String JSON;

    /**
     * Run the translation from the command-line.
     *
     * @param argv When exactly one argument is given, it is used as the query text; otherwise
     * a built-in sample sentence is translated.
     *
     * @throws IOException propagated from {@link #getResponse(String, boolean)}
     */
    public static void main(String[] argv) throws IOException
    {
        String query = (argv.length == 1) ? argv[0] : sampleText;

        System.out.println(getResponse(query, true).toString());
    }

    /**
     * This translates a sentence from Mandarin into English.  The response object is an instance
     * of this exact class.
     * 
     * @param chineseText This is a text String input, and is expected to be in Mandarin.  The
     * server does not seem to mind whether the input is in "Simplified" or "Traditional."
     * 
     * @param saveJSON If this is true, the field {@code public final String JSON} will contain the
     * original JSON string retrieved from the server.  If this is FALSE, the JSON field will be
     * null.
     * 
     * @return An instance of this class, with all of the constant {@code final} fields filled in.
     *
     * @throws IOException if retrieving the page fails
     * @throws IllegalStateException if the server's reply is not matched by the reply pattern
     */
    public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException
    { return new Pin1Yin1(chineseText, saveJSON); }

    /**
     * Queries the server and fills in every field from the parsed reply.
     *
     * @param chineseText the text to convert
     * @param saveJSON whether the raw reply should be retained in {@link #JSON}
     *
     * @throws IOException if retrieving the page fails
     * @throws IllegalStateException if the reply is not matched by {@code P1}
     */
    private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException
    {
        // The query is user text placed in a URL query-string, so it must be percent-encoded:
        // raw non-ASCII characters, spaces, '&' or '#' would otherwise corrupt the request.
        // NOTE(review): assumes Scrape.scrapePage does not escape the URL itself - confirm.
        String  encodedQuery    = URLEncoder.encode(chineseText, "UTF-8");
        String  receivedJSON    = Scrape.scrapePage(PIN1_YIN1_URL + encodedQuery).toString();
        Matcher m               = P1.matcher(receivedJSON);

        if (! m.find()) throw new IllegalStateException
            ("The regular expression didn't match the response from pin1yin1.com");

        JSON            = saveJSON ? receivedJSON : null;

        // Capture group 1 of the reply pattern is not needed for any field.
        simpSENTENCE    = m.group(2);
        tradSENTENCE    = m.group(3);
        pronSINGLE_WORD = parseQuotationJSONArray(m.group(4)).toArray(new String[0]);
        engl            = parseQuotationJSONArray(m.group(5)).toArray(new String[0]);
        wordGroups      = parseIntJSONArray(m.group(6));

        String[][] matched = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups);

        simp = matched[0];
        trad = matched[1];
        pron = matched[2];
    }

    /**
     * This converts {@code 'this'} result to a {@code java.lang.String}.
     *
     * <BR /><BR />The raw JSON (when it was saved) comes first, followed by the two sentences,
     * the three comma-separated word lists, and the bracketed English translations.
     */
    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder();

        if (JSON != null) sb.append(JSON).append('\n');

        sb.append("Simp: ").append(simpSENTENCE).append('\n');
        sb.append("Trad: ").append(tradSENTENCE);

        // String.join never emits a trailing separator, so an empty array simply yields an
        // empty list instead of eating the last two characters of the label.
        sb.append("\nSimp-Words:\t").append(String.join(", ", simp));
        sb.append("\nPron-Words:\t").append(String.join(", ", pron));
        sb.append("\nTrad-Words:\t").append(String.join(", ", trad));

        sb.append("\nEngl-Words:\t");
        for (String s : engl) sb.append('[').append(s).append("] ");

        return sb.toString();
    }

    /**
     * Collects capture group 1 of every {@code P2} match found in the given text.
     *
     * @param lineJSON the portion of the reply to scan
     * @return the captured strings, in the order in which they were matched
     */
    private static List<String> parseQuotationJSONArray(String lineJSON)
    {
        List<String> ret = new ArrayList<>();
        Matcher m = P2.matcher(lineJSON);
        while (m.find()) ret.add(m.group(1));
        return ret;
    }

    /**
     * Collects capture group 1 of every {@code P3} match found in the given text, parsed as an
     * {@code int}.
     *
     * @param lineJSON the portion of the reply to scan
     * @return the parsed integers, in the order in which they were matched
     * @throws NumberFormatException if a captured group is not a parsable {@code int}
     */
    private static int[] parseIntJSONArray(String lineJSON)
    {
        IntStream.Builder b = IntStream.builder();
        Matcher m = P3.matcher(lineJSON);
        while (m.find()) b.add(Integer.parseInt(m.group(1)));
        return b.build().toArray();
    }

    /**
     * Matches the characters together.
     * 
     * @param simp Simplified Characters
     * @param trad Traditional Characters
     * @param pron Pronunciation, one array element per character
     * @param intMatchArr The number of characters that belong to each word
     * 
     * @return Matches up the characters and pronunciation.
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI>
     * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI>
     * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI>
     * </UL>
     *
     * @throws IndexOutOfBoundsException if the word lengths add up to more than the length of
     * either sentence or of the pronunciation array
     */
    private static String[][] matchThemUp
        (String simp, String trad, String[] pron, int[] intMatchArr)
    {
        return new String[][]
        {
            groupCharacters(simp, intMatchArr),  // Simplified characters on word boundaries
            groupCharacters(trad, intMatchArr),  // Traditional characters on word boundaries
            groupSyllables(pron, intMatchArr)    // Pinyin syllables on word boundaries
        };
    }

    /**
     * Cuts a sentence into consecutive words of the given lengths.  Lengths are counted in
     * {@code char} units, exactly as {@code String.substring} counts them.
     */
    private static String[] groupCharacters(String sentence, int[] wordLengths)
    {
        String[]    words   = new String[wordLengths.length];
        int         pos     = 0;

        for (int i=0; i < wordLengths.length; i++)
        {
            int end = pos + wordLengths[i];
            words[i] = sentence.substring(pos, end);
            pos = end;
        }

        return words;
    }

    /** Concatenates consecutive runs of syllables, one run per word length, into words. */
    private static String[] groupSyllables(String[] syllables, int[] wordLengths)
    {
        String[]        words   = new String[wordLengths.length];
        StringBuilder   word    = new StringBuilder();
        int             pos     = 0;

        for (int i=0; i < wordLengths.length; i++)
        {
            word.setLength(0);  // re-use the same builder for each word

            for (int j=0; j < wordLengths[i]; j++) word.append(syllables[pos + j]);

            pos += wordLengths[i];
            words[i] = word.toString();
        }

        return words;
    }
}