001package Torello.Languages;
002
003import java.util.*;
004import java.util.regex.*;
005import java.io.*;
006
007import java.util.stream.IntStream;
008
009import Torello.HTML.*;
010import Torello.Java.*;
011
012/**
013 * Pin1Yin1 (罗马拼音 - Online Internet Translation Service).
014 * 
015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=P1Y1>
016 */
017public class Pin1Yin1
018{
019    /** PinYin {@code URL} */
020    public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c=";
021
022    private static final String sampleText = "中国政府网_中央人民政府门户网站";
023
024    /** {@code pin1yin1.com} JSON parse */
025    private static final Pattern P1, P2, P3;
026
027    static
028    { 
029        // If boot-strapping a data-file, put these four lines in comments.
030        Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP
031            (Pin1Yin1.class, "data-files/Pin1Yin1.sdat");
032
033        P1 = v.elementAt(0); // The JSON reply
034        P2 = v.elementAt(1); // Words wrapped in quotation marks
035        P3 = v.elementAt(2); // Integer array
036    }
037
038    /**
039     * This is the original query-string - <I>in Simplified Chinese</I>.
040     *
041     * <BR /><BR /><B>NOTE:</B> How accurate the traditional/simplified
042     * conversion is is not known, and probably not exact.
043     */
044    public final String simpSENTENCE;
045
046    /** This is the original query-string in Traditional Chinese */
047    public final String tradSENTENCE;
048
049    /**
050     * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries"
051     */
052    public final String[] simp;
053
054    /** The Pronunciation of each element in the "Word Boundaries" version. */
055    public final String[] pron;
056
057    /**
058     * The traditional conversion - In this version, also, characters are grouped by
059     * "Word Boundaries"
060     */
061    public final String[] trad;
062
063    /**
064     * These are the English Translations of each word; some Chinese Words are translated into
065     * multiple english words.
066     */
067    public final String[] engl;
068
069    /**
070     * This is a 1-to-1 mapping between the chinese characters in the query, and their mandarin
071     * pronunciation.  Specifically, each element in this array contains a single ('one') syllable
072     * of pinyin romanization - a.k.a. one chinese character's pronunciation.  The array
073     * {@code final String[] pron} contains as many syllables as the "Grouped By" Chinese Word
074     * does.
075     */
076    public final String[] pronSINGLE_WORD;
077
078    /**
079     * This is just an array of integers that was actually returned from the server.  It identifies
080     * the number of Chinese Characters that are in each Chinese Word.
081     */
082    public final int[] wordGroups;
083
084    /**
085     * Unless specifically requested, this variable is null. It is the JSON as a String returned
086     * from the server
087     */
088    public final String JSON;
089
090    /** Run the translation from the command-line. */
091    public static void main(String[] argv) throws IOException
092    {
093        // System.out.println("P1: " + P1 + "\nP2: " + P2 + "\nP3: " + P3);
094        // System.exit(1);
095
096        System.out.println(
097            getResponse(
098                ((argv.length == 1) ? argv[0] : sampleText),
099                true
100            )
101            .toString()
102        );
103    }
104
105    /**
106     * This translates a sentence from Mandarin into English.  The response object is an instance
107     * of this exact class.
108     * 
109     * @param chineseText This is a text String input, and is expected to be in Mandarin.  The
110     * server  does not seem to mind whether the input is in "Simplified" or "Traditional."
111     * 
112     * @param saveJSON If this is true, the field {@code public final String JSON} will contain the
113     * original JSON  string retrieved from the server.  If this is FALSE, the JSON field will be
114     * null.
115     * 
116     * @return An instance of this class, with all of the constant {@code final} fields filled in.
117     */
118    public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException
119    { return new Pin1Yin1(chineseText, saveJSON); }
120
121    private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException
122    {
123        String  receivedJSON    = Scrape.scrapePage(PIN1_YIN1_URL + chineseText).toString();
124        Matcher m               = P1.matcher(receivedJSON);
125
126        if (! m.find()) throw new IllegalStateException
127            ("The regular expression didn't match the response from pin1yin1.com");
128
129        JSON                = saveJSON ? receivedJSON : null;
130        String question     = m.group(1);
131        simpSENTENCE        = m.group(2);
132        tradSENTENCE        = m.group(3);
133        pronSINGLE_WORD     = parseQuotationJSONArray(m.group(4)).toArray(new String[0]);
134        engl                = parseQuotationJSONArray(m.group(5)).toArray(new String[0]);
135        wordGroups          = parseIntJSONArray(m.group(6));
136
137        String[][] matched  = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups);
138        simp                = matched[0];
139        trad                = matched[1];
140        pron                = matched[2];
141    }
142
143    /** This converts {@code 'this'} result to a {@code java.lang.String} */
144    public String toString()
145    {
146        StringBuilder sb = new StringBuilder();
147        if (JSON != null) sb.append(JSON + "\n");
148        sb.append("Simp: " + simpSENTENCE + "\n" + "Trad: " + tradSENTENCE);
149
150        sb.append("\nSimp-Words:\t");
151        for (String s : simp) sb.append(s + ", ");
152        int len = sb.length();
153        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
154
155        sb.append("\nPron-Words:\t");
156        for (String s : pron) sb.append(s + ", ");
157        len = sb.length();
158        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
159
160        sb.append("\nTrad-Words:\t");
161        for (String s : trad) sb.append(s + ", ");
162        len = sb.length();
163        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
164
165        sb.append("\nEngl-Words:\t");
166        for (String s : engl) sb.append("[" + s + "] ");
167
168        return sb.toString();
169    }
170
171    private static Vector<String> parseQuotationJSONArray(String lineJSON)
172    {
173        Vector<String> ret = new Vector<String>();
174        Matcher m = P2.matcher(lineJSON);
175        while (m.find()) ret.addElement(m.group(1));
176        return ret;
177    }
178
179    private static int[] parseIntJSONArray(String lineJSON)
180    {
181        IntStream.Builder b = IntStream.builder();
182        Matcher m = P3.matcher(lineJSON);
183        while (m.find()) b.add(Integer.parseInt(m.group(1)));
184        return b.build().toArray(); 
185    }
186
187    /**
188     * Matches the characters together.
189     * 
190     * @param simp Simplified Character's
191     * @param trad Traditional Charater's
192     * @param pron Pronunciation
193     * @param intMatchArr m
194     * 
195     * @return Matches up the characters and pronunciation.
196     * 
197     * <BR /><BR /><UL CLASS=JDUL>
198     * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI>
199     * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI>
200     * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI>
201     * <UL>
202     */
203    private static String[][] matchThemUp
204        (String simp, String trad, String[] pron, int[] intMatchArr)
205    {
206        String          tempStr = "";
207        Vector<String>  tempVec = new Vector<String>();
208        String[][]      ret     = new String[3][0];
209        int             pos     = 0;
210
211        // Line up the Simplified characters into word boundaries
212        for (int numChineseChars : intMatchArr)
213        {
214            for (int i=0; i < numChineseChars; i++) tempStr += simp.charAt(pos + i);
215            pos += numChineseChars;
216            tempVec.addElement(tempStr);
217            tempStr = "";
218        }
219        ret[0] = tempVec.toArray(ret[0]);
220
221        // Line up the Traditional Characters into word boundaries
222        pos = 0;
223        tempVec.removeAllElements();
224        for (int numChineseChars : intMatchArr)
225        {
226            for (int i=0; i < numChineseChars; i++) tempStr += trad.charAt(pos + i);
227            pos += numChineseChars;
228            tempVec.addElement(tempStr);
229            tempStr = "";
230        }
231        ret[1] = tempVec.toArray(ret[1]);
232
233        // Lne up the Pronunciation "PinYin-Characters" into word boundaries
234        pos = 0;
235        tempVec.removeAllElements();
236        for (int numChineseChars : intMatchArr)
237        {
238            for (int i=0; i < numChineseChars; i++) tempStr += pron[pos + i];
239            pos += numChineseChars;
240            tempVec.addElement(tempStr);
241            tempStr = "";
242        }
243        ret[2] = tempVec.toArray(ret[2]);
244
245        return ret;
246    }
247}