001package Torello.Languages;
002
003import java.util.*;
004import java.util.regex.*;
005import java.io.*;
006
007import java.util.stream.IntStream;
008
009import Torello.HTML.*;
010import Torello.Java.*;
011
012/**
013 * Pin1Yin1 (罗马拼音 - Online Internet Translation Service).
014 * 
015 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="P1Y1">
016 */
017public class Pin1Yin1
018{
019    /** PinYin {@code URL} */
020    public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c=";
021
022    private static final String sampleText = "中国政府网_中央人民政府门户网站";
023
024    /** {@code pin1yin1.com} JSON parse */
025    private static final Pattern P1, P2, P3;
026
027    static
028    { 
029        // If boot-strapping a data-file, put these four lines in comments.
030        Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP
031            (Torello.Data.DataFileLoader.class, "data09.sdat");
032
033        P1 = v.elementAt(0); // The JSON reply
034        P2 = v.elementAt(1); // Words wrapped in quotation marks
035        P3 = v.elementAt(2); // Integer array
036    }
037
038    /**
039     * This is the original query-string - <I>in Simplified Chinese</I>.
040     *
041     * <BR /><BR /><B>NOTE:</B> How accurate the traditional/simplified
042     * conversion is is not known, and probably not exact.
043     */
044    public final String simpSENTENCE;
045
046    /** This is the original query-string in Traditional Chinese */
047    public final String tradSENTENCE;
048
049    /**
050     * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries"
051     */
052    public final String[] simp;
053
054    /** The Pronunciation of each element in the "Word Boundaries" version. */
055    public final String[] pron;
056
057    /**
058     * The traditional conversion - In this version, also, characters are grouped by
059     * "Word Boundaries"
060     */
061    public final String[] trad;
062
063    /**
064     * These are the English Translations of each word; some Chinese Words are translated into
065     * multiple english words.
066     */
067    public final String[] engl;
068
069    /**
070     * This is a 1-to-1 mapping between the chinese characters in the query, and their mandarin
071     * pronunciation.  Specifically, each element in this array contains a single ('one') syllable
072     * of pinyin romanization - a.k.a. one chinese character's pronunciation.  The array
073     * {@code final String[] pron} contains as many syllables as the "Grouped By" Chinese Word
074     * does.
075     */
076    public final String[] pronSINGLE_WORD;
077
078    /**
079     * This is just an array of integers that was actually returned from the server.  It identifies
080     * the number of Chinese Characters that are in each Chinese Word.
081     */
082    public final int[] wordGroups;
083
084    /**
085     * Unless specifically requested, this variable is null. It is the JSON as a String returned
086     * from the server
087     */
088    public final String JSON;
089
090    /** Run the translation from the command-line. */
091    public static void main(String[] argv) throws IOException
092    { System.out.println(getResponse((argv.length == 1) ? argv[0] : sampleText, true).toString()); }
093
094    /**
095     * This translates a sentence from Mandarin into English.  The response object is an instance
096     * of this exact class.
097     * 
098     * @param chineseText This is a text String input, and is expected to be in Mandarin.  The
099     * server  does not seem to mind whether the input is in "Simplified" or "Traditional."
100     * 
101     * @param saveJSON If this is true, the field {@code public final String JSON} will contain the
102     * original JSON  string retrieved from the server.  If this is FALSE, the JSON field will be
103     * null.
104     * 
105     * @return An instance of this class, with all of the constant {@code final} fields filled in.
106     */
107    public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException
108    { return new Pin1Yin1(chineseText, saveJSON); }
109
110    private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException
111    {
112        String  receivedJSON    = Scrape.scrapePage(PIN1_YIN1_URL + chineseText).toString();
113        Matcher m               = P1.matcher(receivedJSON);
114
115        if (! m.find()) throw new IllegalStateException
116            ("The regular expression didn't match the response from pin1yin1.com");
117
118        JSON                = saveJSON ? receivedJSON : null;
119        String question     = m.group(1);
120        simpSENTENCE        = m.group(2);
121        tradSENTENCE        = m.group(3);
122        pronSINGLE_WORD     = parseQuotationJSONArray(m.group(4)).toArray(new String[0]);
123        engl                = parseQuotationJSONArray(m.group(5)).toArray(new String[0]);
124        wordGroups          = parseIntJSONArray(m.group(6));
125
126        String[][] matched  = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups);
127        simp                = matched[0];
128        trad                = matched[1];
129        pron                = matched[2];
130    }
131
132    /** This converts {@code 'this'} result to a {@code java.lang.String} */
133    public String toString()
134    {
135        StringBuilder sb = new StringBuilder();
136        if (JSON != null) sb.append(JSON + "\n");
137        sb.append("Simp: " + simpSENTENCE + "\n" + "Trad: " + tradSENTENCE);
138
139        sb.append("\nSimp-Words:\t");
140        for (String s : simp) sb.append(s + ", ");
141        int len = sb.length();
142        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
143
144        sb.append("\nPron-Words:\t");
145        for (String s : pron) sb.append(s + ", ");
146        len = sb.length();
147        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
148
149        sb.append("\nTrad-Words:\t");
150        for (String s : trad) sb.append(s + ", ");
151        len = sb.length();
152        sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 
153
154        sb.append("\nEngl-Words:\t");
155        for (String s : engl) sb.append("[" + s + "] ");
156
157        return sb.toString();
158    }
159
160    private static Vector<String> parseQuotationJSONArray(String lineJSON)
161    {
162        Vector<String> ret = new Vector<String>();
163        Matcher m = P2.matcher(lineJSON);
164        while (m.find()) ret.addElement(m.group(1));
165        return ret;
166    }
167
168    private static int[] parseIntJSONArray(String lineJSON)
169    {
170        IntStream.Builder b = IntStream.builder();
171        Matcher m = P3.matcher(lineJSON);
172        while (m.find()) b.add(Integer.parseInt(m.group(1)));
173        return b.build().toArray(); 
174    }
175
176    /**
177     * Matches the characters together.
178     * 
179     * @param simp Simplified Character's
180     * @param trad Traditional Charater's
181     * @param pron Pronunciation
182     * @param intMatchArr m
183     * 
184     * @return Matches up the characters and pronunciation.
185     * 
186     * <BR /><BR /><UL CLASS=JDUL>
187     * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI>
188     * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI>
189     * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI>
190     * <UL>
191     */
192    private static String[][] matchThemUp
193        (String simp, String trad, String[] pron, int[] intMatchArr)
194    {
195        String          tempStr = "";
196        Vector<String>  tempVec = new Vector<String>();
197        String[][]      ret     = new String[3][0];
198        int             pos     = 0;
199
200        // Line up the Simplified characters into word boundaries
201        for (int numChineseChars : intMatchArr)
202        {
203            for (int i=0; i < numChineseChars; i++) tempStr += simp.charAt(pos + i);
204            pos += numChineseChars;
205            tempVec.addElement(tempStr);
206            tempStr = "";
207        }
208        ret[0] = tempVec.toArray(ret[0]);
209
210        // Line up the Traditional Characters into word boundaries
211        pos = 0;
212        tempVec.removeAllElements();
213        for (int numChineseChars : intMatchArr)
214        {
215            for (int i=0; i < numChineseChars; i++) tempStr += trad.charAt(pos + i);
216            pos += numChineseChars;
217            tempVec.addElement(tempStr);
218            tempStr = "";
219        }
220        ret[1] = tempVec.toArray(ret[1]);
221
222        // Lne up the Pronunciation "PinYin-Characters" into word boundaries
223        pos = 0;
224        tempVec.removeAllElements();
225        for (int numChineseChars : intMatchArr)
226        {
227            for (int i=0; i < numChineseChars; i++) tempStr += pron[pos + i];
228            pos += numChineseChars;
229            tempVec.addElement(tempStr);
230            tempStr = "";
231        }
232        ret[2] = tempVec.toArray(ret[2]);
233
234        return ret;
235    }
236}