// Pin1Yin1.java.html

package Torello.Languages;

import java.util.*;
import java.util.regex.*;
import java.io.*;

import java.util.stream.IntStream;

import Torello.HTML.*;
import Torello.Java.*;

/**
 * Pin1Yin1 (罗马拼音 - Online Internet Translation Service).
 *
 * <BR /><BR />An instance holds the parsed reply of one query sent to {@code pin1yin1.com}:
 * the query sentence in Simplified and Traditional characters, the per-character pinyin, the
 * English translations, and the same data grouped into words.  Instances are obtained from
 * {@link #getResponse(String, boolean)}.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=P1Y1>
 */
public class Pin1Yin1
{
    /** PinYin {@code URL} */
    public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c=";

    /** Query used by {@link #main(String[])} when exactly one argument was not supplied. */
    private static final String SAMPLE_TEXT = "中国政府网_中央人民政府门户网站";

    /**
     * {@code pin1yin1.com} JSON parse: {@code P1} matches the JSON reply, {@code P2} matches
     * words wrapped in quotation marks, and {@code P3} matches the elements of an integer array.
     */
    private static final Pattern P1, P2, P3;

    static
    {
        // If boot-strapping a data-file, put these four lines in comments.
        Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP
            (Pin1Yin1.class, "data-files/Pin1Yin1.sdat");

        P1 = v.elementAt(0); // The JSON reply
        P2 = v.elementAt(1); // Words wrapped in quotation marks
        P3 = v.elementAt(2); // Integer array
    }

    /**
     * This is the original query-string - <I>in Simplified Chinese</I>.
     *
     * <BR /><BR /><B>NOTE:</B> The accuracy of the traditional/simplified conversion is not
     * known, and it is probably not exact.
     */
    public final String simpSENTENCE;

    /** This is the original query-string in Traditional Chinese */
    public final String tradSENTENCE;

    /**
     * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries"
     */
    public final String[] simp;

    /** The Pronunciation of each element in the "Word Boundaries" version. */
    public final String[] pron;

    /**
     * The traditional conversion - In this version, also, characters are grouped by
     * "Word Boundaries"
     */
    public final String[] trad;

    /**
     * These are the English Translations of each word; some Chinese Words are translated into
     * multiple English words.
     */
    public final String[] engl;

    /**
     * This is a 1-to-1 mapping between the Chinese characters in the query, and their Mandarin
     * pronunciation.  Specifically, each element in this array contains a single ('one') syllable
     * of pinyin romanization - a.k.a. one Chinese character's pronunciation.  By contrast, each
     * element of {@code final String[] pron} contains as many syllables as its "Grouped By"
     * Chinese Word has characters.
     */
    public final String[] pronSINGLE_WORD;

    /**
     * This is the array of integers that was returned from the server.  It identifies the number
     * of Chinese Characters that are in each Chinese Word.
     */
    public final int[] wordGroups;

    /**
     * Unless specifically requested, this variable is null. It is the JSON as a String returned
     * from the server
     */
    public final String JSON;

    /**
     * Run the translation from the command-line.
     *
     * @param argv When exactly one argument is given it is used as the query; otherwise a
     * built-in sample sentence is translated.
     *
     * @throws IOException propagated from {@link #getResponse(String, boolean)}
     */
    public static void main(String[] argv) throws IOException
    {
        String query = (argv.length == 1) ? argv[0] : SAMPLE_TEXT;

        System.out.println(getResponse(query, true));
    }

    /**
     * This translates a sentence from Mandarin into English.  The response object is an instance
     * of this exact class.
     *
     * @param chineseText This is a text String input, and is expected to be in Mandarin.  The
     * server does not seem to mind whether the input is in "Simplified" or "Traditional."  The
     * text is percent-encoded (UTF-8) before it is appended to {@link #PIN1_YIN1_URL}, so it
     * must be passed <I>un-encoded</I>.
     *
     * @param saveJSON If this is true, the field {@code public final String JSON} will contain the
     * original JSON string retrieved from the server.  If this is FALSE, the JSON field will be
     * null.
     *
     * @return An instance of this class, with all of the constant {@code final} fields filled in.
     *
     * @throws IOException propagated from the page retrieval
     * @throws IllegalStateException if the reply does not match the expected JSON pattern
     * @throws NullPointerException if {@code chineseText} is null
     */
    public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException
    { return new Pin1Yin1(chineseText, saveJSON); }

    /**
     * Sends the query, matches the reply against {@code P1}, and fills in every field from the
     * match groups.
     */
    private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException
    {
        Objects.requireNonNull(chineseText, "chineseText");

        // The text goes into a URL query parameter, so it has to be percent-encoded; otherwise
        // characters such as ' ', '&', '#' and '%' (or a non-UTF-8 platform encoding) would
        // corrupt the request.
        String  query           = java.net.URLEncoder.encode(chineseText, "UTF-8");
        String  receivedJSON    = Scrape.scrapePage(PIN1_YIN1_URL + query).toString();
        Matcher m               = P1.matcher(receivedJSON);

        if (! m.find()) throw new IllegalStateException
            ("The regular expression didn't match the response from pin1yin1.com");

        // NOTE: group(1) of P1 is not needed by this class, so it is not read.
        JSON            = saveJSON ? receivedJSON : null;
        simpSENTENCE    = m.group(2);
        tradSENTENCE    = m.group(3);
        pronSINGLE_WORD = parseQuotationJSONArray(m.group(4));
        engl            = parseQuotationJSONArray(m.group(5));
        wordGroups      = parseIntJSONArray(m.group(6));

        String[][] matched = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups);
        simp = matched[0];
        trad = matched[1];
        pron = matched[2];
    }

    /**
     * This converts {@code 'this'} result to a {@code java.lang.String}.  The saved JSON (when
     * present) comes first, followed by both sentences, the three word-grouped lists separated
     * by comma-space, and finally each English translation wrapped in square brackets.
     */
    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder();

        if (JSON != null) sb.append(JSON).append('\n');

        sb.append("Simp: ").append(simpSENTENCE).append("\nTrad: ").append(tradSENTENCE);

        // String.join never leaves a trailing separator, so nothing has to be trimmed afterwards
        // (trimming two characters would eat part of the label when a list is empty).
        sb.append("\nSimp-Words:\t").append(String.join(", ", simp));
        sb.append("\nPron-Words:\t").append(String.join(", ", pron));
        sb.append("\nTrad-Words:\t").append(String.join(", ", trad));

        sb.append("\nEngl-Words:\t");
        for (String s : engl) sb.append('[').append(s).append("] ");

        return sb.toString();
    }

    /** Collects group #1 of every {@code P2} match found in {@code lineJSON}, in order. */
    private static String[] parseQuotationJSONArray(String lineJSON)
    {
        List<String>    found   = new ArrayList<>();
        Matcher         m       = P2.matcher(lineJSON);

        while (m.find()) found.add(m.group(1));

        return found.toArray(new String[0]);
    }

    /** Parses group #1 of every {@code P3} match found in {@code lineJSON} as an {@code int}. */
    private static int[] parseIntJSONArray(String lineJSON)
    {
        IntStream.Builder   b = IntStream.builder();
        Matcher             m = P3.matcher(lineJSON);

        while (m.find()) b.add(Integer.parseInt(m.group(1)));

        return b.build().toArray();
    }

    /**
     * Matches the characters together.
     *
     * @param simp Simplified Characters
     * @param trad Traditional Characters
     * @param pron Pronunciation, one syllable per element
     * @param intMatchArr The number of characters in each word, in sentence order
     *
     * @return Matches up the characters and pronunciation.
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI>
     * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI>
     * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI>
     * </UL>
     */
    private static String[][] matchThemUp
        (String simp, String trad, String[] pron, int[] intMatchArr)
    {
        return new String[][]
        {
            groupCharacters(simp, intMatchArr),
            groupCharacters(trad, intMatchArr),
            groupSyllables(pron, intMatchArr)
        };
    }

    /**
     * Cuts {@code sentence} into consecutive pieces whose lengths are given by
     * {@code groupSizes}.  Lengths are counted in {@code char} units.
     */
    private static String[] groupCharacters(String sentence, int[] groupSizes)
    {
        String[]    words   = new String[groupSizes.length];
        int         pos     = 0;

        for (int i=0; i < groupSizes.length; i++)
        {
            int end = pos + groupSizes[i];
            words[i] = sentence.substring(pos, end);
            pos = end;
        }

        return words;
    }

    /**
     * Concatenates consecutive runs of {@code syllables} (no separator) into one String per
     * word; the run lengths are given by {@code groupSizes}.
     */
    private static String[] groupSyllables(String[] syllables, int[] groupSizes)
    {
        String[]        words   = new String[groupSizes.length];
        StringBuilder   word    = new StringBuilder();
        int             pos     = 0;

        for (int i=0; i < groupSizes.length; i++)
        {
            word.setLength(0);
            for (int j=0; j < groupSizes[i]; j++) word.append(syllables[pos + j]);

            words[i] = word.toString();
            pos += groupSizes[i];
        }

        return words;
    }
}