001package Torello.Languages;
002
003import java.io.*;
004import java.util.*;
005
006/**
007 * PinYinParse (罗马拼音).
008 * 
009 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="PYP">
010 */
011@Torello.HTML.Tools.JavaDoc.StaticFunctional
012public class PinYinParse
013{
014    private PinYinParse() { }
015
016    /**
017     * The purpose of this is produce the Parallel arrays (Vector<String>) which contain
018     * Chinese Characters and Chinese PinYin based on the results of the Google Translate Query.
019     *
020     * <BR /><BR /><B>NOTE: </B> This is of "limited use" - since primarily the input to this
021     * function is a String that is scraped from the Google Translate Website, not from a query to
022     * Google Cloud Server's Translate-API.  The API version of Mandarin Translations literally
023     * leaves out the Pin-Yin Romanizations, and makes the entire package a lot less useable.  The
024     * web-site itself can be scraped, and the Pin-Yin obtained, but that String comes from a
025     * web-site that changes from time-to-time. 
026     * 
027     * <BR /><BR /><B>NOTE: </B> If scraping Google's Translate Web-site conjurs images of the
028     * police coming to your door, another web-site that seems to do pretty good Romanization is
029     * Pin1Yin1.com.  I have another class that scrapes that site.
030     *
031     * @param DOUT This is filled up with Debug Information as this class is run.  It may be any
032     * implementation of java's {@code java.lang.Appendable} interface.
033     * 
034     * @param simpSentence This is the complete simplified-Mandarin sentence obtained from
035     * news-article.
036     * 
037     * @param pronSentence This is the pronunciation of the simplified-Mandarin sentence.  This
038     * should have already been obtained from Google Translate.
039     * 
040     * @param characters This should be an empty vector.  It will be populated by the words from
041     * the original Mandarin sentence, based on the pronunciation obtained from Google Translate.
042     * 
043     * @param pronunciation This should also be an empty vector.  It will be populated after the
044     * words from the pronunciation sentence have been parsed into individual words.
045     * 
046     * @return boolean This is true if there was possibly an error along the way.
047     * The specific requirements for the boolean value are: <BR />
048     * {@code (cSent.length() != totalChinese) && (totalChinese > 0);}
049     * 
050     * @throws IOException The {@code interface java.lang.Appendable} mandates that the
051     * {@code IOException} must be treated as a checked exception for all output operations.
052     * Therefore {@code IOException} is a required exception in this method' throws clause.
053     */
054    public static boolean parse(
055            Appendable DOUT, 
056            String simpSentence, String pronSentence,
057            Vector<String> characters, Vector<String> pronunciation
058        )
059        throws IOException
060    {
061        int totalChinese = 0;
062        DOUT.append("********************************************\n");
063        DOUT.append("chin = " + simpSentence + "\n");
064        DOUT.append("pron = " + pronSentence + "\n");
065        String cSent = ZH.convertAnyAUC(simpSentence); // remove "alternate" (AUC) versions of A...Z or 0..9 are there..
066
067        // CHANGED 2018.09.24 - dellAllPunctuation does not remove '.' and ',' between numbers!
068        String pSent = ZH.delAllPunctuationPINYIN(pronSentence);
069
070        cSent = ZH.delAllPunctuationCHINESE(cSent);
071
072        DOUT.append("********************************************\n");
073        DOUT.append("After Removing non-alphanumeric UniCode, and Alt-UniCode:\n");
074        DOUT.append("cSent=" + cSent + "\n");
075        DOUT.append("pSent=" + pSent + "\n");
076        DOUT.append("********************************************\n");
077
078        // Leading or ending blanks messes this up
079        // *** Use trim()
080        String[] pWords = pSent.trim().split(" ");
081
082        for (int i = 0; i < pWords.length; i++)
083        {
084            String pronWord = pWords[i].trim();
085
086            if (pronWord.length() == 0) continue;
087
088            // Sometimes alphabetic characters appear in the chinese string.
089            int leading = ZH.countLeadingLettersAndNumbers(cSent.substring(totalChinese));
090            if (leading > 0)
091            {
092                String alphaNumericASCII = cSent.substring(totalChinese, totalChinese + leading);
093
094                DOUT.append("*** Found English and Numbers ASCII in Chinese Sentence ***\n");
095                DOUT.append("There are " + leading + " leading alpha numeric characters.");
096                DOUT.append(" [" + alphaNumericASCII + "]\n");
097                DOUT.append("pronunciation word is: [" + pronWord + "]\n");
098
099                pronunciation.add(pronWord);
100                characters.add(alphaNumericASCII);
101
102                totalChinese += leading;
103            }
104            // else - it's just normal characters in the chinese string
105            else
106            {
107                int numChinese      = ZH.countSyllablesAndNonChinese(pronWord, DOUT);
108                String chineseWord  = cSent.substring(totalChinese, totalChinese + numChinese);
109                          
110                DOUT.append("The word [" + pronWord + "] ");
111                DOUT.append("corresponds to " + numChinese + " Unicode Characters ");
112                DOUT.append("[" + chineseWord + "]\n");
113
114                // Add the new word to the list
115                pronunciation.add(pronWord);
116                characters.add(chineseWord);
117
118                totalChinese += numChinese;
119            }
120        }
121
122        DOUT.append(
123            "********************************************\n" +
124            "COMPLETED SENTENCE LOOP\n" +
125            "SUMMARY:\n" +
126            "FOUND (" + totalChinese + ") characters in Chinese String\n" +
127            "STRING CONTAINS (" + cSent.length() + ") characters\n" +
128            ((totalChinese != cSent.length()) ? "\nPOSSIBLE ERROR MISMATCH\n\n" : "") +
129            "********************************************\n"
130        );
131
132        return (cSent.length() != totalChinese) && (totalChinese > 0);
133    }
134}