Source code

001package Torello.Languages;
002
003import java.io.*;
004import java.util.*;
005
006/**
007 * PinYinParse (罗马拼音).
008 * 
009 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PYP>
010 */
011@Torello.JavaDoc.StaticFunctional
012public class PinYinParse
013{
014    private PinYinParse() { }
015
016    /**
017     * The purpose of this is produce the Parallel arrays (Vector<String>) which contain
018     * Chinese Characters and Chinese PinYin based on the results of the Google Translate Query.
019     *
020     * <BR /><BR /><B CLASS=JDDescLabel>Scrape, non-API Invoation:</B>
021     * 
022     * <BR />This is of "limited use" - since primarily the input to this function is a
023     * {@code String} that has been scraped from the <B>{@code Google Translate Website}</B>, not
024     * a {@code String} from a query to Google Cloud Server's <B>{@code Translate-API}</B>.
025     * 
026     * <BR /><BR />The API version of Mandarin Translations literally leaves out the Pin-Yin
027     * Romanizations, and makes the entire package a lot less useable.  The web-site itself can be
028     * scraped, and the Pin-Yin obtained, but that String comes from a web-site that changes from
029     * time-to-time. 
030     * 
031     * <BR /><BR /><B CLASS=JDDescLabel>Using a Bot:</B>
032     * 
033     * <BR />If scraping Google's Translate Web-site conjurs images of the police coming to your
034     * door, another web-site that seems to do pretty good Romanization is Pin1Yin1.com.  I have
035     * another class that scrapes that site.
036     *
037     * @param DOUT This is filled up with Debug Information as this class is run.  It may be any
038     * implementation of java's {@code java.lang.Appendable} interface.
039     * 
040     * @param simpSentence This is the complete simplified-Mandarin sentence obtained from
041     * news-article.
042     * 
043     * @param pronSentence This is the pronunciation of the simplified-Mandarin sentence.  This
044     * should have already been obtained from Google Translate.
045     * 
046     * @param characters This should be an empty vector.  It will be populated by the words from
047     * the original Mandarin sentence, based on the pronunciation obtained from Google Translate.
048     * 
049     * @param pronunciation This should also be an empty vector.  It will be populated after the
050     * words from the pronunciation sentence have been parsed into individual words.
051     * 
052     * @return boolean This is true if there was possibly an error along the way.
053     * The specific requirements for the boolean value are: <BR />
054     * {@code (cSent.length() != totalChinese) && (totalChinese > 0);}
055     * 
056     * @throws IOException The {@code interface java.lang.Appendable} mandates that the
057     * {@code IOException} must be treated as a checked exception for all output operations.
058     * Therefore {@code IOException} is a required exception in this method' throws clause.
059     */
060    public static boolean parse(
061            Appendable      DOUT, 
062            String          simpSentence,
063            String          pronSentence,
064            Vector<String>  characters,
065            Vector<String>  pronunciation
066        )
067        throws IOException
068    {
069        int totalChinese = 0;
070        DOUT.append("********************************************\n");
071        DOUT.append("chin = " + simpSentence + "\n");
072        DOUT.append("pron = " + pronSentence + "\n");
073
074        // remove "alternate" (AUC) versions of A...Z or 0..9 are there..
075        String cSent = ZH.convertAnyAUC(simpSentence);
076
077        // CHANGED 2018.09.24 - dellAllPunctuation does not remove '.' and ',' between numbers!
078        String pSent = ZH.delAllPunctuationPINYIN(pronSentence);
079
080        cSent = ZH.delAllPunctuationCHINESE(cSent);
081
082        DOUT.append("********************************************\n");
083        DOUT.append("After Removing non-alphanumeric UniCode, and Alt-UniCode:\n");
084        DOUT.append("cSent=" + cSent + "\n");
085        DOUT.append("pSent=" + pSent + "\n");
086        DOUT.append("********************************************\n");
087
088        // Leading or ending blanks messes this up
089        // *** Use trim()
090
091        String[] pWords = pSent.trim().split(" ");
092
093        for (int i = 0; i < pWords.length; i++)
094        {
095            String pronWord = pWords[i].trim();
096
097            if (pronWord.length() == 0) continue;
098
099            // Sometimes alphabetic characters appear in the chinese string.
100            int leading = ZH.countLeadingLettersAndNumbers(cSent.substring(totalChinese));
101
102            if (leading > 0)
103            {
104                String alphaNumericASCII = cSent.substring(totalChinese, totalChinese + leading);
105
106                DOUT.append("*** Found English and Numbers ASCII in Chinese Sentence ***\n");
107                DOUT.append("There are " + leading + " leading alpha numeric characters.");
108                DOUT.append(" [" + alphaNumericASCII + "]\n");
109                DOUT.append("pronunciation word is: [" + pronWord + "]\n");
110
111                pronunciation.add(pronWord);
112                characters.add(alphaNumericASCII);
113
114                totalChinese += leading;
115            }
116
117            // else - it's just normal characters in the chinese string
118            else
119            {
120                int numChinese      = ZH.countSyllablesAndNonChinese(pronWord, DOUT);
121                String chineseWord  = cSent.substring(totalChinese, totalChinese + numChinese);
122
123                DOUT.append("The word [" + pronWord + "] ");
124                DOUT.append("corresponds to " + numChinese + " Unicode Characters ");
125                DOUT.append("[" + chineseWord + "]\n");
126
127                // Add the new word to the list
128                pronunciation.add(pronWord);
129                characters.add(chineseWord);
130
131                totalChinese += numChinese;
132            }
133        }
134
135        DOUT.append(
136            "********************************************\n" +
137            "COMPLETED SENTENCE LOOP\n" +
138            "SUMMARY:\n" +
139            "FOUND (" + totalChinese + ") characters in Chinese String\n" +
140            "STRING CONTAINS (" + cSent.length() + ") characters\n" +
141            ((totalChinese != cSent.length()) ? "\nPOSSIBLE ERROR MISMATCH\n\n" : "") +
142            "********************************************\n"
143        );
144
145        return (cSent.length() != totalChinese) && (totalChinese > 0);
146    }
147}