// Source: PinYinParse.java.html

package Torello.Languages;

import java.io.*;
import java.util.*;

/**
 * PinYinParse (罗马拼音).
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PYP>
 */
@Torello.JavaDoc.StaticFunctional
public class PinYinParse
{
    // Static utility class: never instantiated.
    private PinYinParse() { }

    /**
     * Produces two parallel lists ({@code Vector<String>}): one holding the Chinese characters
     * of each word, the other holding the matching Pin-Yin, based on the results of a Google
     * Translate Query.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Scrape, non-API Invocation:</B>
     * 
     * <BR />This is of "limited use" - since primarily the input to this function is a
     * {@code String} that has been scraped from the <B>{@code Google Translate Website}</B>, not
     * a {@code String} from a query to Google Cloud Server's <B>{@code Translate-API}</B>.
     * 
     * <BR /><BR />The API version of Mandarin Translations literally leaves out the Pin-Yin
     * Romanizations, and makes the entire package a lot less usable.  The web-site itself can be
     * scraped, and the Pin-Yin obtained, but that String comes from a web-site that changes from
     * time-to-time. 
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Using a Bot:</B>
     * 
     * <BR />If scraping Google's Translate Web-site conjures images of the police coming to your
     * door, another web-site that seems to do pretty good Romanization is Pin1Yin1.com.  I have
     * another class that scrapes that site.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Mismatched Input:</B>
     * 
     * <BR />If the pronunciation words account for more characters than the cleaned Chinese
     * sentence contains, the character slices are clamped to the end of the sentence (so the
     * affected entries added to {@code characters} may be shortened or empty) rather than an
     * index exception being thrown.  The running character count is not clamped, so this
     * situation is reported through the return value.
     *
     * @param DOUT This is filled up with Debug Information as this class is run.  It may be any
     * implementation of java's {@code java.lang.Appendable} interface.
     * 
     * @param simpSentence This is the complete simplified-Mandarin sentence obtained from a
     * news-article.
     * 
     * @param pronSentence This is the pronunciation of the simplified-Mandarin sentence.  This
     * should have already been obtained from Google Translate.
     * 
     * @param characters This should be an empty vector.  It will be populated by the words from
     * the original Mandarin sentence, based on the pronunciation obtained from Google Translate.
     * 
     * @param pronunciation This should also be an empty vector.  It will be populated after the
     * words from the pronunciation sentence have been parsed into individual words.  One entry
     * is added to this vector for every entry added to {@code characters}.
     * 
     * @return {@code TRUE} if there was possibly an error along the way.
     * The specific requirements for the boolean value are: <BR />
     * {@code (cSent.length() != totalChinese) && (totalChinese > 0);}
     * 
     * @throws IOException The {@code interface java.lang.Appendable} mandates that the
     * {@code IOException} must be treated as a checked exception for all output operations.
     * Therefore {@code IOException} is a required exception in this method's throws clause.
     */
    public static boolean parse(
            Appendable      DOUT, 
            String          simpSentence,
            String          pronSentence,
            Vector<String>  characters,
            Vector<String>  pronunciation
        )
        throws IOException
    {
        // Number of characters of 'cSent' that the pronunciation words have accounted for so
        // far.  Deliberately NOT clamped to cSent.length(): an overrun must remain visible in
        // the summary and in the return value.
        int totalChinese = 0;

        DOUT.append("********************************************\n");
        DOUT.append("chin = " + simpSentence + "\n");
        DOUT.append("pron = " + pronSentence + "\n");

        // Convert any "alternate" (AUC) versions of A...Z or 0..9 that are present.
        String cSent = ZH.convertAnyAUC(simpSentence);

        // CHANGED 2018.09.24 - delAllPunctuation does not remove '.' and ',' between numbers!
        String pSent = ZH.delAllPunctuationPINYIN(pronSentence);

        cSent = ZH.delAllPunctuationCHINESE(cSent);

        DOUT.append("********************************************\n");
        DOUT.append("After Removing non-alphanumeric UniCode, and Alt-UniCode:\n");
        DOUT.append("cSent=" + cSent + "\n");
        DOUT.append("pSent=" + pSent + "\n");
        DOUT.append("********************************************\n");

        // Leading or trailing blanks would produce bogus words, hence the trim() calls.
        for (String rawWord : pSent.trim().split(" "))
        {
            String pronWord = rawWord.trim();

            // Consecutive blanks yield empty tokens; skip them.
            if (pronWord.length() == 0) continue;

            // Sometimes alphabetic characters appear in the chinese string.
            // The remainder is clamped so that an earlier overrun yields "" instead of throwing.
            String  remainder   = clampedSubstring(cSent, totalChinese, cSent.length());
            int     leading     = ZH.countLeadingLettersAndNumbers(remainder);

            if (leading > 0)
            {
                String alphaNumericASCII =
                    clampedSubstring(cSent, totalChinese, totalChinese + leading);

                DOUT.append("*** Found English and Numbers ASCII in Chinese Sentence ***\n");
                DOUT.append("There are " + leading + " leading alpha numeric characters.");
                DOUT.append(" [" + alphaNumericASCII + "]\n");
                DOUT.append("pronunciation word is: [" + pronWord + "]\n");

                pronunciation.add(pronWord);
                characters.add(alphaNumericASCII);

                totalChinese += leading;
            }

            // else - it's just normal characters in the chinese string
            else
            {
                int numChinese = ZH.countSyllablesAndNonChinese(pronWord, DOUT);

                // Clamped: the pronunciation may claim more characters than are left.
                String chineseWord =
                    clampedSubstring(cSent, totalChinese, totalChinese + numChinese);

                DOUT.append("The word [" + pronWord + "] ");
                DOUT.append("corresponds to " + numChinese + " Unicode Characters ");
                DOUT.append("[" + chineseWord + "]\n");

                // Add the new word to the list
                pronunciation.add(pronWord);
                characters.add(chineseWord);

                totalChinese += numChinese;
            }
        }

        DOUT.append(
            "********************************************\n" +
            "COMPLETED SENTENCE LOOP\n" +
            "SUMMARY:\n" +
            "FOUND (" + totalChinese + ") characters in Chinese String\n" +
            "STRING CONTAINS (" + cSent.length() + ") characters\n" +
            ((totalChinese != cSent.length()) ? "\nPOSSIBLE ERROR MISMATCH\n\n" : "") +
            "********************************************\n"
        );

        return (cSent.length() != totalChinese) && (totalChinese > 0);
    }

    /**
     * Returns {@code s.substring(begin, end)} after forcing both indices into the range
     * {@code [0, s.length()]} and forcing {@code end} to be no smaller than {@code begin}, so
     * that this never throws for out-of-range indices.
     */
    private static String clampedSubstring(String s, int begin, int end)
    {
        int len         = s.length();
        int safeBegin   = Math.min(Math.max(begin, 0), len);
        int safeEnd     = Math.min(Math.max(end, safeBegin), len);

        return s.substring(safeBegin, safeEnd);
    }
}