1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package Torello.Languages;

import java.io.*;
import java.util.*;

/**
 * PinYinParse (罗马拼音).
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PYP>
 */
@Torello.JavaDoc.StaticFunctional
public class PinYinParse
{
    // Static-only utility class: never instantiated.
    private PinYinParse() { }

    /**
     * Splits a Mandarin sentence and its Pin-Yin pronunciation into two parallel lists
     * ({@code Vector<String>}): one holding the Chinese words, the other holding the matching
     * Pin-Yin words.  The Pin-Yin sentence is split on single spaces, and each Pin-Yin word
     * consumes a run of characters from the (punctuation-stripped) Chinese sentence.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Scrape, non-API Invocation:</B>
     * 
     * <BR />This is of "limited use" - since primarily the input to this function is a
     * {@code String} that has been scraped from the <B>{@code Google Translate Website}</B>, not
     * a {@code String} from a query to Google Cloud Server's <B>{@code Translate-API}</B>.
     * 
     * <BR /><BR />The API version of Mandarin Translations literally leaves out the Pin-Yin
     * Romanizations, and makes the entire package a lot less usable.  The web-site itself can be
     * scraped, and the Pin-Yin obtained, but that String comes from a web-site that changes from
     * time-to-time. 
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Using a Bot:</B>
     * 
     * <BR />If scraping Google's Translate Web-site conjures images of the police coming to your
     * door, another web-site that seems to do pretty good Romanization is Pin1Yin1.com.  I have
     * another class that scrapes that site.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Mismatched Input:</B>
     * 
     * <BR />If the pronunciation implies more characters than the Chinese sentence actually
     * contains, the extracted word is truncated at the end of the Chinese sentence (possibly to
     * the empty string) instead of failing with a {@code StringIndexOutOfBoundsException}.  The
     * two output vectors always receive one entry each per non-empty Pin-Yin word, and the
     * mismatch is reported through the return value.
     *
     * @param DOUT This is filled up with Debug Information as this class is run.  It may be any
     * implementation of java's {@code java.lang.Appendable} interface.
     * 
     * @param simpSentence This is the complete simplified-Mandarin sentence obtained from
     * news-article.
     * 
     * @param pronSentence This is the pronunciation of the simplified-Mandarin sentence.  This
     * should have already been obtained from Google Translate.
     * 
     * @param characters This should be an empty vector.  It will be populated by the words from
     * the original Mandarin sentence, based on the pronunciation obtained from Google Translate.
     * 
     * @param pronunciation This should also be an empty vector.  It will be populated after the
     * words from the pronunciation sentence have been parsed into individual words.
     * 
     * @return boolean This is true if there was possibly an error along the way.
     * The specific requirements for the boolean value are: <BR />
     * {@code (cSent.length() != totalChinese) && (totalChinese > 0);}
     * 
     * @throws IOException The {@code interface java.lang.Appendable} mandates that the
     * {@code IOException} must be treated as a checked exception for all output operations.
     * Therefore {@code IOException} is a required exception in this method's throws clause.
     */
    public static boolean parse(
            Appendable      DOUT, 
            String          simpSentence,
            String          pronSentence,
            Vector<String>  characters,
            Vector<String>  pronunciation
        )
        throws IOException
    {
        DOUT.append("********************************************\n");
        DOUT.append("chin = " + simpSentence + "\n");
        DOUT.append("pron = " + pronSentence + "\n");

        // Replace any "alternate" (AUC) versions of A...Z or 0..9 that are present.
        String cSent = ZH.convertAnyAUC(simpSentence);

        // CHANGED 2018.09.24 - delAllPunctuation does not remove '.' and ',' between numbers!
        String pSent = ZH.delAllPunctuationPINYIN(pronSentence);

        cSent = ZH.delAllPunctuationCHINESE(cSent);

        DOUT.append("********************************************\n");
        DOUT.append("After Removing non-alphanumeric UniCode, and Alt-UniCode:\n");
        DOUT.append("cSent=" + cSent + "\n");
        DOUT.append("pSent=" + pSent + "\n");
        DOUT.append("********************************************\n");

        final int cLen = cSent.length();

        // Running count of characters of cSent that the pronunciation claims to have consumed.
        // This is allowed to exceed cLen; the final comparison then reports the mismatch.
        int totalChinese = 0;

        // Leading or trailing blanks would produce bogus words, hence the trim() before split.
        String[] pWords = pSent.trim().split(" ");

        for (String rawWord : pWords)
        {
            String pronWord = rawWord.trim();

            // Consecutive spaces yield empty tokens - skip them.
            if (pronWord.isEmpty()) continue;

            // Never index past the end of cSent, even if earlier words over-consumed it.
            int start = Math.min(totalChinese, cLen);

            // Sometimes alphabetic characters appear in the chinese string.
            int leading = ZH.countLeadingLettersAndNumbers(cSent.substring(start));

            if (leading > 0)
            {
                int     end                 = Math.min(start + leading, cLen);
                String  alphaNumericASCII   = cSent.substring(start, end);

                DOUT.append("*** Found English and Numbers ASCII in Chinese Sentence ***\n");
                DOUT.append("There are " + leading + " leading alpha numeric characters.");
                DOUT.append(" [" + alphaNumericASCII + "]\n");
                DOUT.append("pronunciation word is: [" + pronWord + "]\n");

                pronunciation.add(pronWord);
                characters.add(alphaNumericASCII);

                totalChinese += leading;
            }

            // else - it's just normal characters in the chinese string
            else
            {
                int numChinese = ZH.countSyllablesAndNonChinese(pronWord, DOUT);

                // Clamp the end of the slice into [start, cLen] so that a pronunciation which
                // claims more characters than remain cannot throw.
                int     end         = Math.min(cLen, Math.max(start, totalChinese + numChinese));
                String  chineseWord = cSent.substring(start, end);

                if ((totalChinese + numChinese) > cLen) DOUT.append(
                    "*** Pronunciation overruns the Chinese Sentence, word truncated ***\n"
                );

                DOUT.append("The word [" + pronWord + "] ");
                DOUT.append("corresponds to " + numChinese + " Unicode Characters ");
                DOUT.append("[" + chineseWord + "]\n");

                // Add the new word to the list
                pronunciation.add(pronWord);
                characters.add(chineseWord);

                totalChinese += numChinese;
            }
        }

        DOUT.append(
            "********************************************\n" +
            "COMPLETED SENTENCE LOOP\n" +
            "SUMMARY:\n" +
            "FOUND (" + totalChinese + ") characters in Chinese String\n" +
            "STRING CONTAINS (" + cLen + ") characters\n" +
            ((totalChinese != cLen) ? "\nPOSSIBLE ERROR MISMATCH\n\n" : "") +
            "********************************************\n"
        );

        return (cLen != totalChinese) && (totalChinese > 0);
    }
}