001package Torello.Languages;
002
003import java.io.*;
004import java.util.*;
005import java.util.regex.*;
006import Torello.Java.*;
007
008/**
 * ZH (Mandarin Chinese) - Many tools for parsing constructs from Mandarin News &amp; other
 * Web-Sites.
011 * 
012 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID="ZH">
013 */
014@Torello.HTML.Tools.JavaDoc.StaticFunctional
015public class ZH
016{
    // Private, empty constructor: every member visible in this class is static, so no instance
    // of ZH is meant to be created from outside.
    private ZH() { }
018
019    static void main1(String[] argv) throws IOException
020    {
021        StringBuilder sb = new StringBuilder();
022        sb.append("<HTML>\n<HEAD>\n<TITLE>AUC Test</TITLE>\n");
023        sb.append("<META http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n");
024        sb.append("<BODY>\n");
025        sb.append(testAUC() + "\n<BR />\n");
026        sb.append("</BODY>\n</HTML>\n");
027        FileRW.writeFile(sb, "out.html");
028    }
029
030    /*
031    static void main(String argv[]) throws IOException
032    {
033        for (int i=0; i < H2CV.length; i++)
034            System.out.print(H2CV[i] + ":" + ((char) H2CV[i]) + ":" + CV2RV[i] + ",\t");
035        String s = "À, É, à, á, è, é, ì, í, ò, ó, ù,  ú, ü, Ā, ā, ē, ě, ī, ō, ū, ǎ, ǐ, ǒ, ǔ";
036        System.out.println(s);
037        System.out.println(toneVowelsToRegularVowels(s));
038    }
039    */
040
    /*
     * Decimal character codes of the apostrophe (39) and of the vowels that carry a tone /
     * accent symbol on top of them.  Google Translate returns many of the PinYin Romanization
     * results as a numeric HTML escape such as: <BR />
     * &#363;  <B><I>INSTEAD OF</I></B> the character itself.  Essentially, this array contains
     * a list of those character codes.
     */
    private static final int[] H2CV = { 39, 192, 201, 224, 225, 232, 233, 236, 237, 242, 243,
                249, 250, 252, 256, 257, 275, 283, 299, 333, 363, 462, 464, 466, 468, 474, 476 };

    // The same characters as H2CV, written as char literals and in the same order
    // (CV[i] is the character whose code is H2CV[i]).
    private static final char[] CV = { '\'', 'À', 'É', 'à', 'á', 'è', 'é', 'ì', 'í', 'ò', 'ó',
        'ù',  'ú', 'ü', 'Ā', 'ā', 'ē', 'ě', 'ī', 'ō', 'ū', 'ǎ', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'ǜ' };

    // Parallel to CV: CV2RV[i] is the un-accented character returned in place of CV[i].
    // The apostrophe at index 0 maps to itself.
    private static final char[] CV2RV = { '\'', 'A', 'E', 'a', 'a', 'e', 'e', 'i', 'i', 'o', 'o',
        'u',  'u', 'u', 'A', 'a', 'e', 'e', 'i', 'o', 'u', 'a', 'i', 'o', 'u', 'u', 'u' };

    // Matches one decimal numeric HTML escape ("&#" digits ";"); group 1 captures the digits.
    private static final Pattern P1 = Pattern.compile("&#(\\d+);", Pattern.CASE_INSENSITIVE);
057
058    /**
059     * This makes the problems of dealing with the tone/accent marks above vowels in Chinese
060     * Pin-Yin easier.  These convert vowels with tones over them into regular vowels.  This
061     * can be useful for certain {@code String} operations, although clearly the original meaning
062     * of the word would be decimated.
063     * 
064     * @param c any character from <B>ASCII / UTF-8 / UniCode</B> Basic Multi Lingual Plane.
065     * 
066     * @return if this is a {@code UTF-8} character that is an accented vowel, the un-accented
067     * version of that vowel is returned.  If this is not a PinYin symbol for a tone-vowel, 
068     * {@code ASCII 0} is returned.
069     * 
070     * @see #toneVowelsToRegularVowels(String)
071     */
072    public static char toneVowelToRegularVowel(char c)
073    {
074        for (int i=0; i < CV.length; i++) if (CV[i] == c) return CV2RV[i];
075        return (char) 0;
076    }
077
078    /**
079     * Counts the number of tone vowels in a <B>PinYin</B> {@code String}.
080     * 
081     * @param pinYinStr A {@code String}, usually generated by <B>Google Translate</B>, (and
082     * scraped from Google Translate) that contains <B>PinYin.</B>
083     * 
084     * @return The number of Mandarin Chinese Pin-Yin "Tone Vowels"
085     */
086    public static int countToneVowels(String pinYinStr)
087    {
088        int count=0;
089
090        TOP:
091        for (int i = pinYinStr.length()-1; i >= 0; i--)
092            for (int j=0; j < CV.length; j++)
093                if (pinYinStr.charAt(i) == CV[j])
094                    { count++; continue TOP; }
095
096        return count;
097    }
098
099    /**
100     * This performs a conversion of all vowels in a {@code String} from those with tones over them
101     * to the normal (un-accented) equivalent.  It uses the single-character-version of the
102     * synonymously named method
103     * 
104     * @param s any {@code java.lang.String} containing Mandarin Romanizations.
105     * 
106     * @return a {@code String} with all accented vowel's converted to regular vowels.
107     * 
108     * @see #toneVowelToRegularVowel(char)
109     */
110    public static String toneVowelsToRegularVowels(String s)
111    {
112        int             strlen  = s.length();
113        StringBuilder   sb      = new StringBuilder(s.length());
114        char            c;
115
116        for (int i=0; i < strlen; i++)
117            if ((c = toneVowelToRegularVowel(s.charAt(i))) != 0)
118                sb.append(c);
119            else
120                sb.append(s.charAt(i));
121
122        return sb.toString();
123    }
124
125    /**
126     * Google Translate returns some text encoded as {@code "&#num;" (the "ord(c)").}  This is also
127     * called {@code HTML Escaped Code} - because instead of actual <B>ASCII/UTF8</B> characters
128     * themselves, their "Ord" are returned - surrounded by the usual <I>HTML Escape Character
129     * Sequence</I> &amp;#num; This method does the {@code chr(html-hex-escape-code);} and replaces
130     * the {@code escape-sequence} (which again is &amp;#NUM;) with the actual ASCII character.
131     *
132     * <BR /><BR /><B>NOTE:</B> all of these are for "Chinese Tone Vowel" ASCII - The Google
133     * Translate module uses this method quite a bit.  Here are a few examples of
134     * HTML-Escape-Sequence and the corresponding ASCII.
135     *
136     * <BR /><TABLE CLASS="BRIEFTABLE">
137     * <TR><TH>HTML-Escaped</TH><TH>ASCII/UTF-8 Character</TH></TR>
138     * <TR><TD>&amp;#192;</TD><TD>À</TD></TR>
139     * <TR><TD>&amp;#225;</TD><TD>á</TD></TR>
140     * <TR><TD>&amp;#283;</TD><TD>ě</TD></TR>
141     * <TR><TD>&amp;#363;</TD><TD>ū</TD></TR>
142     * <TR><TD>&amp;#474;</TD><TD>ǚ</TD></TR>
143     * <TR><TD COLSPAN="2">... see array below for list</TD></TR>
144     * </TABLE>
145     *
146     * <BR /><BR /><B>NOTE:</B> {@code HTML2UTF8(String)} ==&gt; This method does the exact same
147     * thing - but does not limit the characters to be converted to only Chinese Tone Vowels.  This
148     * method only converts HTML-Escaped-Characters from this list:
149     *
150     * <BR /><BR /><CODE>
151     *  private static final int[] H2CV = { 39, 192, 201, 224, 225, 232, 233, 236, 237, 242, 243, <BR />
152     *  249, 250, 252, 256, 257, 275, 283, 299, 333, 363, 462, 464, 466, 468, 474, 476 };       <BR />
153     * </CODE>
154     * 
155     * @see #HTML2UTF8(String)
156     */
157    public static String HTML2ChineseVowels(String s)
158    {
159        for (int i=0; i < H2CV.length; i++)
160            s = s.replaceAll("&#" + H2CV[i] + ";", "" + (char) H2CV[i]);
161
162        return s;
163    }
164
165    /**
     * NOTE: This does the same as {@code HTML2ChineseVowels(String)} <B><I>EXCEPT</I></B> that it
     * converts <B><I>ANY</I></B> HTML string that has been encoded as: {@code &#NUM;} - not
168     * just the characters having accents and corresponding to Chinese Tone Vowels.
169     * 
170     * @see #HTML2ChineseVowels(String)
171     */
172    public static String HTML2UTF8(String s)
173    {
174        // Build the list of UTF8/ASCII character values (as Ord(c) / int) first.
175        HashSet<Integer>    utfList = new HashSet<Integer>();
176        Matcher             m       = P1.matcher(s);
177
178        while (m.find()) utfList.add(Integer.parseInt(m.group(1)));
179
180        // Now convert them.
181        for (Integer i : utfList) s = s.replaceAll("&#" + i.toString() + ";", "" + ((char) i.intValue()));
182
183        return s;
184    }
185
186    /** 
187     * This is used to convert a Chinese Character into a full {@code String} that includes the
188     * <B>UTF-8</B> code represented as a {@code HEXADECIMAL} number and a {@code decimal} number
189     *
190     * @param c any ASCII/UniCode/UTF-8 char - but, generally, expected to be a
191     * "Chinese Character."
192     *
193     * <BR /><BR /><B>NOTE:</B> The choice for parameter {@code char c} has no actual constraints
194     * on its input value.
195     *
196     * @return A {@code String} of this format: {@code 掭(0x63AD, 25517)}
197     */
198    public static String formatUTF8Chinese(char c)
199    { return c + "(0x" + String.format("%x", ((int) c)).toUpperCase() + ", " + ((int) c) + ")"; }
200    
201    
202    /**
203     * Helper function - checks if this is a character in the UTF-8 &amp; ASCII ranges that contain
204     * Mandarin Chinese characters.  This is not guaranteed to be accurate - some non-Chinese
205     * Japanese characters exist in this range.  For the precise definition of what this function
206     * actually does, see the ranges printed below.
207     *
208     * <BR /><BR />COPIED FROM*** <BR />
209     * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tbluni.php?page=0" TARGET="_blank">
210     * http://www.khngai.com/chinese/charmap/tbluni.php?page=0</A></CODE></B>
211     *
212     * <BR /><BR />AND: {@code ((c >= 0x4E00) && (c <= 0x9FFF)) }
213     *
214     * <BR /><BR />COPIED FROM*** <BR />
215     * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tblgb.php?page=1" TARGET="_blank">
216     * http://www.khngai.com/chinese/charmap/tblgb.php?page=1</A></CODE></B>
217     *
218     * @param c any UTF-8, ASCII or UniCode character available.
219     * 
220     * @return <B>TRUE</B> if the input character {@code 'c'} is in the UTF-8/UniCode range 
221     * for Chinese Characters
222     */
223    public static boolean isChinese(char c)
224    {   
225        if ((c >= 0x4E00) && (c <= 0x9FFF)) return true;
226        if ((c >= 0xB0A0) && (c <= 0xBFFF)) return true;
227        if ((c >= 0xC0A0) && (c <= 0xCFFF)) return true;
228        if ((c >= 0xD0A0) && (c <= 0xDFFF)) return true;
229        if ((c >= 0xE0A0) && (c <= 0xEFFF)) return true;
230        if ((c >= 0xF0A0) && (c <= 0xF7FF)) return true;
231
232        return false;
233    }   
234
235    /**
236     * Checks a {@code char} is something that is not {@code Alpha Numeric} or {@code White Space}
237     *
238     * @param c any UTF-8, ASCII or UniCode character available.
239     * 
240     * @return {@code ((!isAlphaNumeric(c)) && (!isSpace(c)));}
241     */
242    public static boolean isOther(char c)
243    { return ((!isAlphaNumeric(c)) && (!isSpace(c))); }
244
245    /**
     * Checks if a {@code char} is Alpha Numeric.
247     *
248     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
249     * 
250     * @return {@code (isAlpha(c) || isNumber(c));}
251     */
252    public static boolean isAlphaNumeric(char c)
253    { return (isAlpha(c) || isNumber(c)); }
254
255    /**
256     * Checks if a {@code char} is Alphabetic.
257     *
258     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
259     * 
260     * @return {@code (isToneVowel(c) || isRegVowel(c) || isRegLetter(c));}
261     */
262    public static boolean isAlpha(char c) 
263    { return (isToneVowel(c) || isRegVowel(c) || isRegLetter(c)); }
264
265    /**
266     * This is a helper function for the Mandarin Chinese accented vowel symbols in
267     * {@code UTF-8, ASCII} and {@code UniCode}.  The exact character code numbers are printed
268     * below.
269     *
270     * <BR /><BR /><B>NOTE:</B> In 罗马拼音 (Pin-Yin Romanization), there are a few symbols that 
271     * should never come up - at least as the software pertains to 罗马拼音-results provided by
272     * <B>Google Cloud Server Translation API</B> {@code (GCS-TS/TAPI)}.  This is because
     * <B><I>NO</I></B> word in Pin-Yin ever starts with the letters I or U, or the U with an
     * umlaut - <B><I>so</I></B> - capitalized versions of these letters ought to never occur -
     * unless the entire PinYin were capitalized - which is something GCSTS never does.
276     *
277     * @param c any UTF-8, ASCII or UniCode character available.
278     * 
279     * @return <B>TRUE</B> if the input character {@code 'c'} is one of the following:
280     * 
281     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
282     * <TR><TH>Simple ASCII</TH><TH>UTF-8 Tone Vowel</TH></TR>
283     * <TR><TD>a</TD><TD> ā (257), á (225), ǎ (462), à (224)</TD></TR>
284     * <TR><TD>e</TD><TD> ē (275), é (233), ě (283), è (232)</TD></TR>
285     * <TR><TD>i</TD><TD> ī (299), í (237), ǐ (464), ì (236)</TD></TR>
286     * <TR><TD>o</TD><TD> ō (333), ó (243), ǒ (466), ò (242)</TD></TR>
287     * <TR><TD>u</TD><TD> ū (363), ú (250), ǔ (468), ù (249)</TD></TR>
288     * <TR><TD>u</TD><TD> ǖ (470), ǘ (472), ǚ (474), ǜ (476)</TD></TR>
289     * <TR><TD>A</TD><TD> Ā (256), Á (193), Ǎ (461), À (192)</TD></TR>
290     * <TR><TD>E</TD><TD> Ē (274), É (201), Ě (282), È (200)</TD></TR>
291     * <TR><TD>O</TD><TD> Ō (332), Ó (211), Ǒ (465), Ò (210)</TD></TR>
292     * </TABLE>
293     *
294     * <BR />In Mandarin Chinese, PinYin-words cannot start with these letters below.
295     * Therefore it would be highly unlikely to see a "capitalized" version of these tone-vowels.
296     *
297     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
298     * <TR><TH>Simple ASCII</TH><TH>UTF-8 Tone Vowel</TH></TR>
299     * <TR><TD>I</TD><TD>Ī (298), Í (205), (there are 2: Ǐ (463), Ĭ (300)), Ì (204)</TD></TR>
300     * <TR><TD>U</TD><TD>Ū (362), Ú (218), Ŭ (364), Ù (217)</TD></TR>
     * <TR><TD>U</TD><TD>(Ü (220) -no tone): Ǖ (469), Ǘ (471), Ǚ (473), Ǜ (475)</TD></TR>
302     * </TABLE>
303     */
304    public static boolean isToneVowel(char c)
305    {
306        // A, ā 257, á 225, ǎ 462, à 224
307        if ((c == 257) || (c == 225) || (c == 462) || (c == 224)) return true;
308
309        // E, ē 275, é 233, ě 283, è 232
310        if ((c == 275) || (c == 233) || (c == 283) || (c == 232)) return true;
311              
312        // I, ī 299, í 237, ǐ 464, ì 236 
313        if ((c == 299) || (c == 237) || (c == 464) || (c == 236)) return true;
314
315        // O, ō 333, ó 243, ǒ   466, ò 242
316        if ((c == 333) || (c == 243) || (c == 466) || (c == 242)) return true;
317
318        // U, ū 363, ú 250, ǔ 468, ù 249
319        if ((c == 363) || (c == 250) || (c == 468) || (c == 249)) return true;
320
321        // U, ǖ 470, ǘ 472, ǚ 474, ǜ 476
322        if ((c == 470) || (c == 472) || (c == 474) || (c == 476)) return true;
323
324        // *******
325        // Capital vowels with tone symbols
326
327        // Ā 256, Á 193, Ǎ 461, À 192
328        if ((c == 256) || (c == 193) || (c == 461) || (c == 192)) return true;
329
330        // Ē 274, É 201, Ě 282, È 200
331        if ((c == 274) || (c == 201) || (c == 282) || (c == 200)) return true;
332
333        // Ō 332, Ó 211, Ǒ 465, Ò 210
334        if ((c == 332) || (c == 211) || (c == 465) || (c == 210)) return true;
335
336        // Not sure about these - found them on a website
337        // **********************************************
338        //       1234 5678 9ABC DEF
339        // A8A0  āáǎà ēéěè  īíǐì ōóǒ
340        //
341        //       0 1234 5678 9 A
342        // A8B0  ò ūúǔù  ǖǘǚǜ  ü ê
343        // **********************************************
344        if ((c >= 0xA8A1) && (c <= 0xA8Ba)) return true;
345
346        return false;
347    }
348
349    /**
350     * Checks that a character is a standard vowel.
351     * 
352     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
353     * 
354     * @return <B>TRUE</B> if the input character {@code 'c'} EQUALS one of these ten letters: 
355     * <B>a, e, i, o, u, A, E, I, O, U</B>
356     */
357    public static boolean isRegVowel(char c)
358    {
359        // The normal vowels
360
361        // a 97, A 65
362        if ((c == 97) || (c == 65))     return true;
363
364        // e 101, E 69
365        if ((c == 101) || (c == 69))    return true;
366
367        // i 105, I 73
368        if ((c == 105) || (c == 73))    return true;
369
370        // o 111, O 79
371        if ((c == 111) || (c == 79))    return true;
372
373        // u 117, U 85
374        if ((c == 117) || (c == 85))    return true;
375
376        return false;
377    }
378
379    /**
380     * Regular Letters Include: {@code 'A' ... 'Z'} (65 - 90),  {@code 'a' ... 'z'} (97 - 122)
381     * 
382     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
383     * 
384     * @return <B>TRUE</B> if the input character {@code 'c'} is any letter in lower-level
385     * <B>ASCII</B> (and not any of the AUC).
386     */
387    public static boolean isRegLetter(char c)
388    { return ((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)); }
389
390    /**
391     * Regular Numbers Include: {@code '0' ... '9'}
392     * 
393     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
394     * 
395     * @return <B>TRUE</B> if the input character {@code 'c'} is in the range of ASCII 
396     * {@code '0' ... '9'} (not any of the AUC)
397     */
398    public static boolean isNumber(char c)
399    { return ((c >= 48) && (c <= 57)); }
400
401    /**
402     * Checks for WhiteSpace: {@code '\t', '\n', '\r', ' '}
403     * 
404     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available.
405     * 
406     * @return <B>TRUE</B> if the input character {@code 'c'} is a whitespace character code from
407     * the above list
408     */
409    public static boolean isSpace(char c)
410    { return ((c == 9) || (c == 12) || (c == 15) || (c == 32)); }
411
412
413    /**
414     * Bullet List characters in upper {@code UniCode / UTF-8}.  These characters exist in
415     * <B>UTF-8</B> - and they are occasionally used in documents found on Chinese News Websites.
416     * They are all "bullet-list" points.  An integer is returned for each of these, that is equal
417     * to the number represented by the UTF-8/UniCode character here.
418     *
419     * <BR /><BR /><UL CLASS="JDUL">
420     * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f</LI>
421     * <LI>N ⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖</LI>
422     * <LI>⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾</LI>
423     * <LI>⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦  </LI>
424     * <LI>⑧ ⑨ ⑩ N N ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩ N</LI>
425     * <LI>N Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ</LI>
426     * </UL>
427     *
428     * @param c any character as input
429     * 
430     * @return The number equivalent represented by this bullet point.
431     */
432    public static int bulletListAUC(char c)
433    {
434        // ⒈ ==> ⒛
435        if ((c >= 0x2488) && (c <= 0x249B)) return ((int) c) - 0x2487;
436
437        // ⑴ ==> ⒇
438        if ((c >= 0x2474) && (c <= 0x2487)) return ((int) c) - 0x2473;  
439
440        // ① ==> ⑩
441        if ((c >= 0x2460) && (c <= 0x2469)) return ((int) c) - 0x245F;
442
443        // ㈠ ==> ㈩
444        if ((c >= 0x3220) && (c <= 0x3229)) return ((int) c) - 0x321F;
445
446        // Ⅰ ==> Ⅻ
447        if ((c >= 0x2160) && (c <= 0x216B)) return ((int) c) - 0x215F;
448
449        return 0;
450    }
451
452    /**
453     * Alpha-Numeric character code from  upper UniCode / UTF-8
454     *
455     * <BR /><BR />These characters exist in <B>UTF-8</B> - but they ARE NOT the usual ASCII
456     * characters for the letters {@code 'A' ... 'Z'} or the numbers {@code '0' ... '9'}  They,
457     * however, are sometimes found in documents on Chinese News Websites, etc.
458     *
459     * <BR /><BR />Copied from:<BR />
460     * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tblgb.php?page=0" TARGET="_blank">
461     * http://www.khngai.com/chinese/charmap/tblgb.php?page=0</A></CODE></B>
462     *
463     * <BR /><BR /><UL CLASS="JDUL">
464     * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f </LI>
465     * <LI>! " # ¥ % & ' ( ) * + , - . /</LI>
466     * <LI>0 1 2 3 4 5 6 7 8 9 : ; < = > ?</LI>
467     * <LI>@ A B C D E F G H I J K L M N O</LI>
468     * <LI>P Q R S T U V W X Y Z [ \ ] ^ _</LI>
469     * <LI>a b c d e f g h i j k l m n o</LI>
470     * <LI>p q r s t u v w x y z { | }  ̄</LI>
471     * </UL>
472     *
473     * @param c any character as input
474     * 
475     * @return the "lower-level-ASCII" version of that character. 
476     */
477    public static char alphaNumericAUC(char c)
478    {
479        // ASCII 'A' is 65
480        if ((c > 0xFF20) && (c < 0xFF3B))   return (char) (65 + (c - 0xFF21));
481
482        // ASCII 'a' is 97
483        if ((c > 0xFF40) && (c < 0xFF5B))   return (char) (97 + (c - 0xFF41));
484
485        // ASCII '0' is 48
486        if ((c >= 0xFF10) && (c <= 0xFF1A)) return (char) (48 + (c - 0xFF10));
487
488        return 0;
489    }
490
491    /**
492     * This method, {@code punctuationAUC(char)}, converts any characters which are common on many 
     * Mandarin Chinese websites into a lower-level, more typical/normal ASCII equivalent.  This
     * can be very useful when trying to make sense of brackets, parentheses, quotes, commas and
495     * other punctuation marks - and quickly convert them into a simple version of the character.
496     *
497     * <BR /><BR />If the input character has an "Alternate Version" in the lower-level-ASCII
498     * range, that lower level ASCII character is returned.  If this isn't AUC, ASCII-0 is
499     * returned.
500     *
501     * <BR /><BR /><B>For Instance:</B>
502     *
503     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
504     * <TR><TH>Input</TH><TH>Output</TH></TR>
505     * <TR><TD>〖 〗 【 】      </TD><TD> [ ] [ ] </TD></TR>
506     * <TR><TD> 。 ○ ● .       </TD><TD>. (ASCII-period) </TD></TR>
507     * <TR><TD>¨ 〃 “ ” ″ "    </TD><TD>" (ASCII-double-quote)   </TD></TR>
508     * <TR><TD>, (ASCII-comma)  </TD><TD>ASCII-0 </TD></TR>
509     * <TR><TD>+ (ASCII-plus)   </TD><TD>ASCII-0 </TD></TR>
510     * </TABLE>
511     * 
512     * @param c any character as input
513     * 
514     * @return the "lower-level-ASCII" version of that character
515     * 
516     * <BR /><BR /><B>NOTE:</B> ASCII-0 is returned if this is not a valid "AUC"
517     * {@code UTF-8 / UniCode} code!
518     */
519    public static char punctuationAUC(char c)
520    {
521        // Copied from: 
522        // *** http://www.khngai.com/chinese/charmap/tblgb.php?page=0
523        //
524        // 0 2 3 4 5 6 7 8 9 a b c d e f
525        // N N 、 。 · ˉ ˇ ¨ 〃 々 — ~ ‖ … ‘ ’ 
526        // “ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】
527        // ± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠
528        // ⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵ 
529        // ∴ ♂ ♀ ° ′ ″ ℃ $ ¤ ¢ £ ‰ § № ☆ ★
530        // ○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓 
531        //
532        // 0 1 2 3 4 5 6 7 8 9 a b c d e f
533        // ! " # ¥ % & ' ( ) * + , - . /
534        // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
535        // @ A B C D E F G H I J K L M N O
536        // P Q R S T U V W X Y Z [ \ ] ^ _
537        // ` a b c d e f g h i j k l m n o
538        // p q r s t u v w x y z { | }  ̄     
539
540        switch (c)
541        {
542            // 、 ,
543            case 0x3001:               // 、
544            case 0xFF0C: return ',';   // ,
545
546            // 。 ○ ● .
547            case 0x3002:               // 。
548            case 0x25CB:               // ○
549            case 0x25CF:               // ●
550            case 0xFF0E: return '.';   // .
551
552            // ‘ ’ ′ ' `
553            case 0x2018:               // ‘
554            case 0x2019:               // ’
555            case 0x2032:               // ′
556            case 0xFF07:               // '
557            case 0xFF40: return '\'';  // `
558
559            // ¨ 〃 “ ” ″ "
560            case 0x00A8:               // ¨
561            case 0x3003:               // 〃
562            case 0x201C:               // “
563            case 0x201D:               // ”
564            case 0x2033:               // ″
565            case 0xFF02: return '\"';  // "
566
567            // 〔 (
568            case 0x3014:               // 〔
569            case 0xFF08: return '(';   // (
570
571            // 〕 )
572            case 0x3015:               // 〕
573            case 0xFF09: return ')';   // )
574
575            // 〈 <
576            case 0x3008:               // 〈
577            case 0xFF1C: return '<';   // <
578
579            // 〉 >
580            case 0x3009:               // 〉
581            case 0xFF1E: return '>';   // >
582
583            // 「 『 〖 【 [
584            case 0x300C:               // 「
585            case 0x300E:               // 『
586            case 0x3016:               // 〖
587            case 0x3010:               // 【
588            case 0xFF3B: return '[';   // [
589
590            // 」 』 〗】 ]
591            case 0x300D:               // 」
592            case 0x300F:               // 』
593            case 0x3017:               // 〗
594            case 0x3011:               // 】
595            case 0xFF3D: return ']';   // ]
596
597            // ∶ :
598            case 0x2236:               // ∶
599            case 0xFF1A: return ':';   // :
600
601            case 0xFF01: return '!';   // !
602            case 0xFF03: return '#';   // #
603            case 0xFF05: return '%';   // %
604            case 0xFF06: return '&';   // &
605            case 0xFF1F: return '?';   // ?
606            case 0xFF0F: return '/';   // /
607            case 0xFF3E: return '^';   // ^
608            case 0xFF5B: return '{';   // {
609            case 0xFF5D: return '}';   // }
610            case 0xFF5C: return '|';   // |
611            case 0xFF0B: return '+';   // +
612            case 0xFF3C: return '\\';  // \
613            case 0xFF3F: return '_';   // _
614
615            // — -
616            case 0x2014:               // —
617            case 0xFF0D: return '-';   // -
618
619            // 〓 =
620            case 0x3013:               // 〓
621            case 0xFF1D: return '=';   // =
622        }
623        return 0;
624    }
625
626    /**
627     * Bo Po Mo Fo (注音符號).
628     * 
629     * <BR /><BR />This is a popular pronunciation system for Mandarin Characters in Taiwan &amp;
630     * Hong Kong.
631     * 
632     * <BR /><BR /><UL CLASS="JDUL">
633     * <LI>N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ</LI>
634     * <LI>ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ</LI>
635     * <LI>ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N</LI>
636     * </UL>
637     * 
638     * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available from
639     * {@code Plane 0}, the <B>Basic Multi-Lingual Plane</B>
640     * 
641     * @return <B>TRUE</B> if the input character {@code 'c'} is in this UTF-8/UniCode range.
642     * The {@code HEXADECIMAL / UTF-8} representation of the <B>'Bo Po Mo Fo'</B> range is:
643     * {@code 0x3110 ... 0x3129}.
644     */
645    public static boolean isBPMFAUC(char c)
646    {
647        // 0 1 2 3 4 5 6 7 8 9 a b c d e f
648        // N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ
649        // ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ
650        // ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N
651
652        return (c >= 0x3110) && (c <= 0x3129);
653    }
654
655    /**
656     * Checks for end-of-sentence punctuation marks - and "down-converts" them to the simple ASCII
657     * equivalent version of that punctuation mark.  If the input character code is not an AUC
658     * version of a typical Mandarin-Chinese end-of-sentence punctuation mark - then ASCII-zero is
659     * returned.
660     *
661     * <BR /><BR /><B>NOTE:</B> if a lower-level-ASCII (normal) punctuation mark is input - then
662     * ASCII-0 is returned.
663     *
664     * <BR /><BR /><B>SPECIFICALLY:</B> with {@code '.' '?'} and {@code '!'} as input to this
665     * function, ASCII-0 will be returned.
666     *
667     * <BR /><BR /><B>USE:</B> {@code endOfSentence(c)} to have those punctuation marks included in
668     * non-zero results.
669     * 
670     * @param c any UTF-8, ASCII or UniCode character available.
671     * 
672     * @return if the input character {@code 'c'} is an "alternate UTF-8" version of the 
673     * punctuation marks:
674     * 
675     * <BR /><BR /><UL CLASS="JDUL">
676     * <LI>a period ('.')</LI>
677     * <LI>an exclamation-point ('!')</LI>
678     * <LI>a question-mark ('?')</LI>
679     * </UL>
680     * 
681     * <BR /><BR />Then the output to this method shall be determined by the table below:
682     * 
683     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
684     * <TR><TH>Input Character</TH><TH>Output Character</TH></TR>
685     * <TR><TD>。 ○ ● .</TD><TD>'.' (normal period)</TD></TR>
686     * <TR><TD>!</TD><TD>'!' (regular exclamation point)</TD></TR>
687     * <TR><TD>?</TD><TD>'?' (usual question mark)</TD></TR>
688     * </TABLE>
689     *
690     * <BR /><BR />
691     * <B>NOTE:</B> If the normal period, question, or exclamation are passed as input to this
692     * function, this function will return ASCII-0
693     * 
694     * @see #endOfSentence(char)
695     */
696    public static char endOfSentenceAUC(char c)
697    {
698        char auc = punctuationAUC(c);
699
700        if (auc != 0) c = auc;
701
702        // A 'switch' is used instead of an 'if' with a char-cast because it is easier to
703        // read on this page.  Only the three characters with ASCII 46, 33, and 63 should
704        // return non-zero values.
705        switch ((int) auc)
706        {
707            // These characters identify an "End of Sentence" marker.
708            case 0x2E: return '.';  // DEC: 46
709            case 0x21: return '!';  // DEC: 33
710            case 0x3F: return '?';  // DEC: 63
711
712            // All other characters should result in a '0'
713            default:   return (char) 0;
714        }
715    }
716
717    /**
     * Checks for end-of-sentence punctuation marks.  This Helper function is *almost* identical
719     * to the {@code endOfSentenceAUC(c)} method.
720     * 
721     * <BR /><BR />{@code endOfSentenceAUC(c)} returns ASCII-0 for the usual-punctuation marks -
722     * {@code '.', '!'} and {@code '?'}.
723     *
724     * <BR /><BR />{@code endOfSentence(c)} does not 'leave-out' or 'deny' these lower-level-ASCII
725     * punctuation symbols.
726     * 
727     * @param c any UTF-8, ASCII or UniCode character available.
728     * 
729     * @return If the input character {@code 'c'} is a period {@code ('.')}, an exclamation-point
730     * {@code ('!')}, or a question-mark {@code ('?')} - <B><I>or an AUC version of that
731     * punctuation,</B></I> then that punctuation is returned.  Otherwise ASCII-0 is returned.
732     * 
733     * @see #endOfSentenceAUC(char)
734     */
735    public static char endOfSentence(char c)
736    {
737        char auc = endOfSentenceAUC(c);
738
739        if (auc != 0) c = auc;
740
741        // These three characters identify an "End of Sentence" Marker
742        if ((c == '.') || (c == '!') || (c == '?')) return c;
743
744        return (char) 0;
745    }
746
747    /**
748     * Checks for end-of-phrase punctuation marks - and "down-converts" them to the simple ASCII
749     * equivalent version of that punctuation mark.  If the input character code is not an AUC
750     * version of a typical Mandarin-Chinese phrase-delimiting punctuation mark - then ASCII-zero
751     * is returned.
752     *
753     * <BR /><BR /><B>NOTE:</B> if a lower-level-ASCII (normal) punctuation mark is input - then
754     * ASCII-0 is returned.
755     *
756     * <BR /><BR /><B>SPECIFICALLY:</B> with {@code ',' ':' ';'} and other common phrase-ending
757     * marks in Mandarin as input to this function, ASCII-0 will be returned.
758     *
759     * <BR /><BR /><B>USE:</B> {@code endOfPhrase(c)} to have those punctuation marks included in
760     * non-zero results.
761     * 
762     * @param c any UTF-8, ASCII or UniCode character available.
763     * 
764     * @return if the input character {@code 'c'} is an "alternate UTF-8" (AUC) version of the 
765     * punctuation marks:
766     *
767     * <BR /><BR /><TABLE CLASS="BRIEFTABLE">
768     * <TR><TH>Punctuation</TH><TH>Symbol and ASCII-Code</TH></TR>
769     * <TR><TD>semi-colon       </TD><TD>';'    HEX:0x3B, DEC: 59</TD></TR>
770     * <TR><TD>comma            </TD><TD>','    HEX:0x2C, DEC: 44</TD></TR>
771     * <TR><TD>colon            </TD><TD>':'    HEX:0x3A, DEC: 58</TD></TR>
772     * <TR><TD>double-quote     </TD><TD>'\"'   HEX:0x22, DEC: 34</TD></TR>
773     * <TR><TD>single-quote     </TD><TD>'\''   HEX:0x27, DEC: 39</TD></TR>
774     * <TR><TD>left-bracket     </TD><TD>'['    HEX:0x5B, DEC: 91</TD></TR>
775     * <TR><TD>right-bracket    </TD><TD>']'    HEX:0x5D, DEC: 93</TD></TR>
776     * <TR><TD>less-than        </TD><TD>'&lt;' HEX:0x3C, DEC: 60</TD></TR>
777     * <TR><TD>greater-than     </TD><TD>'&gt;' HEX:0x3E, DEC: 62</TD></TR>
778     * <TR><TD>left-paren       </TD><TD>'('    HEX:0x28, DEC: 40</TD></TR>
779     * <TR><TD>right-paren      </TD><TD>')'    HEX:0x29, DEC: 41</TD></TR>
780     * </TABLE>
781     *
782     * <BR /><BR />
783     * <B>IMPORTANT NOTE:</B> *only* the upper-level-UTF-8/UniCode versions of these
784     * punctuation marks will produce a non-zero result.  An actual ASCII comma, semi-colon, quote,
785     * bracket, or parenthesis (etc...) will cause this method to return ASCII-0.  Please use 
786     * endOfPhrase(char) to include the lower-level (Already down-converted ASCII) with non-zero
787     * results.
788     * 
789     * @see #endOfPhrase(char)
790     */
791    public static char endOfPhraseAUC(char c)
792    {
793        char auc = punctuationAUC(c);
794
795        if (auc != 0) c = auc;
796
797        // A 'switch' is used instead of an 'if' with a char-cast because it is easier to
798        // read on this page.  Only the characters having ASCII 59, 44, 58, 34, etc... should
799        // return non-zero values.
800        switch ((int) auc)
801        {
802            // These characters constitute an "End of Phrase" marker
803            case 0x3B: return ';';  // DEC: 59
804            case 0x2C: return ',';  // DEC: 44
805            case 0x3A: return ':';  // DEC: 58
806            case 0x22: return '\"'; // DEC: 34
807            case 0x27: return '\''; // DEC: 39
808            case 0x5B: return '[';  // DEC: 91
809            case 0x5D: return ']';  // DEC: 93
810            case 0x3C: return '<';  // DEC: 60
811            case 0x3E: return '>';  // DEC: 62
812            case 0x28: return '(';  // DEC: 40
813            case 0x29: return ')';  // DEC: 41
814
815            // All other results should return '0'
816            default: return 0;
817        }
818    }
819
820    /**
821     * endOfPhrase - any version of the end-of-phrase markers usually used in Mandarin Chinese
822     * text.  This method returns the exact same results as the {@code endOfPhraseAUC(char)}
823     * method.
824     *
825     * <BR /><BR /><B><SPAN STYLE="color: red;">EXCEPT:</SPAN></B>
826     * The regular/normal version of that punctuation mark (ASCII for semi-colon, comma, quote,
827     * etc...) will return the exact-same semi-colon, comma or quote - <I><B>instead of</B></I>
828     * ASCII-0
829     *
830     * <BR /><TABLE CLASS="BRIEFTABLE">
831     * <TR><TH>Input &amp; Method Called:</TH><TH>Result</TH></TR>
832     * <TR><TD>endOfPhrase(';')     </TD><TD>';'    // Normal ASCII semi-colon symbol</TD></TR>
833     * <TR><TD>endOfPhraseAUC(';')  </TD><TD>0      // ASCII-0 returned</TD></TR>
834     * <TR><TD>endOfPhrase('】')    </TD><TD>']'    // left-bracket returned</TD></TR>
835     * <TR><TD>endOfPhraseAUC('】') </TD><TD>']'    // left-bracket returned</TD></TR>
836     * <TR><TD>endOfPhrase(']')     </TD><TD>']'    // left-bracket returned</TD></TR>
837     * <TR><TD>endOfPhraseAUC(']')  </TD><TD>0      // ASCII-0 returned</TD></TR>
838     * </TABLE>
839     *
840     * <BR /><BR />
841     * The list of end-of-phrase characters include the following:<BR />
842     * <B STYLE="color:red">{@code ';' ',' ':' '\"' '\'' '[' ']' '<' '>' '(' ')'}</B>
843     * 
844     * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF
845     * 
846     * @return If {@code 'c'} is an "AUC" version of and end-of-phrase marker - or a regular
847     * lower-level ASCII version - then that punctuation mark is returned.  Otherwise 0 is
848     * returned.
849     * 
850     * @see #punctuationAUC(char)
851     */
852    public static char endOfPhrase(char c)
853    {
854        char auc = punctuationAUC(c);
855
856        if (auc != 0) c = auc;
857
858        if ((c == ';')  ||  (c == ',')  || (c == ':') ||
859            (c == '\"') ||  (c == '\'') ||
860            (c == '[')  ||  (c == ']')  || 
861            (c == '<')  ||  (c == '>')  ||
862            (c == '(')  ||  (c == ')'))
863            return c;
864
865        return (char) 0;
866    }
867
868    /**
869     * Quotes - any version.  &nbsp;&nbsp;<B><I>AUC or normal-ASCII, (BOTH)</B></I> single or 
870     * double quote.
871     * 
872     * @param c Any character in the entire <B>UniCode</B> range. {@code 0x0000 to 0xFFFF} which is
873     * the {@code Basic Multi Lingual Plane}.
874     * 
875     * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the single 
876     * (or double) quote, or the <B><I>regular-ASCII</B></I> single/double quote, then the
877     * appropriate single or double-quote is returned.  Otherwise 0 is returned.
878     * 
879     * @see #punctuationAUC(char)
880     */
881    public static char quoteAUC(char c)
882    {
883        char auc = punctuationAUC(c);
884
885        if (auc != 0) c = auc;
886
887        switch ((int) c)
888        {
889            case 0x22:  return '\"';    // DEC: 34
890            case 0x27:  return '\'';    // DEC: 39
891            default:    return (char) 0;
892        }
893    }
894
895    /**
896    * Comma - any version.  &nbsp;&nbsp;<B><I>AUC or normal-ASCII, (BOTH)</B></I> comma
897    * @param c Any character in the entire <B>UTF-8</B> range. {@code 0x0000 to 0xFFFF}, the
898    * {@code Basic Multi-Lingual Plane}.
899    * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the comma,
900    * or the <B><I>regular-ASCII</B></I> comma, then the comma is returned.  Otherwise 0 is returned.
901    * @see #punctuationAUC(char)
902    */
903    public static char commaAUC(char c)
904    {
905        char auc = punctuationAUC(c);
906
907        if (auc != 0) c = auc;
908
909        switch ((int) c)
910        {
911            case 0x2c:  return ','; // DEC: 44
912            default:    return (char) 0;
913        }
914    }
915
916    /**
917     * Brackets - any version.  &nbsp;&nbsp;<B><I>AUC or normal-ASCII, (BOTH)</B></I> brackets
918     * 
919     * @param c Any character in the entirbrackets UniCode range. 0x0000 to 0xFFFF
920     * 
921     * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the
922     * brackets, or the <B><I>regular-ASCII</B></I> brackets, then the appropriate brackets are
923     * returned. Otherwise 0 is returned.
924     * 
925     * @see #punctuationAUC(char)
926     */
927    public static char bracketAUC(char c)
928    {
929        char auc = punctuationAUC(c);
930
931        if (auc != 0) c = auc;
932
933        switch ((int) c)
934        {
935            case 0x5B:  return '['; // DEC: 91
936            case 0x5D:  return ']'; // DEC: 93
937            case 0x3C:  return '<'; // DEC: 60
938            case 0x3E:  return '>'; // DEC: 62
939            default:    return (char) 0;
940        }
941    }
942
943    /**
944    * Parenthesis - any version.  &nbsp;&nbsp;<B><I>AUC or normal-ASCII, (BOTH)</B></I> parenthesis
945    * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF
946    * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the parenthesis,
947    * or the <B><I>regular-ASCII</B></I> parenthesis, then the appropriate parenthesis are
948    * returned.  Otherwise 0 is returned.
949    * @see #punctuationAUC(char)
950    */
951    public static char parenAUC(char c)
952    {
953        char auc = punctuationAUC(c);
954
955        if (auc != 0) c = auc;
956
957        switch ((int) c)
958        {
959            case 0x28:  return '('; // DEC: 40
960            case 0x29:  return ')'; // DEC: 41
961            default:    return (char) 0;
962        }
963    }
964     
    /**
     * The complete list of "higher-level" (alternate) Uni-Code chars.  Many of these are alternate
     * punctuation marks used in documents that contain Mandarin Chinese.
     *
     * <BR /><BR />The individual characters are separated by space characters inside this
     * {@code String}.  The {@link #testAUC()} method iterates this list, skipping the spaces, and
     * runs each character through the classification methods of this class.
     *
     * <BR /><BR />The list is grouped, in order, as: special punctuation and symbols,
     * alternate letters and digits, "bullet list" markers, and "Bo Po Mo Fo" symbols.
     */
    public static final String AUC = 
        // NOTE(review): as this literal reads here, the rows that look like plain ASCII
        // (the one starting with an exclamation-point, and the digit / letter rows) contain
        // an unescaped double-quote and a lone backslash, which cannot compile as written.
        // Presumably these were the "fullwidth" forms of those characters and were
        // flattened by an encoding conversion - TODO confirm against the authoritative
        // copy of this file.  The literal is intentionally left untouched.

        // Special Punctuation characters found in Chinese HTML Pages
        "、 。 · ˉ ˇ ¨ 〃 々 — ~ ‖ … ‘ ’ "             +
        "“ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】"     +
        "± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠"               +
        "⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵ "           +
        "∴ ♂ ♀ ° ′ ″ ℃ $ ¤ ¢ £ ‰ § № ☆ ★"          +
        "○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓 "            +
        "! " # ¥ % & ' ( ) * + , - . /"      +

        // Extra Alphabetic and Numeric Characters sometimes used
        // on web-pages written in Chinese
        "0 1 2 3 4 5 6 7 8 9 : ; < = > ?"   +
        "@ A B C D E F G H I J K L M N O"   +
        "P Q R S T U V W X Y Z [ \ ] ^ _"   +
        "` a b c d e f g h i j k l m n o"   +
        "p q r s t u v w x y z { | }  ̄"      +

        // Certain "Bullet List" / "Bullet Point" markers
        "⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖"      +
        "⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾"   +
        "⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦"         +
        "⑧ ⑨ ⑩ ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩"               +
        "Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ"               +

        // The "Bo Po Mo Fo" Pronunciation Used for Chinese Characters
        "ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ"   +
        "ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ";
997
998    /**
999     * 
1000     * @return An HTML &lt;TABLE&gt; that contains many tests of the subroutines in this class
1001     */
1002    public static String testAUC()
1003    {
1004        StringBuilder ret = new StringBuilder();
1005        ret.append( "<TABLE BORDER=\"1\"><TR>"      +
1006                    "<TD WIDTH=\"30\">&nbsp;</TD>"  +
1007                    "<TD WIDTH=\"70\">&nbsp;</TD>"  +
1008                    "<TD WIDTH=\"70\">&nbsp;</TD>"  +
1009                    "<TD WIDTH=\"30\">&nbsp;</TD>"  );
1010
1011        for (int i=4; i < 12; i++)
1012            ret.append("<TD WIDTH=\"70\">&nbsp;</TD>");
1013        ret.append("</TR>");;
1014
1015        for (int i=0; i < AUC.length(); i++)
1016        {
1017            char c = AUC.charAt(i);
1018
1019            if (c == ' ') continue;
1020
1021            // Check original character (not punctuation-converted cc)
1022            char    bl          = Integer.toString(bulletListAUC(c)).charAt(0);
1023            boolean bpmf        = isBPMFAUC(c);
1024
1025            // first, convert the punctuation to normal-ASCII punctuation
1026            // These are the "translated" characters
1027            // The "translated character" is where, for example '〗' ==> ']'
1028            char    newC       = punctuationAUC(c);
1029
1030            // These are used for building <TABLE> & <TD> entry strings
1031            char    q           = quoteAUC(newC);
1032            char    es          = endOfSentenceAUC(newC);
1033            char    ep          = endOfPhraseAUC(newC);
1034            char    com         = commaAUC(newC);
1035            char    br          = bracketAUC(newC);
1036            char    p           = parenAUC(newC);
1037
1038            char    ascii       = punctuationAUC(c);
1039            if (ascii   == 0)   ascii = alphaNumericAUC(c);
1040            if (bl      != 0)   ascii = bl;
1041            if (bpmf)           ascii = c;
1042            if (ascii   == 0)   ascii = 'x';
1043
1044            // =================================================
1045            // This is for debugging this test function
1046            String  tmp =   " newCC = " + newC  + ", q="    + q     +
1047                            ", es="     + es    + ", ep="   + ep    +
1048                            ", com="    + com   + ", br="   + br    +
1049                            ", p="      + p     + ", bl ="  + bl    +
1050                            ", bpmf="   + bpmf;
1051
1052            tmp = tmp.replaceAll("<", "&lt;").replaceAll(">", "&gt;");
1053
1054            // Build the HTML Table 
1055            ret.append("<TR>");
1056
1057            ret.append("<TD>" + c + "</TD>");
1058            ret.append("<TD>" + ((int) c) + "</TD>");
1059            ret.append("<TD>" + "0x" + String.format("%x",(int) c).toUpperCase() + "</TD>");
1060            ret.append("<TD>" + ascii + "</TD>");
1061
1062            ret.append("<TD>" + ((q     == 0)   ? "" : "Quote")     + "</TD>"); 
1063            ret.append("<TD>" + ((es    == 0)   ? "" : "Sentence")  + "</TD>");
1064            ret.append("<TD>" + ((ep    == 0)   ? "" : "Phrase")    + "</TD>");
1065            ret.append("<TD>" + ((com   == 0)   ? "" : "Comma")     + "</TD>");
1066            ret.append("<TD>" + ((br    == 0)   ? "" : "Bracket")   + "</TD>");
1067            ret.append("<TD>" + ((p     == 0)   ? "" : "Paren")     + "</TD>");
1068            ret.append("<TD>" + ((bl    == 0)   ? "" : "Bullet")    + "</TD>"); 
1069            ret.append("<TD>" + (bpmf ? "BPMF" : "") + "</TD>");
1070
1071            // ==========================================================
1072            // Un-Comment this if you want to debug this print function
1073            // outStr += "</TR><TR><TD COLSPAN=\"12\">" + tmp + "</TD></TR>";
1074
1075        }
1076        ret.append("</TABLE>");
1077        return ret.toString();
1078    }
1079
1080    /**
1081     * Checks for any leading alphabetic {@code ('a' ... 'z')} and numeric {@code ('0' ... '9')}
1082     * characters in a Chinese {@code String}.
1083     *
1084     * <B>CHANGED:</B> 2018.09.24 - I left comma's and period's in the {@code String} (when
1085     * situated between digits). These are considered to be part of the "Leading Letters and
1086     * Numbers"
1087     *
1088     * @param chineseSentence A sentence that may or may not have leading letters &amp; numbers.
1089     * 
1090     * @return the {@code String}-index of the first non-alphabetic, non-numeric character in the
1091     * {@code String}.
1092     * 
1093     * <BR /><BR /><B>NOTE:</B> white-space does not count, and the position of the first
1094     * white-space character will be returned, if white-space is contained in this {@code String}.
1095     * 
1096     * @see #isAlphaNumeric(char)
1097     */
1098    public static int countLeadingLettersAndNumbers(String chineseSentence)
1099    {
1100        for (int i = 0; i < chineseSentence.length(); i++)
1101        {
1102            char c = chineseSentence.charAt(i);
1103            if ((! isAlphaNumeric(c)) && (c != '.') && (c != ',')) return i;
1104        }
1105
1106        return chineseSentence.length(); // This really ought not to happen, but just in case....
1107    }
1108
1109    /**
1110     * Checks for higher-Unicode letters and numbers, and converts them into lower-level versions
1111     * of the appropriate letter or number.
1112     *
1113     * <BR /><BR /><B>SPECIFICALLY:</B>  This method is just a "for-loop" which
1114     * makes a call to  {@code alphaNumericaAUC()} and if zero is not returned from that
1115     * method-call, then the input {@code String} is modified at the index which contained such a
1116     * higher {@code UTF-8} letter or number.
1117     * 
1118     * @param s This may or may not have "Alternate UniCode" Characters for letters and numbers.
1119     * 
1120     * @return if the "alternate" versions of <CODE>'A' ... 'Z'</CODE> or <CODE>'0' ... '9'</CODE>
1121     * are there, this will make sure to change them.
1122     * 
1123     * @see #alphaNumericAUC(char)
1124     */
1125    public static String convertAnyAUC(String s)
1126    {
1127        char[] cArr = s.toCharArray();
1128
1129        for (int i = 0; i < cArr.length; i++)
1130        {
1131            char auc = alphaNumericAUC(cArr[i]);
1132            if (auc != 0) cArr[i] = auc;
1133        }
1134
1135        return new String(cArr);
1136    }
1137    
1138    /**
1139     * Counts syllables in a "word" of PinYin.  The input {@code String} is expected to not have
1140     * any spaces!
1141     *
1142     * <BR /><BR />
1143     * <B>NOTE:</B>The number of syllables in a Chinese PinYin "word" identifies the 
1144     * number of Chinese Characters that were used to generate the input
1145     * <B>PinYin {@code String}</B>.
1146     *
1147     * <BR /><BR /><B>CHANGED:</B> 2018.09.24 - Added a test for periods and commas that are
1148     * situated directly between two digits.  In the String "5.0" the period between 5 and 0 is no
1149     * longer removed!
1150     *
1151     * <BR /><BR />If the {@code String} "5.0" were passed as the "word" parameter, the result
1152     * should be 3!
1153     * 
1154     * @param word A word in the "PinYin" format. (罗马拼音)
1155     * 
1156     * @param DOUT This must implement {@code java.lang.Appendable}
1157     * 
1158     * @return the number of syllables (specifically: Chinese Characters) in the input word.
1159     * 
1160     * @throws IOException The interface {@code java.lang.Appendable} mandates that the 
1161     * {@code IOException} must be treated as a checked exception for all output operations.  
1162     * Therefore {@code IOException} is a required exception in this method' throws clause.
1163     */
1164    public static int countSyllablesAndNonChinese(String word, Appendable DOUT)
1165        throws IOException
1166    {
1167        int numChinese  = 0;
1168
1169        // Tone-Vowels & Numbers always correspond to a character
1170        for (int letter = 0; letter < word.length(); letter++)
1171        {
1172            char c = word.charAt(letter);
1173            if (    ZH.isToneVowel(c)   ||
1174                    ZH.isNumber(c)      ||
1175                    (c == '.')          ||
1176                    (c == ',')
1177                )
1178                numChinese++;
1179        }
1180
1181        // Checks for vowel-strings that don't contain a tone
1182        // ==> Checks for "clear tone"
1183        String copyW = "" + word;
1184
1185        DOUT.append("[" + copyW + "] - ");
1186
1187        for (int letterIndex = 0; letterIndex < copyW.length(); letterIndex++)
1188            if (    ! ZH.isRegVowel(copyW.charAt(letterIndex))      &&
1189                    ! ZH.isToneVowel(copyW.charAt(letterIndex)) )
1190                copyW = StringParse.setChar(copyW, letterIndex, ' ');
1191            
1192        DOUT.append("after erasing non-vowels [" + copyW + "]\n");
1193         
1194        String[] syllables = copyW.trim().split(" ");
1195
1196        DOUT.append("Syllables are:");
1197        for (int sylIndex = 0; sylIndex < syllables.length; sylIndex++  )
1198            DOUT.append("[" + syllables[sylIndex] + "]");
1199        DOUT.append("\n");
1200
1201        TOP:
1202        for (int sylIndex = 0; sylIndex < syllables.length; sylIndex++)
1203        {
1204            String  syllable    = syllables[sylIndex].trim();
1205            boolean foundTone   = false;
1206
1207            // The split(' ') function sometimes provides blanks
1208            if (syllable.length() == 0) continue TOP;
1209
1210            for (int vowelIndex = 0; vowelIndex < syllable.length(); vowelIndex++)
1211                if (ZH.isToneVowel(syllable.charAt(vowelIndex)))
1212                    continue TOP;
1213
1214            numChinese++;
1215            DOUT.append("NOTE: *** FOUND CLEAR TONE\n");
1216        }
1217
1218        return numChinese;
1219    }
1220
1221    /**
1222     * Deletes all punctuation &amp; non-character symbols.  The {@code String} that is returned
1223     * will be shortened by precisely the number of punctuation characters were contained by that
1224     * {@code String}.
1225     *
1226     * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between 
1227     * number/digits are not removed!
1228     *
1229     * @param s An input {@code String} (in Mandarin - 普通话)
1230     *
1231     * @return a {@code String} that is the same as the input {@code String} - after skipping
1232     * characters as follows:
1233     *
1234     * <BR /><DIV CLASS="SNIP">{@code
1235     * if (isChinese(c) || isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue;
1236     * (else) s = StringParse.delChar(s, chr--);
1237     * }</DIV>
1238     */
1239    public static String delAllPunctuationCHINESE(String s)
1240    {
1241        char[]  cArr        = s.toCharArray();
1242        int     sourcePos   = 0;
1243        int     destPos     = 0;
1244
1245        while (sourcePos < cArr.length)
1246        {
1247            char c = cArr[sourcePos];
1248
1249            // Check for things like 5.0 or 1,120,987 - SPECIFICALLY Comma's and Period's situated
1250            // directly between 2 numbers.
1251
1252            if (    ((c == '.') || (c == ','))
1253                &&  (((sourcePos-1) == -1)          || isNumber(cArr[sourcePos-1]))
1254                &&  (((sourcePos+1) == s.length())  || isNumber(cArr[sourcePos+1]))
1255            )
1256                { cArr[destPos++] = cArr[sourcePos++]; continue; }
1257
1258            // AUC were converted before calling this function ... (alphaNumericAUC(c) != 0)) 
1259
1260            if (isChinese(c) || isAlphaNumeric(c))
1261                { cArr[destPos++] = cArr[sourcePos++]; continue; }
1262
1263            sourcePos++;
1264        }
1265
1266        return s;
1267    }
1268
1269    /**
1270     * Deletes all punctuation &amp; non-character symbols from a {@code String} of PinYin.
1271     * The returned {@code String} will have the same length as it originally did, but the
1272     * locations where punctuation existed will have been replaced with a space character.
1273     * 
1274     * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between
1275     * number/digits are not removed!
1276     *
1277     * @param s An input {@code String} in 罗马拼音
1278     *
1279     * @return A {@code String} that is the same as the input {@code String} - after skipping
1280     * characters as follows:
1281     *
1282     * <BR /><DIV CLASS="SNIP">{@code 
1283     * if (isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue;
1284     * (else) s = StringParse.setChar(s, chr, ' ');
1285     * }</DIV>
1286     */
1287    public static String delAllPunctuationPINYIN(String s)
1288    {
1289        char[] cArr = s.toCharArray();
1290
1291        // This loop cnverts all non-AlphaNumeric unicode to a space        
1292        for (int i = 0; i < cArr.length; i++)
1293        {
1294            char c = cArr[i];
1295
1296            if (isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue;
1297
1298            // Check for things like 5.0 or 1,120,987 - SPECIFICALLY Comma's and Period's
1299            // situated directly between 2 numbers.
1300
1301            if (    ((c == '.') || (c == ','))
1302                &&  (((i-1) == -1)          || isNumber(cArr[i-1]))
1303                &&  (((i+1) == s.length())  || isNumber(cArr[i+1]))
1304            )
1305                continue;
1306
1307            cArr[i] = ' ';
1308        }
1309
1310        return new String(cArr);
1311    }
1312
1313    // ****************************************************************************************
1314    // Constants
1315    // ****************************************************************************************
1316
1317    /** Special Quotation Mark, left-side */
1318    public static final char CONSTSpecialQuoteLeft = (char) 0x201C;
1319
1320    /** Special Quotation Mark, right-side */
1321    public static final char CONSTSpecialQuoteRight = (char) 0x201D;
1322
1323    /**
1324     * <B STYLE="color: red;">GTPPE: Google Translate Punctuation Pronunciation Equivalent</B>
1325     * This searches through a {@code String} to find the location of the "equivalent punctuation
1326     * mark"
1327     * 
1328     * @param s The input {@code String}, expected to be the result of a <B>GCS TS</B> query.  This
1329     * function is totally useless for any {@code Pronunciation String} that hasn't been obtained
1330     * from <B>GCS TS</B>.
1331     *
1332     * <BR /><BR /><B>NOTE:</B> The input {@code String} is intended to be in "PinYin" (罗马拼音)
1333     *
1334     * @param c The original punctuation character to look for...  Generally, this is used to
1335     * search for higher-level <B>UTF-8 {@code chars}</B> that have been "down-converted" by <B>GCS
1336     * TS</B>
1337     *
1338     * @return the {@code indexOf()} of the character in the original input String.  The actual
1339     * character is not looked for, BUT RATHER, the <B>Google Cloud Server Transation Services</B>
1340     * equivalent character.  Specifically, {@code GCSTS} has a "substitute punctuation" for many
1341     * higher-level <B>UTF-8</B> and <B>UniCode</B> chars.  There are 5 different versions of a
1342     * quote...
1343     */
1344    public static int GTPPEIndexOf(String s, char c)
1345    {
1346        int cc = (int) c;
1347
1348        // if (c == '∶')    return s.indexOf(c);
1349        if (cc == 0x2236)   return s.indexOf(c);
1350        // if (c == ':')    return s.indexOf(':');
1351        if (cc == 0xFF1A)   return s.indexOf(':');  // (0x003A);
1352        // if (c == ':')    return s.indexOf(c);    // Natural colon
1353        if (cc == 0x003A)   return s.indexOf(c);
1354
1355        // commas
1356        // if (c == '、')    return s.indexOf(',');
1357        if (cc == 0x3001)   return s.indexOf(',');  // (0x002C);
1358        // if (c == ',')    return s.indexOf(',');
1359        if (cc == 0xFF0C)   return s.indexOf(',');  // (0x002C);
1360        // if (c == ',')    return s.indexOf(c);    // natural comma
1361        if (cc == 0x002C)   return s.indexOf(c);
1362
1363        // periods
1364        // if (c == '。')    return s.indexOf('.');
1365        if (cc == 0x3002)   return s.indexOf('.');  // (0x002E);
1366        // if (c == '○')    return s.indexOf(c);
1367        if (cc == 0x25CB)   return s.indexOf(c);
1368        // if (c == '●')    return s.indexOf(c);
1369        if (cc == 0x25CF)   return s.indexOf(c);
1370        // if (c == '.')    return s.indexOf('.');
1371        if (cc == 0xFF0E)   return s.indexOf('.');  // (0x002E);
1372        // if (c == '.')    return s.indexOf(c);    // natural period
1373        if (cc == 0x002E)   return s.indexOf(c);
1374
1375
1376        // Exclamation & Question
1377        // if (c == '?')    return s.indexOf(c);    // natural question-mark
1378        if (cc == 0x003F)   return s.indexOf(c);
1379        // if (c == '?')    return s.indexOf('?');
1380        if (cc == 0xFF1F)   return s.indexOf('?');  // (0x003F);
1381        // if (c == '!')    return s.indexOf('!');
1382        if (cc == 0xFF01)   return s.indexOf('!');  // (0x0021);
1383        // if (c == '!')    return s.indexOf(c);    // natural exclamation
1384        if (cc == 0x0021)   return s.indexOf(c);
1385
1386        // single-quotes
1387        // if (c == '‘')    return s.indexOf(c);
1388        if (cc == 0x2018)   return s.indexOf(c);
1389        // if (c == '’')    return s.indexOf(c);
1390        if (cc == 0x2019)   return s.indexOf(c);
1391        // if (c == '′')    return s.indexOf(c);
1392        if (cc == 0x2032)   return s.indexOf(c);
1393        // if (c == ''')    return s.indexOf('\'');
1394        if (cc == 0xFF07)   return s.indexOf('\''); // (0x0027);
1395        // if (c == '`')    return s.indexOf('`');
1396        if (cc == 0xFF40)   return s.indexOf('`');  // (0x0060);
1397        // if (c == '\'')   return s.indexOf(c);    // natural single-quotes
1398        if (cc == 0x0027)   return s.indexOf(c);
1399 
1400
1401        // NOT DETECTED RIGHT NOW.. 
1402        // if (c == '《')    return s.indexOf('“');
1403        if (cc == 0x300A)   return s.indexOf(CONSTSpecialQuoteLeft);
1404        // if (c == '》')    return s.indexOf('”');
1405        if (cc == 0x300B)   return s.indexOf(CONSTSpecialQuoteRight);
1406
1407        // double-quotes
1408        // if (c == '¨')    return s.indexOf(c);
1409        if (cc == 0x00A8)   return s.indexOf(c);
1410        // if (c == '〃')    return s.indexOf(c);
1411        if (cc == 0x3003)   return s.indexOf(c);
1412        // if (c == '“')    return s.indexOf(c);
1413        if (cc == 0x201C)   return s.indexOf(c);
1414        // if (c == '”')    return s.indexOf(c);
1415        if (cc == 0x201D)   return s.indexOf(c);
1416        // if (c == '″')    return s.indexOf(c);
1417        if (cc == 0x2033)   return s.indexOf(c);
1418        // if (c == '"')    return s.indexOf('\"');
1419        if (cc == 0xFF02)   return s.indexOf('\"'); // (0x0022);
1420        // if (c == '\"')   return s.indexOf(c);    // natural double quotes
1421        if (cc == 0x0022)   return s.indexOf(c);
1422
1423
1424        // Brackets
1425        // if (c == '[')    return s.indexOf(c);
1426        if (cc == 0x005B)   return s.indexOf(c);
1427        // if (c == ']')    return s.indexOf(c);
1428        if (cc == 0x005D)   return s.indexOf(c);
1429        // if (c == '[')    return s.indexOf('[');
1430        if (cc == 0xFF3B)   return s.indexOf('[');  // (0x005B);
1431        // if (c == ']')    return s.indexOf(']');
1432        if (cc == 0xFF3D)   return s.indexOf(']');  // (0x005D);
1433        // if (c == '【')    return s.indexOf('[');
1434        if (cc == 0x3010)   return s.indexOf('[');  // (0x005B);
1435        // if (c == '】')    return s.indexOf(']');
1436        if (cc == 0x3011)   return s.indexOf(']');  // (0x005D);
1437        // if (c == '〖')    return s.indexOf(c);
1438        if (cc == 0x3016)   return s.indexOf(c);
1439        // if (c == '〗')    return s.indexOf(c);
1440        if (cc == 0x3017)   return s.indexOf(c);
1441        // if (c == '『')    return s.indexOf('“');
1442        if (cc == 0x300E)   return s.indexOf(CONSTSpecialQuoteLeft);
1443        // if (c == '』')    return s.indexOf('”');
1444        if (cc == 0x300F)   return s.indexOf(CONSTSpecialQuoteRight);
1445        // if (c == '「')    return s.indexOf('`');
1446        if (cc == 0x300C)   return s.indexOf('`');  // (0x0060);
1447        // if (c == '」')    return s.indexOf('\'');
1448        if (cc == 0x300D)   return s.indexOf('\''); // (0x0027);
1449
1450
1451        // Parenthesis
1452        // if (c == '(')    return s.indexOf(c);
1453        if (cc == 0x0028)   return s.indexOf(c);
1454        // if (c == ')')    return s.indexOf(c);
1455        if (cc == 0x0029)   return s.indexOf(c);
1456        // if (c == '(')    return s.indexOf('(');
1457        if (cc == 0xFF08)   return s.indexOf('(');  // (0x0028);
1458        // if (c == ')')    return s.indexOf(')');
1459        if (cc == 0xFF09)   return s.indexOf(')');  // (0x0029);
1460        // if (c == '〔')    return s.indexOf(c);
1461        if (cc == 0x3014)   return s.indexOf(c);
1462        // if (c == '〕')    return s.indexOf(c);
1463        if (cc == 0x3015)   return s.indexOf(c);
1464
1465        System.out.println("character not found: \'" + c + "\'\nZH.GTPPEIndexOf(String s, char c)");
1466        System.exit(0);
1467        return 0;
1468    }
1469}