ZH.java.html

package Torello.Languages;

import java.io.*;
import java.util.*;
import java.util.regex.*;
import Torello.Java.*;

/**
 * ZH (Mandarin Chinese) Many tools for parsing constructs from Mandarin News &amp; other
 * Web-Sites.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ZH>
 */
@Torello.JavaDoc.StaticFunctional
public class ZH
{
    // Static utility class: the private constructor keeps it from being instantiated elsewhere.
    private ZH() { }

    /*
     * Ad-hoc test driver.  Wraps the result of testAUC() in a small HTML page (declared as
     * UTF-8 through a META tag) and hands that page to FileRW.writeFile, using "out.html" as
     * the file name.
     *
     * The 'argv' parameter is not used.
     */
    static void main1(String[] argv) throws IOException
    {
        StringBuilder page = new StringBuilder()
            .append("<HTML>\n<HEAD>\n<TITLE>AUC Test</TITLE>\n")
            .append("<META http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n")
            .append("<BODY>\n")
            .append(testAUC() + "\n<BR />\n")
            .append("</BODY>\n</HTML>\n");

        FileRW.writeFile(page, "out.html");
    }

    /*
    static void main(String argv[]) throws IOException
    {
        for (int i=0; i < H2CV.length; i++)
            System.out.print(H2CV[i] + ":" + ((char) H2CV[i]) + ":" + CV2RV[i] + ",\t");
        String s = "À, É, à, á, è, é, ì, í, ò, ó, ù,  ú, ü, Ā, ā, ē, ě, ī, ō, ū, ǎ, ǐ, ǒ, ǔ";
        System.out.println(s);
        System.out.println(toneVowelsToRegularVowels(s));
    }
    */

    /*
     * Parallel lookup tables used by the PinYin tone-vowel helpers.  Position 'i' of each of the
     * three arrays describes the SAME character:
     *
     *   H2CV[i]   the decimal character code, exactly as it appears inside an HTML numeric
     *             escape (for instance 363 for "&#363;")
     *   CV[i]     that character as a Java 'char'
     *   CV2RV[i]  the plain ASCII character it is reduced to when the tone mark is removed
     *
     * Google Translate returns many of the PinYin Romanization results as an escape such as
     * "&#363;" INSTEAD OF the character itself; H2CV is the list of the codes that are converted.
     *
     * The apostrophe (39) and the un-toned 'ü' (252) are members of the tables although they
     * carry no tone mark (presumably because they show up escaped in the same results).
     *
     * The complete lower-case 'ü' tone series - ǖ (470), ǘ (472), ǚ (474), ǜ (476) - is listed,
     * which matches the set accepted by isToneVowel(char).
     *
     * The three arrays MUST keep the same length and the same ordering.
     */
    private static final int[] H2CV = { 39, 192, 201, 224, 225, 232, 233, 236, 237, 242, 243,
                249, 250, 252, 256, 257, 275, 283, 299, 333, 363, 462, 464, 466, 468, 470, 472,
                474, 476 };

    private static final char[] CV = { '\'', 'À', 'É', 'à', 'á', 'è', 'é', 'ì', 'í', 'ò', 'ó',
        'ù',  'ú', 'ü', 'Ā', 'ā', 'ē', 'ě', 'ī', 'ō', 'ū', 'ǎ', 'ǐ', 'ǒ', 'ǔ', 'ǖ', 'ǘ',
        'ǚ', 'ǜ' };

    private static final char[] CV2RV = { '\'', 'A', 'E', 'a', 'a', 'e', 'e', 'i', 'i', 'o', 'o',
        'u',  'u', 'u', 'A', 'a', 'e', 'e', 'i', 'o', 'u', 'a', 'i', 'o', 'u', 'u', 'u',
        'u', 'u' };

    // Matches one decimal HTML numeric escape, "&#NUM;".  Group 1 holds the digits.
    private static final Pattern P1 = Pattern.compile("&#(\\d+);", Pattern.CASE_INSENSITIVE);

    /**
     * Strips the tone / accent mark from a single Pin-Yin vowel.  The character is looked up in
     * the internal tone-vowel table, and the un-accented letter stored at the same position is
     * returned.  Removing the tone obviously destroys the meaning of the syllable, but it makes
     * certain {@code String} operations a lot simpler.
     *
     * <BR /><BR /><B>NOTE:</B> The apostrophe is a member of the table too, and it is returned
     * unchanged (which is a non-zero result).
     *
     * @param c any character from the <B>UniCode</B> Basic Multi Lingual Plane.
     *
     * @return the un-accented version of {@code 'c'} when {@code 'c'} is listed in the
     * tone-vowel table.  {@code ASCII 0} is returned for every other character.
     *
     * @see #toneVowelsToRegularVowels(String)
     */
    public static char toneVowelToRegularVowel(char c)
    {
        int idx = 0;

        while (idx < CV.length)
        {
            if (c == CV[idx]) return CV2RV[idx];
            idx++;
        }

        return '\0';
    }

    /**
     * Counts the characters of a <B>PinYin</B> {@code String} that are listed in the internal
     * tone-vowel table.
     *
     * <BR /><BR /><B>NOTE:</B> Every member of that table is counted.  The table also holds the
     * apostrophe and the un-toned {@code 'ü'}, so those two characters add to the result as
     * well.
     *
     * @param pinYinStr A {@code String}, usually generated by <B>Google Translate</B>, (and
     * scraped from Google Translate) that contains <B>PinYin.</B>
     *
     * @return The number of characters in {@code 'pinYinStr'} that were found in the table.
     */
    public static int countToneVowels(String pinYinStr)
    {
        int total = 0;

        for (int pos = 0; pos < pinYinStr.length(); pos++)
        {
            char    ch      = pinYinStr.charAt(pos);
            boolean listed  = false;

            for (char toneVowel : CV)
                if (toneVowel == ch) { listed = true; break; }

            if (listed) total++;
        }

        return total;
    }

    /**
     * {@code String} version of {@link #toneVowelToRegularVowel(char)}.  Every character of the
     * input that has an un-accented equivalent is replaced by that equivalent; every other
     * character is copied to the result unchanged.
     *
     * @param s any {@code java.lang.String} containing Mandarin Romanizations.
     *
     * @return a {@code String} of the same length as {@code 's'}, in which all accented vowels
     * have been converted to regular vowels.
     *
     * @see #toneVowelToRegularVowel(char)
     */
    public static String toneVowelsToRegularVowels(String s)
    {
        StringBuilder out = new StringBuilder(s.length());

        for (char original : s.toCharArray())
        {
            char plain = toneVowelToRegularVowel(original);

            // ASCII-0 means "no un-accented equivalent" - keep the character as it is
            out.append((plain == 0) ? original : plain);
        }

        return out.toString();
    }

    /**
     * Google Translate returns some text encoded as {@code "&#NUM;"}, where {@code NUM} is the
     * <B>decimal</B> character code (the "ord") of a character.  This method replaces such
     * escape sequences with the character itself - <I>but only for the character codes of the
     * internal Chinese tone-vowel table</I> ({@code H2CV}).  Any other escape sequence is left
     * in the text exactly as it was found.
     *
     * <BR /><BR />A few examples of the HTML-Escape-Sequence and the corresponding character:
     *
     * <BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>HTML-Escaped</TH><TH>Character</TH></TR>
     * <TR><TD>&amp;#192;</TD><TD>À</TD></TR>
     * <TR><TD>&amp;#225;</TD><TD>á</TD></TR>
     * <TR><TD>&amp;#283;</TD><TD>ě</TD></TR>
     * <TR><TD>&amp;#363;</TD><TD>ū</TD></TR>
     * <TR><TD>&amp;#474;</TD><TD>ǚ</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B>NOTE:</B> {@link #HTML2UTF8(String)} performs the same kind of conversion,
     * but does not limit the characters to be converted to this table.
     *
     * @param s any {@code String}, typically text scraped from Google Translate.
     *
     * @return {@code 's'}, with each listed escape sequence replaced by its character.
     *
     * @see #HTML2UTF8(String)
     */
    public static String HTML2ChineseVowels(String s)
    {
        String converted = s;

        // Neither the escape text nor the replacement needs regular-expression handling, so a
        // literal replace is sufficient.
        for (int code : H2CV)
            converted = converted.replace("&#" + code + ";", String.valueOf((char) code));

        return converted;
    }

    /**
     * Does the same as {@link #HTML2ChineseVowels(String)} <B><I>EXCEPT</I></B> that it converts
     * <B><I>ANY</I></B> character that has been encoded as a decimal escape {@code &amp;#NUM;} -
     * not just the characters having accents and corresponding to Chinese Tone Vowels.
     *
     * <BR /><BR />The input is scanned once, from left to right:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>The text produced by a conversion is never scanned again, so {@code "&amp;#38;#39;"}
     *     becomes {@code "&amp;#39;"} and nothing else.</LI>
     * <LI>{@code '$'} ({@code &amp;#36;}) and {@code '\'} ({@code &amp;#92;}) are inserted
     *     literally.</LI>
     * <LI>Code points above {@code 0xFFFF} are inserted as a surrogate pair.</LI>
     * <LI>Leading zeros ({@code &amp;#0065;}) are accepted.</LI>
     * <LI>An escape whose number is not a valid UniCode code point (or does not even fit into
     *     an {@code int}) is left in the text unchanged.</LI>
     * </UL>
     *
     * @param s any {@code String} that may contain decimal HTML escape sequences.
     *
     * @return {@code 's'}, with every valid {@code &amp;#NUM;} replaced by its character.
     *
     * @see #HTML2ChineseVowels(String)
     */
    public static String HTML2UTF8(String s)
    {
        Matcher         m       = P1.matcher(s);
        StringBuilder   sb      = new StringBuilder(s.length());
        int             copied  = 0;    // everything in 's' before this index is already in 'sb'

        while (m.find())
        {
            int codePoint;

            try
                { codePoint = Integer.parseInt(m.group(1)); }

            catch (NumberFormatException ignored)
            {
                // Too many digits for an 'int': this cannot be a character.  The escape stays
                // in the text, because 'copied' is not advanced past it.
                continue;
            }

            if (! Character.isValidCodePoint(codePoint)) continue;

            // Copy the text in front of the escape, then the character itself.  Appending to
            // the builder (rather than String.replaceAll) keeps '$' and '\' literal.
            sb.append(s, copied, m.start()).appendCodePoint(codePoint);

            copied = m.end();
        }

        return sb.append(s, copied, s.length()).toString();
    }

    /**
     * Builds a short description of a character: the character itself, followed - in
     * parenthesis - by its character code as an upper-case {@code HEXADECIMAL} number and as a
     * {@code decimal} number.
     *
     * <BR /><BR /><B>NOTE:</B> The hexadecimal number is not padded, so a small character code
     * produces fewer than four hexadecimal digits.
     *
     * @param c any character.  A "Chinese Character" is what is generally expected, but there
     * is no actual constraint on the input value.
     *
     * @return A {@code String} of this format: {@code 掭(0x63AD, 25517)}
     */
    public static String formatUTF8Chinese(char c)
    {
        int     code    = c;
        String  hex     = Integer.toHexString(code).toUpperCase();

        return c + "(0x" + hex + ", " + code + ")";
    }
    
    
    /**
     * Helper function - checks if this character lies in one of the UniCode ranges that contain
     * Chinese (Han) ideographs.  The same ideographs are shared with Japanese and Korean text,
     * so a {@code TRUE} result does not prove that the surrounding text is Chinese.
     *
     * <BR /><BR />The ranges that are checked:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Range</TH><TH>UniCode Block</TH></TR>
     * <TR><TD>{@code 0x4E00 ... 0x9FFF}</TD><TD>CJK Unified Ideographs</TD></TR>
     * <TR><TD>{@code 0x3400 ... 0x4DBF}</TD><TD>CJK Unified Ideographs Extension A</TD></TR>
     * <TR><TD>{@code 0xF900 ... 0xFAFF}</TD><TD>CJK Compatibility Ideographs</TD></TR>
     * </TABLE>
     *
     * <BR /><BR />The first range was copied from: <BR />
     * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tbluni.php?page=0" TARGET="_blank">
     * http://www.khngai.com/chinese/charmap/tbluni.php?page=0</A></CODE></B>
     *
     * <BR /><BR /><B>NOTE:</B> The GB2312 table
     * ({@code http://www.khngai.com/chinese/charmap/tblgb.php?page=1}) numbers its characters
     * {@code 0xB0A1 ... 0xF7FE}.  Those are GB2312 byte-codes, <B><I>not</I></B> UniCode values.
     * A Java {@code char} always holds a UTF-16 value, in which those numbers are Korean Hangul
     * syllables, surrogates and private-use characters - so they are not accepted here.
     *
     * @param c any UniCode character from the Basic Multi Lingual Plane.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is in one of the ranges listed
     * above, and {@code FALSE} otherwise.
     */
    public static boolean isChinese(char c)
    {
        // CJK Unified Ideographs
        if ((c >= 0x4E00) && (c <= 0x9FFF)) return true;

        // CJK Unified Ideographs Extension A
        if ((c >= 0x3400) && (c <= 0x4DBF)) return true;

        // CJK Compatibility Ideographs
        if ((c >= 0xF900) && (c <= 0xFAFF)) return true;

        return false;
    }

    /**
     * Checks that a {@code char} is neither {@code Alpha Numeric} nor {@code White Space}, as
     * those two terms are defined by {@link #isAlphaNumeric(char)} and {@link #isSpace(char)}.
     *
     * @param c any UniCode character available.
     *
     * @return {@code !(isAlphaNumeric(c) || isSpace(c))}
     */
    public static boolean isOther(char c)
    { return ! (isAlphaNumeric(c) || isSpace(c)); }

    /**
     * Checks if a {@code char} is Alpha Numeric: either a letter according to
     * {@link #isAlpha(char)}, or a digit according to {@link #isNumber(char)}.
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} when {@code isAlpha(c)} or {@code isNumber(c)} is {@code TRUE}.
     */
    public static boolean isAlphaNumeric(char c)
    {
        if (isAlpha(c)) return true;

        return isNumber(c);
    }

    /**
     * Checks if a {@code char} is Alphabetic: a Pin-Yin tone vowel, a regular vowel, or a
     * regular (lower-level ASCII) letter.
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} when any one of {@code isToneVowel(c)}, {@code isRegVowel(c)} or
     * {@code isRegLetter(c)} is {@code TRUE}.
     */
    public static boolean isAlpha(char c)
    {
        if (isToneVowel(c)) return true;
        if (isRegVowel(c))  return true;

        return isRegLetter(c);
    }

    /**
     * This is a helper function for the Mandarin Chinese accented vowel symbols (vowels that
     * carry a Pin-Yin tone mark).  The exact character code numbers are printed below.
     *
     * <BR /><BR /><B>NOTE:</B> In 罗马拼音 (Pin-Yin Romanization), there are a few symbols that
     * should never come up - at least as the software pertains to 罗马拼音-results provided by
     * <B>Google Cloud Server Translation API</B> {@code (GCS-TS/TAPI)}.  This is because
     * <B><I>NO</I></B> word in Pin-Yin ever starts with the letters I or U, or the U with an
     * umlaut - <B><I>so</I></B> - capitalized versions of these letters ought to never occur -
     * unless the entire PinYin were capitalized - which is something GCSTS never does.
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is one of the following:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Simple ASCII</TH><TH>Tone Vowel</TH></TR>
     * <TR><TD>a</TD><TD> ā (257), á (225), ǎ (462), à (224)</TD></TR>
     * <TR><TD>e</TD><TD> ē (275), é (233), ě (283), è (232)</TD></TR>
     * <TR><TD>i</TD><TD> ī (299), í (237), ǐ (464), ì (236)</TD></TR>
     * <TR><TD>o</TD><TD> ō (333), ó (243), ǒ (466), ò (242)</TD></TR>
     * <TR><TD>u</TD><TD> ū (363), ú (250), ǔ (468), ù (249)</TD></TR>
     * <TR><TD>u</TD><TD> ǖ (470), ǘ (472), ǚ (474), ǜ (476)</TD></TR>
     * <TR><TD>A</TD><TD> Ā (256), Á (193), Ǎ (461), À (192)</TD></TR>
     * <TR><TD>E</TD><TD> Ē (274), É (201), Ě (282), È (200)</TD></TR>
     * <TR><TD>O</TD><TD> Ō (332), Ó (211), Ǒ (465), Ò (210)</TD></TR>
     * </TABLE>
     *
     * <BR />In Mandarin Chinese, PinYin-words cannot start with the letters below.  It would be
     * highly unlikely to see a "capitalized" version of these tone-vowels, and this method
     * returns {@code FALSE} for all of them.  The un-toned ü (252) and Ü (220) are not accepted
     * either.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Simple ASCII</TH><TH>Tone Vowel (not accepted)</TH></TR>
     * <TR><TD>I</TD><TD>Ī (298), Í (205), Ǐ (463), Ì (204)</TD></TR>
     * <TR><TD>U</TD><TD>Ū (362), Ú (218), Ǔ (467), Ù (217)</TD></TR>
     * <TR><TD>U</TD><TD>Ǖ (469), Ǘ (471), Ǚ (473), Ǜ (475)</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B>NOTE:</B> The GB2312 character table places its own copies of the tone
     * vowels at {@code 0xA8A1 ... 0xA8BA}.  Those numbers are GB2312 byte-codes.  In a Java
     * {@code char} (UTF-16) the same numbers denote unrelated characters, so that range is
     * <B><I>not</I></B> tested.
     */
    public static boolean isToneVowel(char c)
    {
        switch (c)
        {
            // a:   ā           á           ǎ           à
            case 257:   case 225:   case 462:   case 224:

            // e:   ē           é           ě           è
            case 275:   case 233:   case 283:   case 232:

            // i:   ī           í           ǐ           ì
            case 299:   case 237:   case 464:   case 236:

            // o:   ō           ó           ǒ           ò
            case 333:   case 243:   case 466:   case 242:

            // u:   ū           ú           ǔ           ù
            case 363:   case 250:   case 468:   case 249:

            // ü:   ǖ           ǘ           ǚ           ǜ
            case 470:   case 472:   case 474:   case 476:

            // *******
            // Capital vowels with tone symbols

            // A:   Ā           Á           Ǎ           À
            case 256:   case 193:   case 461:   case 192:

            // E:   Ē           É           Ě           È
            case 274:   case 201:   case 282:   case 200:

            // O:   Ō           Ó           Ǒ           Ò
            case 332:   case 211:   case 465:   case 210:

                return true;

            default:
                return false;
        }
    }

    /**
     * Checks that a character is a standard (un-accented, lower-level ASCII) vowel.
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} EQUALS one of these ten letters:
     * <B>a, e, i, o, u, A, E, I, O, U</B>
     */
    public static boolean isRegVowel(char c)
    {
        switch (c)
        {
            case 'a': case 'e': case 'i': case 'o': case 'u':
            case 'A': case 'E': case 'I': case 'O': case 'U':
                return true;

            default:
                return false;
        }
    }

    /**
     * Regular Letters Include: {@code 'A' ... 'Z'} (65 - 90),  {@code 'a' ... 'z'} (97 - 122)
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is one of the 52 letters of
     * lower-level <B>ASCII</B> (and not any of the AUC).
     */
    public static boolean isRegLetter(char c)
    {
        boolean upperCase = (c >= 'A') && (c <= 'Z');
        boolean lowerCase = (c >= 'a') && (c <= 'z');

        return upperCase || lowerCase;
    }

    /**
     * Regular Numbers Include: {@code '0' ... '9'} (48 - 57)
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is one of the ten ASCII digits
     * {@code '0' ... '9'} (not any of the AUC)
     */
    public static boolean isNumber(char c)
    { return ('0' <= c) && (c <= '9'); }

    /**
     * Checks for WhiteSpace: {@code '\t', '\n', '\r', ' '}
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Character</TH><TH>ASCII-Code</TH></TR>
     * <TR><TD>tab              </TD><TD>DEC:  9</TD></TR>
     * <TR><TD>line-feed        </TD><TD>DEC: 10</TD></TR>
     * <TR><TD>carriage-return  </TD><TD>DEC: 13</TD></TR>
     * <TR><TD>space            </TD><TD>DEC: 32</TD></TR>
     * </TABLE>
     *
     * @param c any UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is a whitespace character code from
     * the above list
     */
    public static boolean isSpace(char c)
    {
        // Character literals are used on purpose.  The decimal codes of line-feed and
        // carriage-return are 10 and 13; "12" and "15" are merely their octal spelling.
        return (c == '\t') || (c == '\n') || (c == '\r') || (c == ' ');
    }


    /**
     * Bullet List characters in upper {@code UniCode}.  These characters are occasionally used
     * in documents found on Chinese News Websites, where they serve as numbered "bullet-list"
     * points.  The number that such a character stands for is returned as an integer.
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f</LI>
     * <LI>N ⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖</LI>
     * <LI>⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾</LI>
     * <LI>⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦</LI>
     * <LI>⑧ ⑨ ⑩ N N ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩ N</LI>
     * <LI>N Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ</LI>
     * </UL>
     *
     * @param c any character as input
     *
     * @return The number represented by this bullet point (1 or greater), or zero when
     * {@code 'c'} is not one of the characters listed above.
     */
    public static int bulletListAUC(char c)
    {
        // Each entry is one run of consecutive bullet characters:
        // { code of the bullet meaning "1", number of bullets in the run }
        final int[][] runs =
        {
            { 0x2488, 20 },     // ⒈ ==> ⒛
            { 0x2474, 20 },     // ⑴ ==> ⒇
            { 0x2460, 10 },     // ① ==> ⑩
            { 0x3220, 10 },     // ㈠ ==> ㈩
            { 0x2160, 12 }      // Ⅰ ==> Ⅻ
        };

        for (int[] run : runs)
        {
            int offset = c - run[0];

            if ((offset >= 0) && (offset < run[1])) return offset + 1;
        }

        return 0;
    }

    /**
     * Alpha-Numeric character codes from upper UniCode (the "Full-Width" letters and digits).
     *
     * <BR /><BR />These characters ARE NOT the usual ASCII characters for the letters
     * {@code 'A' ... 'Z'}, {@code 'a' ... 'z'} or the numbers {@code '0' ... '9'}  They,
     * however, are sometimes found in documents on Chinese News Websites, etc.
     *
     * <BR /><BR />Copied from:<BR />
     * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tblgb.php?page=0" TARGET="_blank">
     * http://www.khngai.com/chinese/charmap/tblgb.php?page=0</A></CODE></B>
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f </LI>
     * <LI>！ ＂ ＃ ￥ ％ ＆ ＇ （ ） ＊ ＋ ， － ． ／</LI>
     * <LI>０ １ ２ ３ ４ ５ ６ ７ ８ ９ ： ； ＜ ＝ ＞ ？</LI>
     * <LI>＠ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ</LI>
     * <LI>Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ ［ ＼ ］ ＾ ＿</LI>
     * <LI>ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ</LI>
     * <LI>ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ ｛ ｜ ｝ ￣</LI>
     * </UL>
     *
     * <BR /><BR />Only the letters and the digits of this table are converted:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Input Range</TH><TH>Output</TH></TR>
     * <TR><TD>{@code 0xFF21 ... 0xFF3A} (Ａ - Ｚ)</TD><TD>{@code 'A' ... 'Z'}</TD></TR>
     * <TR><TD>{@code 0xFF41 ... 0xFF5A} (ａ - ｚ)</TD><TD>{@code 'a' ... 'z'}</TD></TR>
     * <TR><TD>{@code 0xFF10 ... 0xFF19} (０ - ９)</TD><TD>{@code '0' ... '9'}</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B>NOTE:</B> The punctuation marks of the table (including the full-width
     * colon {@code 0xFF1A}, which directly follows the digit ９) are not alpha-numeric, and
     * produce ASCII-0 here.
     *
     * @param c any character as input
     *
     * @return the "lower-level-ASCII" version of that character, or ASCII-0 if {@code 'c'} is
     * not a full-width letter or digit.
     */
    public static char alphaNumericAUC(char c)
    {
        // Ａ ==> Ｚ
        if ((c >= 0xFF21) && (c <= 0xFF3A)) return (char) ('A' + (c - 0xFF21));

        // ａ ==> ｚ
        if ((c >= 0xFF41) && (c <= 0xFF5A)) return (char) ('a' + (c - 0xFF41));

        // ０ ==> ９   (the range ends at 0xFF19; 0xFF1A is the full-width colon)
        if ((c >= 0xFF10) && (c <= 0xFF19)) return (char) ('0' + (c - 0xFF10));

        return 0;
    }

    /**
     * This method, {@code punctuationAUC(char)}, converts punctuation characters which are
     * common on many Mandarin Chinese websites into a lower-level, more typical/normal ASCII
     * equivalent.  This can be very useful when trying to make sense of brackets, parenthesis,
     * quotes, commas and other punctuation marks - and quickly convert them into a simple
     * version of the character.
     *
     * <BR /><BR />If the input character has an "Alternate Version" in the lower-level-ASCII
     * range, that lower level ASCII character is returned.  If this isn't AUC, ASCII-0 is
     * returned.
     *
     * <BR /><BR /><B>For Instance:</B>
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Input</TH><TH>Output</TH></TR>
     * <TR><TD>〖 〗 【 】      </TD><TD> [ ] [ ] </TD></TR>
     * <TR><TD> 。 ○ ● ．       </TD><TD>. (ASCII-period) </TD></TR>
     * <TR><TD>¨ 〃 “ ” ″ ＂    </TD><TD>" (ASCII-double-quote) </TD></TR>
     * <TR><TD>；               </TD><TD>; (ASCII-semi-colon) </TD></TR>
     * <TR><TD>, (ASCII-comma)  </TD><TD>ASCII-0 </TD></TR>
     * <TR><TD>+ (ASCII-plus)   </TD><TD>ASCII-0 </TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B>NOTE:</B> The mapping is a "simplification" rather than a strict
     * one-to-one translation: several different input characters may produce the same ASCII
     * character, and the full-width grave accent (｀) is returned as an ASCII single-quote.
     * Full-width letters and digits are not handled here, see {@code alphaNumericAUC(char)}.
     *
     * @param c any character as input
     *
     * @return the "lower-level-ASCII" version of that character
     *
     * <BR /><BR /><B>NOTE:</B> ASCII-0 is returned if this is not a valid "AUC"
     * {@code UniCode} code!  An input that already is plain ASCII also produces ASCII-0.
     */
    public static char punctuationAUC(char c)
    {
        // Copied from:
        // *** http://www.khngai.com/chinese/charmap/tblgb.php?page=0
        //
        // 0 2 3 4 5 6 7 8 9 a b c d e f
        // N N 、 。 · ˉ ˇ ¨ 〃 々 — ～ ‖ … ‘ ’
        // “ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】
        // ± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠
        // ⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵
        // ∴ ♂ ♀ ° ′ ″ ℃ ＄ ¤ ￠ ￡ ‰ § № ☆ ★
        // ○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓
        //
        // 0 1 2 3 4 5 6 7 8 9 a b c d e f
        // ！ ＂ ＃ ￥ ％ ＆ ＇ （ ） ＊ ＋ ， － ． ／
        // ０ １ ２ ３ ４ ５ ６ ７ ８ ９ ： ； ＜ ＝ ＞ ？
        // ＠ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ
        // Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ ［ ＼ ］ ＾ ＿
        // ｀ ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ
        // ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ ｛ ｜ ｝ ￣

        switch (c)
        {
            // 、 ，
            case 0x3001:               // 、
            case 0xFF0C: return ',';   // ，

            // 。 ○ ● ．
            case 0x3002:               // 。
            case 0x25CB:               // ○
            case 0x25CF:               // ●
            case 0xFF0E: return '.';   // ．

            // ‘ ’ ′ ＇ ｀
            case 0x2018:               // ‘
            case 0x2019:               // ’
            case 0x2032:               // ′
            case 0xFF07:               // ＇
            case 0xFF40: return '\'';  // ｀

            // ¨ 〃 “ ” ″ ＂
            case 0x00A8:               // ¨
            case 0x3003:               // 〃
            case 0x201C:               // “
            case 0x201D:               // ”
            case 0x2033:               // ″
            case 0xFF02: return '\"';  // ＂

            // 〔 （
            case 0x3014:               // 〔
            case 0xFF08: return '(';   // （

            // 〕 ）
            case 0x3015:               // 〕
            case 0xFF09: return ')';   // ）

            // 〈 ＜
            case 0x3008:               // 〈
            case 0xFF1C: return '<';   // ＜

            // 〉 ＞
            case 0x3009:               // 〉
            case 0xFF1E: return '>';   // ＞

            // 「 『 〖 【 ［
            case 0x300C:               // 「
            case 0x300E:               // 『
            case 0x3016:               // 〖
            case 0x3010:               // 【
            case 0xFF3B: return '[';   // ［

            // 」 』 〗】 ］
            case 0x300D:               // 」
            case 0x300F:               // 』
            case 0x3017:               // 〗
            case 0x3011:               // 】
            case 0xFF3D: return ']';   // ］

            // ∶ ：
            case 0x2236:               // ∶
            case 0xFF1A: return ':';   // ：

            // The full-width semi-colon.  Without this entry, no input at all could be
            // converted into an ASCII semi-colon.
            case 0xFF1B: return ';';   // ；

            case 0xFF01: return '!';   // ！
            case 0xFF03: return '#';   // ＃
            case 0xFF04: return '$';   // ＄
            case 0xFF05: return '%';   // ％
            case 0xFF06: return '&';   // ＆
            case 0xFF0A: return '*';   // ＊
            case 0xFF1F: return '?';   // ？
            case 0xFF0F: return '/';   // ／
            case 0xFF20: return '@';   // ＠
            case 0xFF3E: return '^';   // ＾
            case 0xFF5B: return '{';   // ｛
            case 0xFF5D: return '}';   // ｝
            case 0xFF5C: return '|';   // ｜
            case 0xFF5E: return '~';   // ～
            case 0xFF0B: return '+';   // ＋
            case 0xFF3C: return '\\';  // ＼
            case 0xFF3F: return '_';   // ＿

            // — －
            case 0x2014:               // —
            case 0xFF0D: return '-';   // －

            // 〓 ＝
            case 0x3013:               // 〓
            case 0xFF1D: return '=';   // ＝
        }

        // Not an "AUC" punctuation mark (this includes every plain ASCII character)
        return 0;
    }

    /**
     * Bo Po Mo Fo (注音符號).
     *
     * <BR /><BR />This is a popular pronunciation system for Mandarin Characters in Taiwan &amp;
     * Hong Kong.
     *
     * <BR /><BR />The table below starts at {@code 0x3100}, and has 16 characters per row.
     * The letter {@code 'N'} marks a position that is not accepted by this method.
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ</LI>
     * <LI>ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ</LI>
     * <LI>ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N</LI>
     * </UL>
     *
     * @param c any UniCode character available from {@code Plane 0}, the
     * <B>Basic Multi-Lingual Plane</B>
     *
     * @return {@code TRUE} if the input character {@code 'c'} is one of the 37 symbols listed
     * above.  The {@code HEXADECIMAL} representation of this <B>'Bo Po Mo Fo'</B> range is:
     * {@code 0x3105 ... 0x3129} (ㄅ through ㄩ).
     */
    public static boolean isBPMFAUC(char c)
    {
        // 0 1 2 3 4 5 6 7 8 9 a b c d e f
        // N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ          0x3100 - first symbol is 0x3105
        // ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ      0x3110
        // ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N          0x3120 - last symbol is 0x3129

        // The lower bound has to be 0x3105.  Starting at 0x3110 would leave out the whole
        // first row: ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ
        return (c >= 0x3105) && (c <= 0x3129);
    }

    /**
     * Checks for the "alternate UniCode" (AUC) versions of the end-of-sentence punctuation
     * marks, and "down-converts" them to the simple ASCII equivalent.  The input is handed to
     * {@code punctuationAUC(char)}; if the result of that conversion is a period, an
     * exclamation-point or a question-mark, it is returned.  In every other case ASCII-zero is
     * returned.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Input Character</TH><TH>Output Character</TH></TR>
     * <TR><TD>。 ○ ● ．</TD><TD>'.' (normal period, DEC: 46)</TD></TR>
     * <TR><TD>！</TD><TD>'!' (regular exclamation point, DEC: 33)</TD></TR>
     * <TR><TD>？</TD><TD>'?' (usual question mark, DEC: 63)</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B>NOTE:</B> if a lower-level-ASCII (normal) punctuation mark is input - then
     * ASCII-0 is returned.
     *
     * <BR /><BR /><B>SPECIFICALLY:</B> with {@code '.' '?'} and {@code '!'} as input to this
     * function, ASCII-0 will be returned.
     *
     * <BR /><BR /><B>USE:</B> {@code endOfSentence(c)} to have those punctuation marks included in
     * non-zero results.
     *
     * @param c any UniCode character available.
     *
     * @return {@code '.'}, {@code '!'} or {@code '?'} if the input character {@code 'c'} is an
     * AUC version of that punctuation mark, and ASCII-0 otherwise.
     *
     * @see #endOfSentence(char)
     */
    public static char endOfSentenceAUC(char c)
    {
        // ASCII-0 when 'c' has no lower-level-ASCII alternative
        final char ascii = punctuationAUC(c);

        // These characters identify an "End of Sentence" marker.
        if ((ascii == '.') || (ascii == '!') || (ascii == '?')) return ascii;

        // All other characters result in a '0'
        return '\0';
    }

    /**
     * Checks for end-of-sentence punctuation marks.  This Helper function is *almost* identical
     * to the {@code endOfSentenceAUC(c)} method.
     *
     * <BR /><BR />{@code endOfSentenceAUC(c)} returns ASCII-0 for the usual punctuation marks -
     * {@code '.', '!'} and {@code '?'}.
     *
     * <BR /><BR />{@code endOfSentence(c)} accepts these lower-level-ASCII punctuation symbols,
     * and returns them as they are.
     *
     * @param c any UniCode character available.
     *
     * @return If the input character {@code 'c'} is a period {@code ('.')}, an exclamation-point
     * {@code ('!')}, or a question-mark {@code ('?')} - <B><I>or an AUC version of that
     * punctuation,</I></B> then that (ASCII) punctuation is returned.  Otherwise ASCII-0 is
     * returned.
     *
     * @see #endOfSentenceAUC(char)
     */
    public static char endOfSentence(char c)
    {
        final char converted = endOfSentenceAUC(c);

        // Use the down-converted mark if there is one, and the input itself if there is none
        final char candidate = (converted != 0) ? converted : c;

        switch (candidate)
        {
            // These three characters identify an "End of Sentence" Marker
            case '.':
            case '!':
            case '?':
                return candidate;

            default:
                return '\0';
        }
    }

    /**
     * Checks for the "alternate UniCode" (AUC) versions of the end-of-phrase punctuation marks,
     * and "down-converts" them to the simple ASCII equivalent.  The input is handed to
     * {@code punctuationAUC(char)}; if the result of that conversion is one of the phrase
     * delimiting marks of the table below, it is returned.  In every other case ASCII-zero is
     * returned.
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Punctuation</TH><TH>Symbol and ASCII-Code</TH></TR>
     * <TR><TD>semi-colon       </TD><TD>';'    HEX:0x3B, DEC: 59</TD></TR>
     * <TR><TD>comma            </TD><TD>','    HEX:0x2C, DEC: 44</TD></TR>
     * <TR><TD>colon            </TD><TD>':'    HEX:0x3A, DEC: 58</TD></TR>
     * <TR><TD>double-quote     </TD><TD>'\"'   HEX:0x22, DEC: 34</TD></TR>
     * <TR><TD>single-quote     </TD><TD>'\''   HEX:0x27, DEC: 39</TD></TR>
     * <TR><TD>left-bracket     </TD><TD>'['    HEX:0x5B, DEC: 91</TD></TR>
     * <TR><TD>right-bracket    </TD><TD>']'    HEX:0x5D, DEC: 93</TD></TR>
     * <TR><TD>less-than        </TD><TD>'&lt;' HEX:0x3C, DEC: 60</TD></TR>
     * <TR><TD>greater-than     </TD><TD>'&gt;' HEX:0x3E, DEC: 62</TD></TR>
     * <TR><TD>left-paren       </TD><TD>'('    HEX:0x28, DEC: 40</TD></TR>
     * <TR><TD>right-paren      </TD><TD>')'    HEX:0x29, DEC: 41</TD></TR>
     * </TABLE>
     *
     * <BR /><BR />
     * <B>IMPORTANT NOTE:</B> *only* the upper-level-UniCode versions of these punctuation marks
     * will produce a non-zero result.  An actual ASCII comma, semi-colon, quote, bracket, or
     * parenthesis (etc...) will cause this method to return ASCII-0.  Please use
     * {@code endOfPhrase(char)} to include the lower-level (already down-converted ASCII) marks
     * with non-zero results.
     *
     * @param c any UniCode character available.
     *
     * @return one of the ASCII punctuation marks of the table above, if
     * {@code punctuationAUC(c)} produces that mark - and ASCII-0 otherwise.
     *
     * @see #endOfPhrase(char)
     */
    public static char endOfPhraseAUC(char c)
    {
        // ASCII-0 when 'c' has no lower-level-ASCII alternative
        final char ascii = punctuationAUC(c);

        switch (ascii)
        {
            // These characters constitute an "End of Phrase" marker
            case ';':   case ',':   case ':':
            case '\"':  case '\'':
            case '[':   case ']':
            case '<':   case '>':
            case '(':   case ')':
                return ascii;

            // All other results return '0'
            default:
                return '\0';
        }
    }

    /**
     * Recognizes an end-of-phrase marker in either of its forms: the "AUC" (alternate Uni-Code)
     * punctuation commonly found in Mandarin Chinese text, or the ordinary ASCII punctuation
     * mark.  The result is the same as {@code endOfPhraseAUC(char)},
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">EXCEPT:</SPAN></B>
     * a regular ASCII punctuation mark (semi-colon, comma, quote, etc...) is returned as itself
     * <I><B>instead of</B></I> producing ASCII-0.
     *
     * <BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Input &amp; Method Called:</TH><TH>Result</TH></TR>
     * <TR><TD>endOfPhrase(';')     </TD><TD>';'    // Normal ASCII semi-colon symbol</TD></TR>
     * <TR><TD>endOfPhraseAUC(';')  </TD><TD>0      // ASCII-0 returned</TD></TR>
     * <TR><TD>endOfPhrase('】')    </TD><TD>']'    // right-bracket returned</TD></TR>
     * <TR><TD>endOfPhraseAUC('】') </TD><TD>']'    // right-bracket returned</TD></TR>
     * <TR><TD>endOfPhrase(']')     </TD><TD>']'    // right-bracket returned</TD></TR>
     * <TR><TD>endOfPhraseAUC(']')  </TD><TD>0      // ASCII-0 returned</TD></TR>
     * </TABLE>
     *
     * <BR /><BR />
     * The end-of-phrase characters are:<BR />
     * <B STYLE="color:red">{@code ';' ',' ':' '\"' '\'' '[' ']' '<' '>' '(' ')'}</B>
     * 
     * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF
     * 
     * @return The ASCII punctuation mark, when {@code 'c'} is an end-of-phrase marker (either an
     * "AUC" version that {@code punctuationAUC(char)} converts, or the ASCII character itself).
     * Otherwise 0 is returned.
     * 
     * @see #punctuationAUC(char)
     */
    public static char endOfPhrase(char c)
    {
        final char converted = punctuationAUC(c);

        // Prefer the down-converted character; fall back to the input when nothing was converted
        final char mark = (converted == 0) ? c : converted;

        switch (mark)
        {
            case ';':  case ',':  case ':':
            case '\"': case '\'':
            case '[':  case ']':
            case '<':  case '>':
            case '(':  case ')':
                return mark;

            default:
                return (char) 0;
        }
    }

    /**
     * Identifies a quotation mark of any kind: &nbsp;&nbsp;<B><I>"AUC" or plain ASCII
     * (BOTH)</I></B>, single or double.
     * 
     * @param c Any character in the entire <B>UniCode</B> range {@code 0x0000 to 0xFFFF}, the
     * {@code Basic Multi Lingual Plane}.
     * 
     * @return The ASCII double-quote (DEC: 34) or ASCII single-quote (DEC: 39) when {@code 'c'}
     * is such a quote, or is converted into one by {@code punctuationAUC(char)}.  Otherwise 0 is
     * returned.
     * 
     * @see #punctuationAUC(char)
     */
    public static char quoteAUC(char c)
    {
        final char converted = punctuationAUC(c);

        // Use the down-converted character when there is one, the input itself otherwise
        final char candidate = (converted != 0) ? converted : c;

        if (candidate == '\"') return '\"';
        if (candidate == '\'') return '\'';

        return (char) 0;
    }

    /**
     * Identifies a comma of any kind: &nbsp;&nbsp;<B><I>"AUC" or plain ASCII (BOTH)</I></B>.
     *
     * @param c Any character in the range {@code 0x0000 to 0xFFFF}, the
     * {@code Basic Multi-Lingual Plane}.
     *
     * @return The ASCII comma (DEC: 44) when {@code 'c'} is a comma, or is converted into one by
     * {@code punctuationAUC(char)}.  Otherwise 0 is returned.
     *
     * @see #punctuationAUC(char)
     */
    public static char commaAUC(char c)
    {
        final char converted = punctuationAUC(c);

        // Use the down-converted character when there is one, the input itself otherwise
        final char candidate = (converted != 0) ? converted : c;

        return (candidate == ',') ? ',' : (char) 0;
    }

    /**
     * Identifies a bracket of any kind: &nbsp;&nbsp;<B><I>"AUC" or plain ASCII (BOTH)</I></B>.
     * Square brackets and angle brackets (less-than / greater-than) are both treated as
     * brackets.
     * 
     * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF
     * 
     * @return One of {@code '['}, {@code ']'}, {@code '<'} or {@code '>'} when {@code 'c'} is
     * such a bracket, or is converted into one by {@code punctuationAUC(char)}.  Otherwise 0 is
     * returned.
     * 
     * @see #punctuationAUC(char)
     */
    public static char bracketAUC(char c)
    {
        final char converted = punctuationAUC(c);

        // Use the down-converted character when there is one, the input itself otherwise
        final char candidate = (converted != 0) ? converted : c;

        final boolean square = (candidate == '[') || (candidate == ']');     // DEC: 91, 93
        final boolean angle  = (candidate == '<') || (candidate == '>');     // DEC: 60, 62

        if (square || angle) return candidate;

        return (char) 0;
    }

    /**
     * Identifies a parenthesis of any kind: &nbsp;&nbsp;<B><I>"AUC" or plain ASCII
     * (BOTH)</I></B>.
     *
     * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF
     *
     * @return {@code '('} or {@code ')'} when {@code 'c'} is such a parenthesis, or is converted
     * into one by {@code punctuationAUC(char)}.  Otherwise 0 is returned.
     *
     * @see #punctuationAUC(char)
     */
    public static char parenAUC(char c)
    {
        final char converted = punctuationAUC(c);

        // Use the down-converted character when there is one, the input itself otherwise
        final char candidate = (converted != 0) ? converted : c;

        if (candidate == '(') return '(';   // DEC: 40
        if (candidate == ')') return ')';   // DEC: 41

        return (char) 0;
    }
     
    /**
     * The list of "higher-level" (alternate) Uni-Code chars.  Many of these are alternate
     * punctuation marks used in documents that contain Mandarin Chinese.
     *
     * <BR /><BR />Within each literal below the entries are separated by a single space
     * character.  The literals are simply concatenated, and several of them do not end with a
     * space, so the last entry of such a literal is immediately followed by the first entry of
     * the next one.  Code that walks this {@code String} should therefore process it one
     * {@code char} at a time and skip the space characters, rather than splitting on spaces.
     */
    public static final String AUC = 
        // Special Punctuation characters found in Chinese HTML Pages
        "、 。 · ˉ ˇ ¨ 〃 々 — ～ ‖ … ‘ ’ "             +
        "“ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】"	  +
        "± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠"               +
        "⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵ "           +
        "∴ ♂ ♀ ° ′ ″ ℃ ＄ ¤ ￠ ￡ ‰ § № ☆ ★"          +
        "○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓 "            +
        "！ ＂ ＃ ￥ ％ ＆ ＇ （ ） ＊ ＋ ， － ． ／"      +

        // Extra Alphabetic and Numeric Characters sometimes used
        // on web-pages written in Chinese
        // (full-width digits, letters and ASCII-style symbols)
        "０ １ ２ ３ ４ ５ ６ ７ ８ ９ ： ； ＜ ＝ ＞ ？"   +
        "＠ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ"   +
        "Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ ［ ＼ ］ ＾ ＿"   +
        "｀ ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ"   +
        "ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ ｛ ｜ ｝ ￣"      +

        // Certain "Bullet List" / "Bullet Point" markers
        // (numbers followed by a period, parenthesized, circled, and Roman numerals)
        "⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖"      +
        "⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾"   +
        "⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦"         +
        "⑧ ⑨ ⑩ ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩"               +
        "Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ"               +

        // The "Bo Po Mo Fo" Pronunciation Used for Chinese Characters
        "ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ"   +
        "ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ";

    /**
     * Builds an HTML &lt;TABLE&gt; that exercises the "AUC" subroutines of this class against
     * every non-space character of {@link #AUC}.
     *
     * <BR /><BR />Each character produces one table row of twelve cells: the character itself,
     * its decimal code, its hexadecimal code, its ASCII stand-in ({@code 'x'} when none is
     * known), followed by eight classification cells (Quote, Sentence, Phrase, Comma, Bracket,
     * Paren, Bullet, BPMF) that are left empty when the classification does not apply.
     *
     * @return An HTML &lt;TABLE&gt; that contains many tests of the subroutines in this class
     */
    public static String testAUC()
    {
        StringBuilder ret = new StringBuilder();

        // Header row: twelve blank, fixed-width cells
        ret.append( "<TABLE BORDER=\"1\"><TR>"      +
                    "<TD WIDTH=\"30\">&nbsp;</TD>"  +
                    "<TD WIDTH=\"70\">&nbsp;</TD>"  +
                    "<TD WIDTH=\"70\">&nbsp;</TD>"  +
                    "<TD WIDTH=\"30\">&nbsp;</TD>"  );

        for (int i=4; i < 12; i++)
            ret.append("<TD WIDTH=\"70\">&nbsp;</TD>");
        ret.append("</TR>");

        for (int i=0; i < AUC.length(); i++)
        {
            char c = AUC.charAt(i);

            // The entries of AUC are space-separated
            if (c == ' ') continue;

            // Bullet & BPMF checks use the original character (not the punctuation-converted one).
            //
            // The first digit of the bullet number is what gets displayed.  The digit must only
            // be computed for a non-zero bullet number: the text "0" starts with the character
            // '0', which is not (char) 0, and would flag every single row as a bullet.
            // NOTE(review): assumes bulletListAUC returns 0 for a non-bullet character, which is
            // what the 'bl != 0' tests below imply - confirm.
            int     bulletNum   = bulletListAUC(c);
            char    bl          = (bulletNum == 0)
                                    ? (char) 0
                                    : Integer.toString(bulletNum).charAt(0);
            boolean bpmf        = isBPMFAUC(c);

            // The "translated character" is where, for example '〗' ==> ']'
            char    newC        = punctuationAUC(c);

            // These are used for building the <TD> entries.
            // endOfPhraseAUC is documented to return 0 for plain ASCII punctuation, so it has
            // to be given the original character rather than the already-converted one.
            // NOTE(review): endOfSentenceAUC still receives the converted character, as before;
            // if it shares the "AUC-only" contract it should receive 'c' as well - confirm.
            char    q           = quoteAUC(newC);
            char    es          = endOfSentenceAUC(newC);
            char    ep          = endOfPhraseAUC(c);
            char    com         = commaAUC(newC);
            char    br          = bracketAUC(newC);
            char    p           = parenAUC(newC);

            // The ASCII stand-in shown in the fourth column
            char    ascii       = newC;
            if (ascii   == 0)   ascii = alphaNumericAUC(c);
            if (bl      != 0)   ascii = bl;
            if (bpmf)           ascii = c;
            if (ascii   == 0)   ascii = 'x';

            // The stand-in may be '<', '>' or '&', which must not be emitted raw into HTML
            String asciiCell;
            switch (ascii)
            {
                case '<':   asciiCell = "&lt;";                 break;
                case '>':   asciiCell = "&gt;";                 break;
                case '&':   asciiCell = "&amp;";                break;
                default:    asciiCell = String.valueOf(ascii);  break;
            }

            // Build the HTML Table row
            ret.append("<TR>");

            ret.append("<TD>").append(c).append("</TD>");
            ret.append("<TD>").append((int) c).append("</TD>");
            ret.append("<TD>0x").append(Integer.toHexString(c).toUpperCase()).append("</TD>");
            ret.append("<TD>").append(asciiCell).append("</TD>");

            ret.append("<TD>").append((q    == 0) ? "" : "Quote")       .append("</TD>");
            ret.append("<TD>").append((es   == 0) ? "" : "Sentence")    .append("</TD>");
            ret.append("<TD>").append((ep   == 0) ? "" : "Phrase")      .append("</TD>");
            ret.append("<TD>").append((com  == 0) ? "" : "Comma")       .append("</TD>");
            ret.append("<TD>").append((br   == 0) ? "" : "Bracket")     .append("</TD>");
            ret.append("<TD>").append((p    == 0) ? "" : "Paren")       .append("</TD>");
            ret.append("<TD>").append((bl   == 0) ? "" : "Bullet")      .append("</TD>");
            ret.append("<TD>").append(bpmf ? "BPMF" : "")               .append("</TD>");

            // Every row that is opened has to be closed again
            ret.append("</TR>");
        }

        ret.append("</TABLE>");
        return ret.toString();
    }

    /**
     * Measures the run of leading alphabetic {@code ('a' ... 'z')} and numeric
     * {@code ('0' ... '9')} characters at the front of a Chinese {@code String}.
     *
     * <BR /><BR /><B>CHANGED:</B> 2018.09.24 - Periods and commas are considered to be part of
     * the "Leading Letters and Numbers".  This method accepts every {@code '.'} and {@code ','}
     * it meets; it does not look at the neighbouring characters.
     *
     * @param chineseSentence A sentence that may or may not have leading letters &amp; numbers.
     * 
     * @return the {@code String}-index of the first character that is neither accepted by
     * {@code isAlphaNumeric(char)}, nor a period, nor a comma.  When every character qualifies,
     * the length of the {@code String} is returned.
     * 
     * <BR /><BR /><B>NOTE:</B> white-space is not a period or a comma, so - provided that
     * {@code isAlphaNumeric(char)} rejects it - the position of the first white-space character
     * ends the run.
     * 
     * @see #isAlphaNumeric(char)
     */
    public static int countLeadingLettersAndNumbers(String chineseSentence)
    {
        final int   len = chineseSentence.length();
        int         pos = 0;

        while (pos < len)
        {
            char    ch      = chineseSentence.charAt(pos);
            boolean leading = isAlphaNumeric(ch) || (ch == '.') || (ch == ',');

            if (! leading) break;

            pos++;
        }

        // Equals 'len' when the loop ran off the end of the String
        return pos;
    }

    /**
     * Replaces higher-Unicode letters and numbers with the lower-level version of the
     * appropriate letter or number.
     *
     * <BR /><BR /><B>SPECIFICALLY:</B>  Every character of the input is passed to
     * {@code alphaNumericAUC(char)}.  Whenever that call returns something other than zero, the
     * returned character takes the place of the original one; all other characters are copied
     * unchanged.
     * 
     * @param s This may or may not have "Alternate UniCode" Characters for letters and numbers.
     * 
     * @return A {@code String} of the same length as {@code 's'}, in which the "alternate"
     * versions of letters and digits have been replaced.
     * 
     * @see #alphaNumericAUC(char)
     */
    public static String convertAnyAUC(String s)
    {
        final int       len = s.length();
        StringBuilder   out = new StringBuilder(len);

        for (int pos = 0; pos < len; pos++)
        {
            char original   = s.charAt(pos);
            char lowerLevel = alphaNumericAUC(original);

            // Zero means "no lower-level equivalent" - keep the character as it is
            out.append((lowerLevel != 0) ? lowerLevel : original);
        }

        return out.toString();
    }
    
    /**
     * Counts the syllables in a "word" of PinYin.  The input {@code String} is expected to not
     * have any spaces!
     *
     * <BR /><BR />
     * <B>NOTE:</B> The number of syllables in a Chinese PinYin "word" identifies the number of
     * Chinese Characters that were used to generate the input <B>PinYin {@code String}</B>.
     *
     * <BR /><BR />The count is the sum of two passes:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>One for every tone-vowel, every digit, every period and every comma in the word.</LI>
     * <LI>One for every group of adjacent vowels that contains no tone-vowel at all (a "clear
     *     tone" syllable).</LI>
     * </UL>
     *
     * <BR /><BR /><B>CHANGED:</B> 2018.09.24 - Periods and commas are counted, so that numbers
     * such as "5.0" are accounted for.  If the {@code String} "5.0" were passed as the "word"
     * parameter, the result should be 3!
     * 
     * @param word A word in the "PinYin" format. (罗马拼音)
     * 
     * @param DOUT This must implement {@code java.lang.Appendable}.  Diagnostic text describing
     * the intermediate steps is appended to it.
     * 
     * @return the number of syllables (specifically: Chinese Characters) in the input word.
     * 
     * @throws IOException The interface {@code java.lang.Appendable} mandates that the 
     * {@code IOException} must be treated as a checked exception for all output operations.  
     * Therefore {@code IOException} is a required exception in this method's throws clause.
     */
    public static int countSyllablesAndNonChinese(String word, Appendable DOUT)
        throws IOException
    {
        int total = 0;

        // Pass 1: Tone-Vowels, Numbers, periods & commas always correspond to a character
        for (int pos = 0; pos < word.length(); pos++)
        {
            char    ch          = word.charAt(pos);
            boolean oneForOne   =
                ZH.isToneVowel(ch) || ZH.isNumber(ch) || (ch == '.') || (ch == ',');

            if (oneForOne) total++;
        }

        // Pass 2: look for vowel-groups that don't contain a tone ==> "clear tone"
        String vowelsOnly = word;

        DOUT.append("[" + vowelsOnly + "] - ");

        // Blank out everything that is not a vowel, leaving the vowel-groups space-separated
        for (int pos = 0; pos < vowelsOnly.length(); pos++)
        {
            char ch = vowelsOnly.charAt(pos);

            if (! (ZH.isRegVowel(ch) || ZH.isToneVowel(ch)))
                vowelsOnly = StringParse.setChar(vowelsOnly, pos, ' ');
        }

        DOUT.append("after erasing non-vowels [" + vowelsOnly + "]\n");

        String[] syllables = vowelsOnly.trim().split(" ");

        DOUT.append("Syllables are:");
        for (String syllable : syllables)
            DOUT.append("[" + syllable + "]");
        DOUT.append("\n");

        for (String rawSyllable : syllables)
        {
            String syllable = rawSyllable.trim();

            // split(" ") produces empty entries wherever several spaces were adjacent
            if (syllable.isEmpty()) continue;

            boolean hasTone = false;

            for (int pos = 0; (pos < syllable.length()) && (! hasTone); pos++)
                hasTone = ZH.isToneVowel(syllable.charAt(pos));

            // A group with a tone-vowel has already been counted by the first pass
            if (hasTone) continue;

            total++;
            DOUT.append("NOTE: *** FOUND CLEAR TONE\n");
        }

        return total;
    }

    /**
     * Deletes all punctuation &amp; non-character symbols.  The {@code String} that is returned
     * will be shortened by precisely the number of punctuation characters that were contained
     * by the input {@code String}.
     *
     * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between 
     * number/digits are not removed!  The beginning and the end of the {@code String} are
     * treated like a digit for this purpose, so {@code "5."} keeps its period.
     *
     * <BR /><BR /><B>NOTE:</B> "AUC" letters and digits are expected to have been converted
     * before this method is called; they are not given any special treatment here.
     *
     * @param s An input {@code String} (in Mandarin - 普通话)
     *
     * @return a new {@code String} containing, in their original order, only the characters of
     * {@code 's'} that satisfy the following test:
     *
     * <BR /><DIV CLASS="SNIP">{@code
     * isChinese(c) || isAlphaNumeric(c) || (c is a period or comma between two digits)
     * }</DIV>
     */
    public static String delAllPunctuationCHINESE(String s)
    {
        final int       len     = s.length();
        StringBuilder   kept    = new StringBuilder(len);

        for (int pos = 0; pos < len; pos++)
        {
            char c = s.charAt(pos);

            // Check for things like 5.0 or 1,120,987 - SPECIFICALLY Comma's and Period's situated
            // directly between 2 numbers.  A String boundary counts as a number here.
            boolean numericSeparator =
                    ((c == '.') || (c == ','))
                &&  ((pos == 0)         || isNumber(s.charAt(pos - 1)))
                &&  ((pos + 1 == len)   || isNumber(s.charAt(pos + 1)));

            if (numericSeparator || isChinese(c) || isAlphaNumeric(c)) kept.append(c);
        }

        // The filtered characters are returned - NOT the untouched input String
        return kept.toString();
    }

    /**
     * Blanks out all punctuation &amp; non-character symbols in a {@code String} of PinYin.
     * The returned {@code String} has the same length as the input, but every location where
     * punctuation existed holds a space character instead.
     * 
     * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between
     * number/digits are not removed!  The beginning and the end of the {@code String} are
     * treated like a digit for this purpose.
     *
     * @param s An input {@code String} in 罗马拼音
     *
     * @return A {@code String} that is the same as the input {@code String} - except that each
     * character failing the following test has been replaced by {@code ' '}:
     *
     * <BR /><DIV CLASS="SNIP">{@code 
     * isAlphaNumeric(c) || (alphaNumericAUC(c) != 0) || (c is a period or comma between digits)
     * }</DIV>
     */
    public static String delAllPunctuationPINYIN(String s)
    {
        char[]      chars   = s.toCharArray();
        final int   last    = chars.length - 1;

        // The array is edited in place, so a look-behind sees the already-blanked neighbour
        for (int pos = 0; pos <= last; pos++)
        {
            char ch = chars[pos];

            // Letters and digits (regular or "AUC") are left alone
            if (isAlphaNumeric(ch) || (alphaNumericAUC(ch) != 0)) continue;

            // Things like 5.0 or 1,120,987 - Comma's and Period's situated directly between
            // 2 numbers are left alone as well.
            boolean separator = (ch == '.') || (ch == ',');

            if (separator)
                if ((pos == 0) || isNumber(chars[pos - 1]))
                    if ((pos == last) || isNumber(chars[pos + 1]))
                        continue;

            chars[pos] = ' ';
        }

        return new String(chars);
    }

    // ****************************************************************************************
    // Constants
    // ****************************************************************************************

    /** Special Quotation Mark, left-side (U+201C) */
    public static final char CONSTSpecialQuoteLeft = (char) 0x201C;

    /** Special Quotation Mark, right-side (U+201D) */
    public static final char CONSTSpecialQuoteRight = (char) 0x201D;

    /**
     * <B STYLE="color: red;">GTPPE: Google Translate Punctuation Pronunciation Equivalent</B>
     * This searches through a {@code String} to find the location of the "equivalent punctuation
     * mark"
     * 
     * @param s The input {@code String}, expected to be the result of a <B>GCS TS</B> query.  This
     * function is totally useless for any {@code Pronunciation String} that hasn't been obtained
     * from <B>GCS TS</B>.
     *
     * <BR /><BR /><B>NOTE:</B> The input {@code String} is intended to be in "PinYin"
     *
     * @param c The original punctuation character to look for...  Generally, this is used to
     * search for higher-level <B>UTF-8 {@code chars}</B> that have been "down-converted" by <B>GCS
     * TS</B>
     *
     * @return the {@code indexOf()} of the character in the input {@code String}, or {@code -1}
     * when it does not occur there.  The actual character is not looked for, BUT RATHER, the
     * <B>Google Cloud Server Translation Services</B> equivalent character.  Specifically,
     * {@code GCSTS} has a "substitute punctuation" for many higher-level <B>UTF-8</B> and
     * <B>UniCode</B> chars.  Characters without a substitute (the plain ASCII marks among them)
     * are searched for as themselves.
     *
     * @throws IllegalArgumentException if {@code 'c'} is not one of the punctuation characters
     * known to this method.
     */
    public static int GTPPEIndexOf(String s, char c)
    {
        char equivalent = gtppeEquivalent(c);

        // A library method must not terminate the JVM (and certainly not with the "success"
        // exit-code 0) because of a bad argument: report the problem to the caller instead.
        if (equivalent == 0) throw new IllegalArgumentException(
            "ZH.GTPPEIndexOf(String s, char c): character not found: '" + c + "' " +
            "(0x" + Integer.toHexString(c).toUpperCase() + ")"
        );

        return s.indexOf(equivalent);
    }

    // Maps a punctuation character onto the character to search for in the PinYin String.
    // Returns (char) 0 when the character is not a known punctuation mark.
    private static char gtppeEquivalent(char c)
    {
        switch (c)
        {
            // ------------------------------------------------------------------------------
            // Characters that are searched for "as themselves"
            // ------------------------------------------------------------------------------

            case 0x2236:    // ratio
            case 0x003A:    // natural colon
            case 0x002C:    // natural comma
            case 0x25CB:    // white circle
            case 0x25CF:    // black circle
            case 0x002E:    // natural period
            case 0x003F:    // natural question-mark
            case 0x0021:    // natural exclamation
            case 0x2018:    // left single quotation mark
            case 0x2019:    // right single quotation mark
            case 0x2032:    // prime
            case 0x0027:    // natural single-quote
            case 0x00A8:    // diaeresis
            case 0x3003:    // ditto mark
            case 0x201C:    // left double quotation mark
            case 0x201D:    // right double quotation mark
            case 0x2033:    // double prime
            case 0x0022:    // natural double-quote
            case 0x005B:    // natural left square bracket
            case 0x005D:    // natural right square bracket
            case 0x3016:    // left white lenticular bracket
            case 0x3017:    // right white lenticular bracket
            case 0x0028:    // natural left parenthesis
            case 0x0029:    // natural right parenthesis
            case 0x3014:    // left tortoise shell bracket
            case 0x3015:    // right tortoise shell bracket
                return c;

            // ------------------------------------------------------------------------------
            // Characters that have a "substitute punctuation" mark
            // ------------------------------------------------------------------------------

            // colon
            case 0xFF1A:    // full-width colon
                return ':';

            // commas
            case 0x3001:    // ideographic comma
            case 0xFF0C:    // full-width comma
                return ',';

            // periods
            case 0x3002:    // ideographic full stop
            case 0xFF0E:    // full-width full stop
                return '.';

            // Exclamation & Question
            case 0xFF1F:    // full-width question-mark
                return '?';

            case 0xFF01:    // full-width exclamation mark
                return '!';

            // single-quotes
            case 0xFF07:    // full-width apostrophe
            case 0x300D:    // right corner bracket
                return '\'';

            case 0xFF40:    // full-width grave accent
            case 0x300C:    // left corner bracket
                return '`';

            // double-quotes
            case 0xFF02:    // full-width quotation mark
                return '\"';

            case 0x300A:    // left double angle bracket
            case 0x300E:    // left white corner bracket
                return CONSTSpecialQuoteLeft;

            case 0x300B:    // right double angle bracket
            case 0x300F:    // right white corner bracket
                return CONSTSpecialQuoteRight;

            // Brackets
            case 0xFF3B:    // full-width left square bracket
            case 0x3010:    // left black lenticular bracket
                return '[';

            case 0xFF3D:    // full-width right square bracket
            case 0x3011:    // right black lenticular bracket
                return ']';

            // Parenthesis
            case 0xFF08:    // full-width left parenthesis
                return '(';

            case 0xFF09:    // full-width right parenthesis
                return ')';

            default:
                return (char) 0;
        }
    }
}