001package Torello.Languages; 002 003import java.io.*; 004import java.util.*; 005import java.util.regex.*; 006import Torello.Java.*; 007 008/** 009 * ZH (Mandarin Chinese) Many tools for parsing constructs from Mandarin News & other 010 * Web-Sites. 011 * 012 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ZH> 013 */ 014@Torello.JavaDoc.StaticFunctional 015public class ZH 016{ 017 private ZH() { } 018 019 static void main1(String[] argv) throws IOException 020 { 021 StringBuilder sb = new StringBuilder(); 022 sb.append("<HTML>\n<HEAD>\n<TITLE>AUC Test</TITLE>\n"); 023 sb.append("<META http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n"); 024 sb.append("<BODY>\n"); 025 sb.append(testAUC() + "\n<BR />\n"); 026 sb.append("</BODY>\n</HTML>\n"); 027 FileRW.writeFile(sb, "out.html"); 028 } 029 030 /* 031 static void main(String argv[]) throws IOException 032 { 033 for (int i=0; i < H2CV.length; i++) 034 System.out.print(H2CV[i] + ":" + ((char) H2CV[i]) + ":" + CV2RV[i] + ",\t"); 035 String s = "À, É, à, á, è, é, ì, í, ò, ó, ù, ú, ü, Ā, ā, ē, ě, ī, ō, ū, ǎ, ǐ, ǒ, ǔ"; 036 System.out.println(s); 037 System.out.println(toneVowelsToRegularVowels(s)); 038 } 039 */ 040 041 /* 042 * This is the list of the ASCII/UTF-8 character codes for the vowels with tone symbols on 043 * top of them. Google Translate returns many of the PinYin Romanization results as: <BR /> 044 * ū <B><I>INSTEAD OF</B></i> the character as a UTF-8 character. Essentially, this 045 * array contains a list of those character codes. 
046 */ 047 private static final int[] H2CV = { 39, 192, 201, 224, 225, 232, 233, 236, 237, 242, 243, 048 249, 250, 252, 256, 257, 275, 283, 299, 333, 363, 462, 464, 466, 468, 474, 476 }; 049 050 private static final char[] CV = { '\'', 'À', 'É', 'à', 'á', 'è', 'é', 'ì', 'í', 'ò', 'ó', 051 'ù', 'ú', 'ü', 'Ā', 'ā', 'ē', 'ě', 'ī', 'ō', 'ū', 'ǎ', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'ǜ' }; 052 053 private static final char[] CV2RV = { '\'', 'A', 'E', 'a', 'a', 'e', 'e', 'i', 'i', 'o', 'o', 054 'u', 'u', 'u', 'A', 'a', 'e', 'e', 'i', 'o', 'u', 'a', 'i', 'o', 'u', 'u', 'u' }; 055 056 private static final Pattern P1 = Pattern.compile("&#(\\d+);", Pattern.CASE_INSENSITIVE); 057 058 /** 059 * This makes the problems of dealing with the tone/accent marks above vowels in Chinese 060 * Pin-Yin easier. These convert vowels with tones over them into regular vowels. This 061 * can be useful for certain {@code String} operations, although clearly the original meaning 062 * of the word would be decimated. 063 * 064 * @param c any character from <B>ASCII / UTF-8 / UniCode</B> Basic Multi Lingual Plane. 065 * 066 * @return if this is a {@code UTF-8} character that is an accented vowel, the un-accented 067 * version of that vowel is returned. If this is not a PinYin symbol for a tone-vowel, 068 * {@code ASCII 0} is returned. 069 * 070 * @see #toneVowelsToRegularVowels(String) 071 */ 072 public static char toneVowelToRegularVowel(char c) 073 { 074 for (int i=0; i < CV.length; i++) if (CV[i] == c) return CV2RV[i]; 075 return (char) 0; 076 } 077 078 /** 079 * Counts the number of tone vowels in a <B>PinYin</B> {@code String}. 
080 * 081 * @param pinYinStr A {@code String}, usually generated by <B>Google Translate</B>, (and 082 * scraped from Google Translate) that contains <B>PinYin.</B> 083 * 084 * @return The number of Mandarin Chinese Pin-Yin "Tone Vowels" 085 */ 086 public static int countToneVowels(String pinYinStr) 087 { 088 int count=0; 089 090 TOP: 091 for (int i = pinYinStr.length()-1; i >= 0; i--) 092 for (int j=0; j < CV.length; j++) 093 if (pinYinStr.charAt(i) == CV[j]) 094 { count++; continue TOP; } 095 096 return count; 097 } 098 099 /** 100 * This performs a conversion of all vowels in a {@code String} from those with tones over them 101 * to the normal (un-accented) equivalent. It uses the single-character-version of the 102 * synonymously named method 103 * 104 * @param s any {@code java.lang.String} containing Mandarin Romanizations. 105 * 106 * @return a {@code String} with all accented vowel's converted to regular vowels. 107 * 108 * @see #toneVowelToRegularVowel(char) 109 */ 110 public static String toneVowelsToRegularVowels(String s) 111 { 112 int strlen = s.length(); 113 StringBuilder sb = new StringBuilder(s.length()); 114 char c; 115 116 for (int i=0; i < strlen; i++) 117 if ((c = toneVowelToRegularVowel(s.charAt(i))) != 0) 118 sb.append(c); 119 else 120 sb.append(s.charAt(i)); 121 122 return sb.toString(); 123 } 124 125 /** 126 * Google Translate returns some text encoded as {@code "&#num;" (the "ord(c)").} This is also 127 * called {@code HTML Escaped Code} - because instead of actual <B>ASCII/UTF8</B> characters 128 * themselves, their "Ord" are returned - surrounded by the usual <I>HTML Escape Character 129 * Sequence</I> &#num; This method does the {@code chr(html-hex-escape-code);} and replaces 130 * the {@code escape-sequence} (which again is &#NUM;) with the actual ASCII character. 131 * 132 * <BR /><BR /><B>NOTE:</B> all of these are for "Chinese Tone Vowel" ASCII - The Google 133 * Translate module uses this method quite a bit. 
Here are a few examples of 134 * HTML-Escape-Sequence and the corresponding ASCII. 135 * 136 * <BR /><TABLE CLASS=JDBriefTable> 137 * <TR><TH>HTML-Escaped</TH><TH>ASCII/UTF-8 Character</TH></TR> 138 * <TR><TD>&#192;</TD><TD>À</TD></TR> 139 * <TR><TD>&#225;</TD><TD>á</TD></TR> 140 * <TR><TD>&#283;</TD><TD>ě</TD></TR> 141 * <TR><TD>&#363;</TD><TD>ū</TD></TR> 142 * <TR><TD>&#474;</TD><TD>ǚ</TD></TR> 143 * <TR><TD COLSPAN="2">... see array below for list</TD></TR> 144 * </TABLE> 145 * 146 * <BR /><BR /><B>NOTE:</B> {@code HTML2UTF8(String)} ==> This method does the exact same 147 * thing - but does not limit the characters to be converted to only Chinese Tone Vowels. This 148 * method only converts HTML-Escaped-Characters from this list: 149 * 150 * <BR /><BR /><CODE> 151 * private static final int[] H2CV = { 39, 192, 201, 224, 225, 232, 233, 236, 237, 242, 243, <BR /> 152 * 249, 250, 252, 256, 257, 275, 283, 299, 333, 363, 462, 464, 466, 468, 474, 476 }; <BR /> 153 * </CODE> 154 * 155 * @see #HTML2UTF8(String) 156 */ 157 public static String HTML2ChineseVowels(String s) 158 { 159 for (int i=0; i < H2CV.length; i++) 160 s = s.replaceAll("&#" + H2CV[i] + ";", "" + (char) H2CV[i]); 161 162 return s; 163 } 164 165 /** 166 * NOTE: This does the same as {@code HTML2ChineseVowels(String)} <B><I>EXCEPT</B></I> that it 167 * converts <B><I>ANY</B></I> HTML string that has been encoded as: {@code &#NUM;} - not 168 * just the characters having accents and corresponding to Chinese Tone Vowels. 169 * 170 * @see #HTML2ChineseVowels(String) 171 */ 172 public static String HTML2UTF8(String s) 173 { 174 // Build the list of UTF8/ASCII character values (as Ord(c) / int) first. 175 HashSet<Integer> utfList = new HashSet<Integer>(); 176 Matcher m = P1.matcher(s); 177 178 while (m.find()) utfList.add(Integer.parseInt(m.group(1))); 179 180 // Now convert them. 
181 for (Integer i : utfList) s = s.replaceAll("&#" + i.toString() + ";", "" + ((char) i.intValue())); 182 183 return s; 184 } 185 186 /** 187 * This is used to convert a Chinese Character into a full {@code String} that includes the 188 * <B>UTF-8</B> code represented as a {@code HEXADECIMAL} number and a {@code decimal} number 189 * 190 * @param c any ASCII/UniCode/UTF-8 char - but, generally, expected to be a 191 * "Chinese Character." 192 * 193 * <BR /><BR /><B>NOTE:</B> The choice for parameter {@code char c} has no actual constraints 194 * on its input value. 195 * 196 * @return A {@code String} of this format: {@code 掭(0x63AD, 25517)} 197 */ 198 public static String formatUTF8Chinese(char c) 199 { return c + "(0x" + String.format("%x", ((int) c)).toUpperCase() + ", " + ((int) c) + ")"; } 200 201 202 /** 203 * Helper function - checks if this is a character in the UTF-8 & ASCII ranges that contain 204 * Mandarin Chinese characters. This is not guaranteed to be accurate - some non-Chinese 205 * Japanese characters exist in this range. For the precise definition of what this function 206 * actually does, see the ranges printed below. 207 * 208 * <BR /><BR />COPIED FROM*** <BR /> 209 * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tbluni.php?page=0" TARGET="_blank"> 210 * http://www.khngai.com/chinese/charmap/tbluni.php?page=0</A></CODE></B> 211 * 212 * <BR /><BR />AND: {@code ((c >= 0x4E00) && (c <= 0x9FFF)) } 213 * 214 * <BR /><BR />COPIED FROM*** <BR /> 215 * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tblgb.php?page=1" TARGET="_blank"> 216 * http://www.khngai.com/chinese/charmap/tblgb.php?page=1</A></CODE></B> 217 * 218 * @param c any UTF-8, ASCII or UniCode character available. 
219 * 220 * @return {@code TRUE} if the input character {@code 'c'} is in the UTF-8/UniCode range 221 * for Chinese Characters 222 */ 223 public static boolean isChinese(char c) 224 { 225 if ((c >= 0x4E00) && (c <= 0x9FFF)) return true; 226 if ((c >= 0xB0A0) && (c <= 0xBFFF)) return true; 227 if ((c >= 0xC0A0) && (c <= 0xCFFF)) return true; 228 if ((c >= 0xD0A0) && (c <= 0xDFFF)) return true; 229 if ((c >= 0xE0A0) && (c <= 0xEFFF)) return true; 230 if ((c >= 0xF0A0) && (c <= 0xF7FF)) return true; 231 232 return false; 233 } 234 235 /** 236 * Checks a {@code char} is something that is not {@code Alpha Numeric} or {@code White Space} 237 * 238 * @param c any UTF-8, ASCII or UniCode character available. 239 * 240 * @return {@code ((!isAlphaNumeric(c)) && (!isSpace(c)));} 241 */ 242 public static boolean isOther(char c) 243 { return ((!isAlphaNumeric(c)) && (!isSpace(c))); } 244 245 /** 246 * Checks if a {@code char} is Alpha Numberic. 247 * 248 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 249 * 250 * @return {@code (isAlpha(c) || isNumber(c));} 251 */ 252 public static boolean isAlphaNumeric(char c) 253 { return (isAlpha(c) || isNumber(c)); } 254 255 /** 256 * Checks if a {@code char} is Alphabetic. 257 * 258 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 259 * 260 * @return {@code (isToneVowel(c) || isRegVowel(c) || isRegLetter(c));} 261 */ 262 public static boolean isAlpha(char c) 263 { return (isToneVowel(c) || isRegVowel(c) || isRegLetter(c)); } 264 265 /** 266 * This is a helper function for the Mandarin Chinese accented vowel symbols in 267 * {@code UTF-8, ASCII} and {@code UniCode}. The exact character code numbers are printed 268 * below. 269 * 270 * <BR /><BR /><B>NOTE:</B> In 罗马拼音 (Pin-Yin Romanization), there are a few symbols that 271 * should never come up - at least as the software pertains to 罗马拼音-results provided by 272 * <B>Google Cloud Server Translation API</B> {@code (GCS-TS/TAPI)}. 
* This is because
     * <B><I>NO</I></B> word in Pin-Yin ever starts with the letters I or U, or the U with an
     * umlaut - <B><I>so</I></B> - capitalized versions of these letters ought to never occur -
     * unless the entire PinYin were capitalized - which is something GCSTS never does.
     *
     * @param c any UTF-8, ASCII or UniCode character available.
     *
     * @return {@code TRUE} if the input character {@code 'c'} is one of the following:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Simple ASCII</TH><TH>UTF-8 Tone Vowel</TH></TR>
     * <TR><TD>a</TD><TD> ā (257), á (225), ǎ (462), à (224)</TD></TR>
     * <TR><TD>e</TD><TD> ē (275), é (233), ě (283), è (232)</TD></TR>
     * <TR><TD>i</TD><TD> ī (299), í (237), ǐ (464), ì (236)</TD></TR>
     * <TR><TD>o</TD><TD> ō (333), ó (243), ǒ (466), ò (242)</TD></TR>
     * <TR><TD>u</TD><TD> ū (363), ú (250), ǔ (468), ù (249)</TD></TR>
     * <TR><TD>u</TD><TD> ǖ (470), ǘ (472), ǚ (474), ǜ (476)</TD></TR>
     * <TR><TD>A</TD><TD> Ā (256), Á (193), Ǎ (461), À (192)</TD></TR>
     * <TR><TD>E</TD><TD> Ē (274), É (201), Ě (282), È (200)</TD></TR>
     * <TR><TD>O</TD><TD> Ō (332), Ó (211), Ǒ (465), Ò (210)</TD></TR>
     * </TABLE>
     *
     * <BR />In Mandarin Chinese, PinYin-words cannot start with these letters below.
     * Therefore it would be highly unlikely to see a "capitalized" version of these tone-vowels.
296 * 297 * <BR /><BR /><TABLE CLASS=JDBriefTable> 298 * <TR><TH>Simple ASCII</TH><TH>UTF-8 Tone Vowel</TH></TR> 299 * <TR><TD>I</TD><TD>Ī (298), Í (205), (there are 2: Ǐ (463), Ĭ (300)), Ì (204)</TD></TR> 300 * <TR><TD>U</TD><TD>Ū (362), Ú (218), Ŭ (364), Ù (217)</TD></TR> 301 * <TR><TD>U</TD><TD>(Ü (220) -no tone): Ǖ (469), Ǘ (471), Ǘ (473), Ǜ (475)</TD></TR> 302 * </TABLE> 303 */ 304 public static boolean isToneVowel(char c) 305 { 306 // A, ā 257, á 225, ǎ 462, à 224 307 if ((c == 257) || (c == 225) || (c == 462) || (c == 224)) return true; 308 309 // E, ē 275, é 233, ě 283, è 232 310 if ((c == 275) || (c == 233) || (c == 283) || (c == 232)) return true; 311 312 // I, ī 299, í 237, ǐ 464, ì 236 313 if ((c == 299) || (c == 237) || (c == 464) || (c == 236)) return true; 314 315 // O, ō 333, ó 243, ǒ 466, ò 242 316 if ((c == 333) || (c == 243) || (c == 466) || (c == 242)) return true; 317 318 // U, ū 363, ú 250, ǔ 468, ù 249 319 if ((c == 363) || (c == 250) || (c == 468) || (c == 249)) return true; 320 321 // U, ǖ 470, ǘ 472, ǚ 474, ǜ 476 322 if ((c == 470) || (c == 472) || (c == 474) || (c == 476)) return true; 323 324 // ******* 325 // Capital vowels with tone symbols 326 327 // Ā 256, Á 193, Ǎ 461, À 192 328 if ((c == 256) || (c == 193) || (c == 461) || (c == 192)) return true; 329 330 // Ē 274, É 201, Ě 282, È 200 331 if ((c == 274) || (c == 201) || (c == 282) || (c == 200)) return true; 332 333 // Ō 332, Ó 211, Ǒ 465, Ò 210 334 if ((c == 332) || (c == 211) || (c == 465) || (c == 210)) return true; 335 336 // Not sure about these - found them on a website 337 // ********************************************** 338 // 1234 5678 9ABC DEF 339 // A8A0 āáǎà ēéěè īíǐì ōóǒ 340 // 341 // 0 1234 5678 9 A 342 // A8B0 ò ūúǔù ǖǘǚǜ ü ê 343 // ********************************************** 344 if ((c >= 0xA8A1) && (c <= 0xA8Ba)) return true; 345 346 return false; 347 } 348 349 /** 350 * Checks that a character is a standard vowel. 
351 * 352 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 353 * 354 * @return {@code TRUE} if the input character {@code 'c'} EQUALS one of these ten letters: 355 * <B>a, e, i, o, u, A, E, I, O, U</B> 356 */ 357 public static boolean isRegVowel(char c) 358 { 359 // The normal vowels 360 361 // a 97, A 65 362 if ((c == 97) || (c == 65)) return true; 363 364 // e 101, E 69 365 if ((c == 101) || (c == 69)) return true; 366 367 // i 105, I 73 368 if ((c == 105) || (c == 73)) return true; 369 370 // o 111, O 79 371 if ((c == 111) || (c == 79)) return true; 372 373 // u 117, U 85 374 if ((c == 117) || (c == 85)) return true; 375 376 return false; 377 } 378 379 /** 380 * Regular Letters Include: {@code 'A' ... 'Z'} (65 - 90), {@code 'a' ... 'z'} (97 - 122) 381 * 382 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 383 * 384 * @return {@code TRUE} if the input character {@code 'c'} is any letter in lower-level 385 * <B>ASCII</B> (and not any of the AUC). 386 */ 387 public static boolean isRegLetter(char c) 388 { return ((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)); } 389 390 /** 391 * Regular Numbers Include: {@code '0' ... '9'} 392 * 393 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 394 * 395 * @return {@code TRUE} if the input character {@code 'c'} is in the range of ASCII 396 * {@code '0' ... '9'} (not any of the AUC) 397 */ 398 public static boolean isNumber(char c) 399 { return ((c >= 48) && (c <= 57)); } 400 401 /** 402 * Checks for WhiteSpace: {@code '\t', '\n', '\r', ' '} 403 * 404 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available. 405 * 406 * @return {@code TRUE} if the input character {@code 'c'} is a whitespace character code from 407 * the above list 408 */ 409 public static boolean isSpace(char c) 410 { return ((c == 9) || (c == 12) || (c == 15) || (c == 32)); } 411 412 413 /** 414 * Bullet List characters in upper {@code UniCode / UTF-8}. 
These characters exist in 415 * <B>UTF-8</B> - and they are occasionally used in documents found on Chinese News Websites. 416 * They are all "bullet-list" points. An integer is returned for each of these, that is equal 417 * to the number represented by the UTF-8/UniCode character here. 418 * 419 * <BR /><BR /><UL CLASS=JDUL> 420 * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f</LI> 421 * <LI>N ⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖</LI> 422 * <LI>⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾</LI> 423 * <LI>⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦ </LI> 424 * <LI>⑧ ⑨ ⑩ N N ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩ N</LI> 425 * <LI>N Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ</LI> 426 * </UL> 427 * 428 * @param c any character as input 429 * 430 * @return The number equivalent represented by this bullet point. 431 */ 432 public static int bulletListAUC(char c) 433 { 434 // ⒈ ==> ⒛ 435 if ((c >= 0x2488) && (c <= 0x249B)) return ((int) c) - 0x2487; 436 437 // ⑴ ==> ⒇ 438 if ((c >= 0x2474) && (c <= 0x2487)) return ((int) c) - 0x2473; 439 440 // ① ==> ⑩ 441 if ((c >= 0x2460) && (c <= 0x2469)) return ((int) c) - 0x245F; 442 443 // ㈠ ==> ㈩ 444 if ((c >= 0x3220) && (c <= 0x3229)) return ((int) c) - 0x321F; 445 446 // Ⅰ ==> Ⅻ 447 if ((c >= 0x2160) && (c <= 0x216B)) return ((int) c) - 0x215F; 448 449 return 0; 450 } 451 452 /** 453 * Alpha-Numeric character code from upper UniCode / UTF-8 454 * 455 * <BR /><BR />These characters exist in <B>UTF-8</B> - but they ARE NOT the usual ASCII 456 * characters for the letters {@code 'A' ... 'Z'} or the numbers {@code '0' ... '9'} They, 457 * however, are sometimes found in documents on Chinese News Websites, etc. 458 * 459 * <BR /><BR />Copied from:<BR /> 460 * <B><CODE><A HREF="http://www.khngai.com/chinese/charmap/tblgb.php?page=0" TARGET="_blank"> 461 * http://www.khngai.com/chinese/charmap/tblgb.php?page=0</A></CODE></B> 462 * 463 * <BR /><BR /><UL CLASS=JDUL> 464 * <LI>0 1 2 3 4 5 6 7 8 9 a b c d e f </LI> 465 * <LI>! " # ¥ % & ' ( ) * + , - . 
/</LI> 466 * <LI>0 1 2 3 4 5 6 7 8 9 : ; < = > ?</LI> 467 * <LI>@ A B C D E F G H I J K L M N O</LI> 468 * <LI>P Q R S T U V W X Y Z [ \ ] ^ _</LI> 469 * <LI>a b c d e f g h i j k l m n o</LI> 470 * <LI>p q r s t u v w x y z { | }  ̄</LI> 471 * </UL> 472 * 473 * @param c any character as input 474 * 475 * @return the "lower-level-ASCII" version of that character. 476 */ 477 public static char alphaNumericAUC(char c) 478 { 479 // ASCII 'A' is 65 480 if ((c > 0xFF20) && (c < 0xFF3B)) return (char) (65 + (c - 0xFF21)); 481 482 // ASCII 'a' is 97 483 if ((c > 0xFF40) && (c < 0xFF5B)) return (char) (97 + (c - 0xFF41)); 484 485 // ASCII '0' is 48 486 if ((c >= 0xFF10) && (c <= 0xFF1A)) return (char) (48 + (c - 0xFF10)); 487 488 return 0; 489 } 490 491 /** 492 * This method, {@code punctuationAUC(char)}, converts any characters which are common on many 493 * Mandarin Chinese websites into a lower-level, more typical/normal ASCII equivalent. This is 494 * can be very useful when trying to make sense of brackets, parenthesis, quotes, commas and 495 * other punctuation marks - and quickly convert them into a simple version of the character. 496 * 497 * <BR /><BR />If the input character has an "Alternate Version" in the lower-level-ASCII 498 * range, that lower level ASCII character is returned. If this isn't AUC, ASCII-0 is 499 * returned. 500 * 501 * <BR /><BR /><B>For Instance:</B> 502 * 503 * <BR /><BR /><TABLE CLASS=JDBriefTable> 504 * <TR><TH>Input</TH><TH>Output</TH></TR> 505 * <TR><TD>〖 〗 【 】 </TD><TD> [ ] [ ] </TD></TR> 506 * <TR><TD> 。 ○ ● . </TD><TD>. 
(ASCII-period) </TD></TR> 507 * <TR><TD>¨ 〃 “ ” ″ " </TD><TD>" (ASCII-double-quote) </TD></TR> 508 * <TR><TD>, (ASCII-comma) </TD><TD>ASCII-0 </TD></TR> 509 * <TR><TD>+ (ASCII-plus) </TD><TD>ASCII-0 </TD></TR> 510 * </TABLE> 511 * 512 * @param c any character as input 513 * 514 * @return the "lower-level-ASCII" version of that character 515 * 516 * <BR /><BR /><B>NOTE:</B> ASCII-0 is returned if this is not a valid "AUC" 517 * {@code UTF-8 / UniCode} code! 518 */ 519 public static char punctuationAUC(char c) 520 { 521 // Copied from: 522 // *** http://www.khngai.com/chinese/charmap/tblgb.php?page=0 523 // 524 // 0 2 3 4 5 6 7 8 9 a b c d e f 525 // N N 、 。 · ˉ ˇ ¨ 〃 々 — ~ ‖ … ‘ ’ 526 // “ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】 527 // ± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠ 528 // ⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵ 529 // ∴ ♂ ♀ ° ′ ″ ℃ $ ¤ ¢ £ ‰ § № ☆ ★ 530 // ○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓 531 // 532 // 0 1 2 3 4 5 6 7 8 9 a b c d e f 533 // ! " # ¥ % & ' ( ) * + , - . / 534 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 535 // @ A B C D E F G H I J K L M N O 536 // P Q R S T U V W X Y Z [ \ ] ^ _ 537 // ` a b c d e f g h i j k l m n o 538 // p q r s t u v w x y z { | }  ̄ 539 540 switch (c) 541 { 542 // 、 , 543 case 0x3001: // 、 544 case 0xFF0C: return ','; // , 545 546 // 。 ○ ● . 547 case 0x3002: // 。 548 case 0x25CB: // ○ 549 case 0x25CF: // ● 550 case 0xFF0E: return '.'; // . 
551 552 // ‘ ’ ′ ' ` 553 case 0x2018: // ‘ 554 case 0x2019: // ’ 555 case 0x2032: // ′ 556 case 0xFF07: // ' 557 case 0xFF40: return '\''; // ` 558 559 // ¨ 〃 “ ” ″ " 560 case 0x00A8: // ¨ 561 case 0x3003: // 〃 562 case 0x201C: // “ 563 case 0x201D: // ” 564 case 0x2033: // ″ 565 case 0xFF02: return '\"'; // " 566 567 // 〔 ( 568 case 0x3014: // 〔 569 case 0xFF08: return '('; // ( 570 571 // 〕 ) 572 case 0x3015: // 〕 573 case 0xFF09: return ')'; // ) 574 575 // 〈 < 576 case 0x3008: // 〈 577 case 0xFF1C: return '<'; // < 578 579 // 〉 > 580 case 0x3009: // 〉 581 case 0xFF1E: return '>'; // > 582 583 // 「 『 〖 【 [ 584 case 0x300C: // 「 585 case 0x300E: // 『 586 case 0x3016: // 〖 587 case 0x3010: // 【 588 case 0xFF3B: return '['; // [ 589 590 // 」 』 〗】 ] 591 case 0x300D: // 」 592 case 0x300F: // 』 593 case 0x3017: // 〗 594 case 0x3011: // 】 595 case 0xFF3D: return ']'; // ] 596 597 // ∶ : 598 case 0x2236: // ∶ 599 case 0xFF1A: return ':'; // : 600 601 case 0xFF01: return '!'; // ! 602 case 0xFF03: return '#'; // # 603 case 0xFF05: return '%'; // % 604 case 0xFF06: return '&'; // & 605 case 0xFF1F: return '?'; // ? 606 case 0xFF0F: return '/'; // / 607 case 0xFF3E: return '^'; // ^ 608 case 0xFF5B: return '{'; // { 609 case 0xFF5D: return '}'; // } 610 case 0xFF5C: return '|'; // | 611 case 0xFF0B: return '+'; // + 612 case 0xFF3C: return '\\'; // \ 613 case 0xFF3F: return '_'; // _ 614 615 // — - 616 case 0x2014: // — 617 case 0xFF0D: return '-'; // - 618 619 // 〓 = 620 case 0x3013: // 〓 621 case 0xFF1D: return '='; // = 622 } 623 return 0; 624 } 625 626 /** 627 * Bo Po Mo Fo (注音符號). 628 * 629 * <BR /><BR />This is a popular pronunciation system for Mandarin Characters in Taiwan & 630 * Hong Kong. 
631 * 632 * <BR /><BR /><UL CLASS=JDUL> 633 * <LI>N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ</LI> 634 * <LI>ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ</LI> 635 * <LI>ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N</LI> 636 * </UL> 637 * 638 * @param c any {@code UTF-8, ASCII} or {@code UniCode} character available from 639 * {@code Plane 0}, the <B>Basic Multi-Lingual Plane</B> 640 * 641 * @return {@code TRUE} if the input character {@code 'c'} is in this UTF-8/UniCode range. 642 * The {@code HEXADECIMAL / UTF-8} representation of the <B>'Bo Po Mo Fo'</B> range is: 643 * {@code 0x3110 ... 0x3129}. 644 */ 645 public static boolean isBPMFAUC(char c) 646 { 647 // 0 1 2 3 4 5 6 7 8 9 a b c d e f 648 // N N N N N ㄅ ㄆ ㄇ ㄈ ㄉ ㄊ ㄋ ㄌ ㄍ ㄎ ㄏ 649 // ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ 650 // ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ N N N N N N 651 652 return (c >= 0x3110) && (c <= 0x3129); 653 } 654 655 /** 656 * Checks for end-of-sentence punctuation marks - and "down-converts" them to the simple ASCII 657 * equivalent version of that punctuation mark. If the input character code is not an AUC 658 * version of a typical Mandarin-Chinese end-of-sentence punctuation mark - then ASCII-zero is 659 * returned. 660 * 661 * <BR /><BR /><B>NOTE:</B> if a lower-level-ASCII (normal) punctuation mark is input - then 662 * ASCII-0 is returned. 663 * 664 * <BR /><BR /><B>SPECIFICALLY:</B> with {@code '.' '?'} and {@code '!'} as input to this 665 * function, ASCII-0 will be returned. 666 * 667 * <BR /><BR /><B>USE:</B> {@code endOfSentence(c)} to have those punctuation marks included in 668 * non-zero results. 669 * 670 * @param c any UTF-8, ASCII or UniCode character available. 
671 * 672 * @return if the input character {@code 'c'} is an "alternate UTF-8" version of the 673 * punctuation marks: 674 * 675 * <BR /><BR /><UL CLASS=JDUL> 676 * <LI>a period ('.')</LI> 677 * <LI>an exclamation-point ('!')</LI> 678 * <LI>a question-mark ('?')</LI> 679 * </UL> 680 * 681 * <BR /><BR />Then the output to this method shall be determined by the table below: 682 * 683 * <BR /><BR /><TABLE CLASS=JDBriefTable> 684 * <TR><TH>Input Character</TH><TH>Output Character</TH></TR> 685 * <TR><TD>。 ○ ● .</TD><TD>'.' (normal period)</TD></TR> 686 * <TR><TD>!</TD><TD>'!' (regular exclamation point)</TD></TR> 687 * <TR><TD>?</TD><TD>'?' (usual question mark)</TD></TR> 688 * </TABLE> 689 * 690 * <BR /><BR /> 691 * <B>NOTE:</B> If the normal period, question, or exclamation are passed as input to this 692 * function, this function will return ASCII-0 693 * 694 * @see #endOfSentence(char) 695 */ 696 public static char endOfSentenceAUC(char c) 697 { 698 char auc = punctuationAUC(c); 699 700 if (auc != 0) c = auc; 701 702 // A 'switch' is used instead of an 'if' with a char-cast because it is easier to 703 // read on this page. Only the three characters with ASCII 46, 33, and 63 should 704 // return non-zero values. 705 switch ((int) auc) 706 { 707 // These characters identify an "End of Sentence" marker. 708 case 0x2E: return '.'; // DEC: 46 709 case 0x21: return '!'; // DEC: 33 710 case 0x3F: return '?'; // DEC: 63 711 712 // All other characters should result in a '0' 713 default: return (char) 0; 714 } 715 } 716 717 /** 718 * Checks for end-of-sentence punctuation marks. This Helper function is *almost* identitical 719 * to the {@code endOfSentenceAUC(c)} method. 720 * 721 * <BR /><BR />{@code endOfSentenceAUC(c)} returns ASCII-0 for the usual-punctuation marks - 722 * {@code '.', '!'} and {@code '?'}. 723 * 724 * <BR /><BR />{@code endOfSentence(c)} does not 'leave-out' or 'deny' these lower-level-ASCII 725 * punctuation symbols. 
726 * 727 * @param c any UTF-8, ASCII or UniCode character available. 728 * 729 * @return If the input character {@code 'c'} is a period {@code ('.')}, an exclamation-point 730 * {@code ('!')}, or a question-mark {@code ('?')} - <B><I>or an AUC version of that 731 * punctuation,</B></I> then that punctuation is returned. Otherwise ASCII-0 is returned. 732 * 733 * @see #endOfSentenceAUC(char) 734 */ 735 public static char endOfSentence(char c) 736 { 737 char auc = endOfSentenceAUC(c); 738 739 if (auc != 0) c = auc; 740 741 // These three characters identify an "End of Sentence" Marker 742 if ((c == '.') || (c == '!') || (c == '?')) return c; 743 744 return (char) 0; 745 } 746 747 /** 748 * Checks for end-of-phrase punctuation marks - and "down-converts" them to the simple ASCII 749 * equivalent version of that punctuation mark. If the input character code is not an AUC 750 * version of a typical Mandarin-Chinese phrase-delimiting punctuation mark - then ASCII-zero 751 * is returned. 752 * 753 * <BR /><BR /><B>NOTE:</B> if a lower-level-ASCII (normal) punctuation mark is input - then 754 * ASCII-0 is returned. 755 * 756 * <BR /><BR /><B>SPECIFICALLY:</B> with {@code ',' ':' ';'} and other common phrase-ending 757 * marks in Mandarin as input to this function, ASCII-0 will be returned. 758 * 759 * <BR /><BR /><B>USE:</B> {@code endOfPhrase(c)} to have those punctuation marks included in 760 * non-zero results. 761 * 762 * @param c any UTF-8, ASCII or UniCode character available. 
763 * 764 * @return if the input character {@code 'c'} is an "alternate UTF-8" (AUC) version of the 765 * punctuation marks: 766 * 767 * <BR /><BR /><TABLE CLASS=JDBriefTable> 768 * <TR><TH>Punctuation</TH><TH>Symbol and ASCII-Code</TH></TR> 769 * <TR><TD>semi-colon </TD><TD>';' HEX:0x3B, DEC: 59</TD></TR> 770 * <TR><TD>comma </TD><TD>',' HEX:0x2C, DEC: 44</TD></TR> 771 * <TR><TD>colon </TD><TD>':' HEX:0x3A, DEC: 58</TD></TR> 772 * <TR><TD>double-quote </TD><TD>'\"' HEX:0x22, DEC: 34</TD></TR> 773 * <TR><TD>single-quote </TD><TD>'\'' HEX:0x27, DEC: 39</TD></TR> 774 * <TR><TD>left-bracket </TD><TD>'[' HEX:0x5B, DEC: 91</TD></TR> 775 * <TR><TD>right-bracket </TD><TD>']' HEX:0x5D, DEC: 93</TD></TR> 776 * <TR><TD>less-than </TD><TD>'<' HEX:0x3C, DEC: 60</TD></TR> 777 * <TR><TD>greater-than </TD><TD>'>' HEX:0x3E, DEC: 62</TD></TR> 778 * <TR><TD>left-paren </TD><TD>'(' HEX:0x28, DEC: 40</TD></TR> 779 * <TR><TD>right-paren </TD><TD>')' HEX:0x29, DEC: 41</TD></TR> 780 * </TABLE> 781 * 782 * <BR /><BR /> 783 * <B>IMPORTANT NOTE:</B> *only* the upper-level-UTF-8/UniCode versions of these 784 * punctuation marks will produce a non-zero result. An actual ASCII comma, semi-colon, quote, 785 * bracket, or parenthesis (etc...) will cause this method to return ASCII-0. Please use 786 * endOfPhrase(char) to include the lower-level (Already down-converted ASCII) with non-zero 787 * results. 788 * 789 * @see #endOfPhrase(char) 790 */ 791 public static char endOfPhraseAUC(char c) 792 { 793 char auc = punctuationAUC(c); 794 795 if (auc != 0) c = auc; 796 797 // A 'switch' is used instead of an 'if' with a char-cast because it is easier to 798 // read on this page. Only the characters having ASCII 59, 44, 58, 34, etc... should 799 // return non-zero values. 
800 switch ((int) auc) 801 { 802 // These characters constitute an "End of Phrase" marker 803 case 0x3B: return ';'; // DEC: 59 804 case 0x2C: return ','; // DEC: 44 805 case 0x3A: return ':'; // DEC: 58 806 case 0x22: return '\"'; // DEC: 34 807 case 0x27: return '\''; // DEC: 39 808 case 0x5B: return '['; // DEC: 91 809 case 0x5D: return ']'; // DEC: 93 810 case 0x3C: return '<'; // DEC: 60 811 case 0x3E: return '>'; // DEC: 62 812 case 0x28: return '('; // DEC: 40 813 case 0x29: return ')'; // DEC: 41 814 815 // All other results should return '0' 816 default: return 0; 817 } 818 } 819 820 /** 821 * endOfPhrase - any version of the end-of-phrase markers usually used in Mandarin Chinese 822 * text. This method returns the exact same results as the {@code endOfPhraseAUC(char)} 823 * method. 824 * 825 * <BR /><BR /><B><SPAN STYLE="color: red;">EXCEPT:</SPAN></B> 826 * The regular/normal version of that punctuation mark (ASCII for semi-colon, comma, quote, 827 * etc...) will return the exact-same semi-colon, comma or quote - <I><B>instead of</B></I> 828 * ASCII-0 829 * 830 * <BR /><TABLE CLASS=JDBriefTable> 831 * <TR><TH>Input & Method Called:</TH><TH>Result</TH></TR> 832 * <TR><TD>endOfPhrase(';') </TD><TD>';' // Normal ASCII semi-colon symbol</TD></TR> 833 * <TR><TD>endOfPhraseAUC(';') </TD><TD>0 // ASCII-0 returned</TD></TR> 834 * <TR><TD>endOfPhrase('】') </TD><TD>']' // left-bracket returned</TD></TR> 835 * <TR><TD>endOfPhraseAUC('】') </TD><TD>']' // left-bracket returned</TD></TR> 836 * <TR><TD>endOfPhrase(']') </TD><TD>']' // left-bracket returned</TD></TR> 837 * <TR><TD>endOfPhraseAUC(']') </TD><TD>0 // ASCII-0 returned</TD></TR> 838 * </TABLE> 839 * 840 * <BR /><BR /> 841 * The list of end-of-phrase characters include the following:<BR /> 842 * <B STYLE="color:red">{@code ';' ',' ':' '\"' '\'' '[' ']' '<' '>' '(' ')'}</B> 843 * 844 * @param c Any character in the entire UniCode range. 
0x0000 to 0xFFFF 845 * 846 * @return If {@code 'c'} is an "AUC" version of and end-of-phrase marker - or a regular 847 * lower-level ASCII version - then that punctuation mark is returned. Otherwise 0 is 848 * returned. 849 * 850 * @see #punctuationAUC(char) 851 */ 852 public static char endOfPhrase(char c) 853 { 854 char auc = punctuationAUC(c); 855 856 if (auc != 0) c = auc; 857 858 if ((c == ';') || (c == ',') || (c == ':') || 859 (c == '\"') || (c == '\'') || 860 (c == '[') || (c == ']') || 861 (c == '<') || (c == '>') || 862 (c == '(') || (c == ')')) 863 return c; 864 865 return (char) 0; 866 } 867 868 /** 869 * Quotes - any version. <B><I>AUC or normal-ASCII, (BOTH)</B></I> single or 870 * double quote. 871 * 872 * @param c Any character in the entire <B>UniCode</B> range. {@code 0x0000 to 0xFFFF} which is 873 * the {@code Basic Multi Lingual Plane}. 874 * 875 * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the single 876 * (or double) quote, or the <B><I>regular-ASCII</B></I> single/double quote, then the 877 * appropriate single or double-quote is returned. Otherwise 0 is returned. 878 * 879 * @see #punctuationAUC(char) 880 */ 881 public static char quoteAUC(char c) 882 { 883 char auc = punctuationAUC(c); 884 885 if (auc != 0) c = auc; 886 887 switch ((int) c) 888 { 889 case 0x22: return '\"'; // DEC: 34 890 case 0x27: return '\''; // DEC: 39 891 default: return (char) 0; 892 } 893 } 894 895 /** 896 * Comma - any version. <B><I>AUC or normal-ASCII, (BOTH)</B></I> comma 897 * @param c Any character in the entire <B>UTF-8</B> range. {@code 0x0000 to 0xFFFF}, the 898 * {@code Basic Multi-Lingual Plane}. 899 * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the comma, 900 * or the <B><I>regular-ASCII</B></I> comma, then the comma is returned. Otherwise 0 is returned. 
901 * @see #punctuationAUC(char) 902 */ 903 public static char commaAUC(char c) 904 { 905 char auc = punctuationAUC(c); 906 907 if (auc != 0) c = auc; 908 909 switch ((int) c) 910 { 911 case 0x2c: return ','; // DEC: 44 912 default: return (char) 0; 913 } 914 } 915 916 /** 917 * Brackets - any version. <B><I>AUC or normal-ASCII, (BOTH)</B></I> brackets 918 * 919 * @param c Any character in the entirbrackets UniCode range. 0x0000 to 0xFFFF 920 * 921 * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the 922 * brackets, or the <B><I>regular-ASCII</B></I> brackets, then the appropriate brackets are 923 * returned. Otherwise 0 is returned. 924 * 925 * @see #punctuationAUC(char) 926 */ 927 public static char bracketAUC(char c) 928 { 929 char auc = punctuationAUC(c); 930 931 if (auc != 0) c = auc; 932 933 switch ((int) c) 934 { 935 case 0x5B: return '['; // DEC: 91 936 case 0x5D: return ']'; // DEC: 93 937 case 0x3C: return '<'; // DEC: 60 938 case 0x3E: return '>'; // DEC: 62 939 default: return (char) 0; 940 } 941 } 942 943 /** 944 * Parenthesis - any version. <B><I>AUC or normal-ASCII, (BOTH)</B></I> parenthesis 945 * @param c Any character in the entire UniCode range. 0x0000 to 0xFFFF 946 * @return If the input character {@code 'c'} is an <B><I>"AUC" version</B></i> of the parenthesis, 947 * or the <B><I>regular-ASCII</B></I> parenthesis, then the appropriate parenthesis are 948 * returned. Otherwise 0 is returned. 949 * @see #punctuationAUC(char) 950 */ 951 public static char parenAUC(char c) 952 { 953 char auc = punctuationAUC(c); 954 955 if (auc != 0) c = auc; 956 957 switch ((int) c) 958 { 959 case 0x28: return '('; // DEC: 40 960 case 0x29: return ')'; // DEC: 41 961 default: return (char) 0; 962 } 963 } 964 965 /** 966 * The complete list of "higher-level" (alternate) Uni-Code chars. Many of these are alternate 967 * punctuation marks used in documents that contain Mandarin Chinese. 
*/
    public static final String AUC =
        // NOTE(review): several literals below contain a bare double-quote or a lone backslash
        // as they appear in this copy of the file.  Presumably the original characters were
        // fullwidth forms that were normalized to ASCII somewhere along the way - confirm
        // against the repository copy before editing these strings.

        // Special Punctuation characters found in Chinese HTML Pages
        "、 。 · ˉ ˇ ¨ 〃 々 — ~ ‖ … ‘ ’ " +
        "“ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】" +
        "± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠" +
        "⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵ " +
        "∴ ♂ ♀ ° ′ ″ ℃ $ ¤ ¢ £ ‰ § № ☆ ★" +
        "○ ● ◎ ◇ ◆ □ ■ △ ▲ ※ → ← ↑ ↓ 〓 " +
        "! " # ¥ % & ' ( ) * + , - . /" +

        // Extra Alphabetic and Numeric Characters sometimes used
        // on web-pages written in Chinese
        "0 1 2 3 4 5 6 7 8 9 : ; < = > ?" +
        "@ A B C D E F G H I J K L M N O" +
        "P Q R S T U V W X Y Z [ \ ] ^ _" +
        "` a b c d e f g h i j k l m n o" +
        "p q r s t u v w x y z { | }  ̄" +

        // Certain "Bullet List" / "Bullet Point" markers
        "⒈ ⒉ ⒊ ⒋ ⒌ ⒍ ⒎ ⒏ ⒐ ⒑ ⒒ ⒓ ⒔ ⒕ ⒖" +
        "⒗ ⒘ ⒙ ⒚ ⒛ ⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾" +
        "⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ ① ② ③ ④ ⑤ ⑥ ⑦" +
        "⑧ ⑨ ⑩ ㈠ ㈡ ㈢ ㈣ ㈤ ㈥ ㈦ ㈧ ㈨ ㈩" +
        "Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ Ⅶ Ⅷ Ⅸ Ⅹ Ⅺ Ⅻ" +

        // The "Bo Po Mo Fo" Pronunciation Used for Chinese Characters
        "ㄐ ㄑ ㄒ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ ㄚ ㄛ ㄜ ㄝ ㄞ ㄟ" +
        "ㄠ ㄡ ㄢ ㄣ ㄤ ㄥ ㄦ ㄧ ㄨ ㄩ";

    /**
     * Builds an HTML table with one row per non-space character of {@link #AUC}.  Each row
     * shows the character, its decimal and hexadecimal code, a down-converted representation,
     * and one column per classification method of this class (quote, end-of-sentence,
     * end-of-phrase, comma, bracket, parenthesis, bullet, BPMF).
     *
     * @return An HTML &lt;TABLE&gt; that contains many tests of the subroutines in this class
     */
    public static String testAUC()
    {
        StringBuilder ret = new StringBuilder();

        // Header row: four fixed-width cells here, plus eight more from the loop below, for a
        // total of twelve columns - the same number of cells appended for every data row.
        ret.append( "<TABLE BORDER=\"1\"><TR>" +
                    "<TD WIDTH=\"30\"> </TD>" +
                    "<TD WIDTH=\"70\"> </TD>" +
                    "<TD WIDTH=\"70\"> </TD>" +
                    "<TD WIDTH=\"30\"> </TD>" );

        for (int i=4; i < 12; i++)
            ret.append("<TD WIDTH=\"70\"> </TD>");
        ret.append("</TR>");;

        for (int i=0; i < AUC.length(); i++)
        {
            char c = AUC.charAt(i);

            // The entries of AUC are separated by spaces; those are not test subjects
            if (c == ' ') continue;

            // Check original character (not punctuation-converted cc)
            // NOTE(review): 'bl' is the first character of a decimal string, so it is a digit
            // or '-' and can never equal 0.  As written, the "bl != 0" test below always
            // succeeds and the "Bullet" column is filled for every row.  Presumably the intent
            // was to test the numeric result of bulletListAUC(c) - confirm and fix.
            char    bl      = Integer.toString(bulletListAUC(c)).charAt(0);
            boolean bpmf    = isBPMFAUC(c);

            // first, convert the punctuation to normal-ASCII punctuation
            // These are the "translated" characters
            // The "translated character" is where, for example '〗' ==> ']'
            char newC = punctuationAUC(c);

            // These are used for building <TABLE> & <TD> entry strings
            char q      = quoteAUC(newC);
            char es     = endOfSentenceAUC(newC);
            char ep     = endOfPhraseAUC(newC);
            char com    = commaAUC(newC);
            char br     = bracketAUC(newC);
            char p      = parenAUC(newC);

            // Choose the character shown in the fourth column.  Later assignments override
            // earlier ones, and 'x' is the placeholder when nothing produced a value.
            char ascii = punctuationAUC(c);
            if (ascii == 0) ascii = alphaNumericAUC(c);
            if (bl != 0)    ascii = bl;
            if (bpmf)       ascii = c;
            if (ascii == 0) ascii = 'x';

            // =================================================
            // This is for debugging this test function
            // NOTE(review): 'tmp' is only read by the commented-out line near the end of this
            // loop, so it is currently dead.  The replaceAll calls replace each bracket with
            // itself as shown here; presumably the replacements were HTML entities - confirm.
            String tmp =    " newCC = " + newC + ", q=" + q +
                            ", es=" + es + ", ep=" + ep +
                            ", com=" + com + ", br=" + br +
                            ", p=" + p + ", bl =" + bl +
                            ", bpmf=" + bpmf;

            tmp = tmp.replaceAll("<", "<").replaceAll(">", ">");

            // Build the HTML Table
            // NOTE(review): no closing "</TR>" is appended for data rows; the only one is in
            // the commented-out debug line below.
            ret.append("<TR>");

            ret.append("<TD>" + c + "</TD>");
            ret.append("<TD>" + ((int) c) + "</TD>");
            ret.append("<TD>" + "0x" + String.format("%x",(int) c).toUpperCase() + "</TD>");
            ret.append("<TD>" + ascii + "</TD>");

            ret.append("<TD>" + ((q == 0)   ? "" : "Quote")     + "</TD>");
            ret.append("<TD>" + ((es == 0)  ? "" : "Sentence")  + "</TD>");
            ret.append("<TD>" + ((ep == 0)  ? "" : "Phrase")    + "</TD>");
            ret.append("<TD>" + ((com == 0) ? "" : "Comma")     + "</TD>");
            ret.append("<TD>" + ((br == 0)  ? "" : "Bracket")   + "</TD>");
            ret.append("<TD>" + ((p == 0)   ? "" : "Paren")     + "</TD>");
            ret.append("<TD>" + ((bl == 0)  ? "" : "Bullet")    + "</TD>");
            ret.append("<TD>" + (bpmf       ? "BPMF" : "")      + "</TD>");

            // ==========================================================
            // Un-Comment this if you want to debug this print function
            // outStr += "</TR><TR><TD COLSPAN=\"12\">" + tmp + "</TD></TR>";

        }
        ret.append("</TABLE>");
        return ret.toString();
    }

    /**
     * Checks for any leading alphabetic {@code ('a' ... 'z')} and numeric {@code ('0' ... '9')}
     * characters in a Chinese {@code String}.
     *
     * <B>CHANGED:</B> 2018.09.24 - I left comma's and period's in the {@code String} (when
     * situated between digits).  These are considered to be part of the "Leading Letters and
     * Numbers"
     *
     * <BR /><BR /><B>NOTE:</B> The loop accepts every {@code '.'} and {@code ','} it meets; it
     * does not inspect the neighbouring characters to see whether they are digits.
     *
     * @param chineseSentence A sentence that may or may not have leading letters & numbers.
     *
     * @return the {@code String}-index of the first character that is not alpha-numeric (per
     * {@link #isAlphaNumeric(char)}) and is neither a period nor a comma.  If every character
     * qualifies, the length of the {@code String} is returned.
     *
     * <BR /><BR /><B>NOTE:</B> white-space does not count, and the position of the first
     * white-space character will be returned, if white-space is contained in this {@code String}.
     *
     * @see #isAlphaNumeric(char)
     */
    public static int countLeadingLettersAndNumbers(String chineseSentence)
    {
        for (int i = 0; i < chineseSentence.length(); i++)
        {
            char c = chineseSentence.charAt(i);

            // Stop at the first character that is outside the "leading" set
            if ((! isAlphaNumeric(c)) && (c != '.') && (c != ',')) return i;
        }

        return chineseSentence.length(); // This really ought not to happen, but just in case....
    }

    /**
     * Checks for higher-Unicode letters and numbers, and converts them into lower-level versions
     * of the appropriate letter or number.
     *
     * <BR /><BR /><B>SPECIFICALLY:</B> This method is just a "for-loop" which
     * makes a call to {@code alphaNumericAUC()} and if zero is not returned from that
     * method-call, then the input {@code String} is modified at the index which contained such a
     * higher {@code UTF-8} letter or number.
1117 * 1118 * @param s This may or may not have "Alternate UniCode" Characters for letters and numbers. 1119 * 1120 * @return if the "alternate" versions of <CODE>'A' ... 'Z'</CODE> or <CODE>'0' ... '9'</CODE> 1121 * are there, this will make sure to change them. 1122 * 1123 * @see #alphaNumericAUC(char) 1124 */ 1125 public static String convertAnyAUC(String s) 1126 { 1127 char[] cArr = s.toCharArray(); 1128 1129 for (int i = 0; i < cArr.length; i++) 1130 { 1131 char auc = alphaNumericAUC(cArr[i]); 1132 if (auc != 0) cArr[i] = auc; 1133 } 1134 1135 return new String(cArr); 1136 } 1137 1138 /** 1139 * Counts syllables in a "word" of PinYin. The input {@code String} is expected to not have 1140 * any spaces! 1141 * 1142 * <BR /><BR /> 1143 * <B>NOTE:</B>The number of syllables in a Chinese PinYin "word" identifies the 1144 * number of Chinese Characters that were used to generate the input 1145 * <B>PinYin {@code String}</B>. 1146 * 1147 * <BR /><BR /><B>CHANGED:</B> 2018.09.24 - Added a test for periods and commas that are 1148 * situated directly between two digits. In the String "5.0" the period between 5 and 0 is no 1149 * longer removed! 1150 * 1151 * <BR /><BR />If the {@code String} "5.0" were passed as the "word" parameter, the result 1152 * should be 3! 1153 * 1154 * @param word A word in the "PinYin" format. (罗马拼音) 1155 * 1156 * @param DOUT This must implement {@code java.lang.Appendable} 1157 * 1158 * @return the number of syllables (specifically: Chinese Characters) in the input word. 1159 * 1160 * @throws IOException The interface {@code java.lang.Appendable} mandates that the 1161 * {@code IOException} must be treated as a checked exception for all output operations. 1162 * Therefore {@code IOException} is a required exception in this method' throws clause. 
1163 */ 1164 public static int countSyllablesAndNonChinese(String word, Appendable DOUT) 1165 throws IOException 1166 { 1167 int numChinese = 0; 1168 1169 // Tone-Vowels & Numbers always correspond to a character 1170 for (int letter = 0; letter < word.length(); letter++) 1171 { 1172 char c = word.charAt(letter); 1173 if ( ZH.isToneVowel(c) || 1174 ZH.isNumber(c) || 1175 (c == '.') || 1176 (c == ',') 1177 ) 1178 numChinese++; 1179 } 1180 1181 // Checks for vowel-strings that don't contain a tone 1182 // ==> Checks for "clear tone" 1183 String copyW = "" + word; 1184 1185 DOUT.append("[" + copyW + "] - "); 1186 1187 for (int letterIndex = 0; letterIndex < copyW.length(); letterIndex++) 1188 if ( ! ZH.isRegVowel(copyW.charAt(letterIndex)) && 1189 ! ZH.isToneVowel(copyW.charAt(letterIndex)) ) 1190 copyW = StringParse.setChar(copyW, letterIndex, ' '); 1191 1192 DOUT.append("after erasing non-vowels [" + copyW + "]\n"); 1193 1194 String[] syllables = copyW.trim().split(" "); 1195 1196 DOUT.append("Syllables are:"); 1197 for (int sylIndex = 0; sylIndex < syllables.length; sylIndex++ ) 1198 DOUT.append("[" + syllables[sylIndex] + "]"); 1199 DOUT.append("\n"); 1200 1201 TOP: 1202 for (int sylIndex = 0; sylIndex < syllables.length; sylIndex++) 1203 { 1204 String syllable = syllables[sylIndex].trim(); 1205 boolean foundTone = false; 1206 1207 // The split(' ') function sometimes provides blanks 1208 if (syllable.length() == 0) continue TOP; 1209 1210 for (int vowelIndex = 0; vowelIndex < syllable.length(); vowelIndex++) 1211 if (ZH.isToneVowel(syllable.charAt(vowelIndex))) 1212 continue TOP; 1213 1214 numChinese++; 1215 DOUT.append("NOTE: *** FOUND CLEAR TONE\n"); 1216 } 1217 1218 return numChinese; 1219 } 1220 1221 /** 1222 * Deletes all punctuation & non-character symbols. The {@code String} that is returned 1223 * will be shortened by precisely the number of punctuation characters were contained by that 1224 * {@code String}. 
1225 * 1226 * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between 1227 * number/digits are not removed! 1228 * 1229 * @param s An input {@code String} (in Mandarin - 普通话) 1230 * 1231 * @return a {@code String} that is the same as the input {@code String} - after skipping 1232 * characters as follows: 1233 * 1234 * <BR /><DIV CLASS="SNIP">{@code 1235 * if (isChinese(c) || isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue; 1236 * (else) s = StringParse.delChar(s, chr--); 1237 * }</DIV> 1238 */ 1239 public static String delAllPunctuationCHINESE(String s) 1240 { 1241 char[] cArr = s.toCharArray(); 1242 int sourcePos = 0; 1243 int destPos = 0; 1244 1245 while (sourcePos < cArr.length) 1246 { 1247 char c = cArr[sourcePos]; 1248 1249 // Check for things like 5.0 or 1,120,987 - SPECIFICALLY Comma's and Period's situated 1250 // directly between 2 numbers. 1251 1252 if ( ((c == '.') || (c == ',')) 1253 && (((sourcePos-1) == -1) || isNumber(cArr[sourcePos-1])) 1254 && (((sourcePos+1) == s.length()) || isNumber(cArr[sourcePos+1])) 1255 ) 1256 { cArr[destPos++] = cArr[sourcePos++]; continue; } 1257 1258 // AUC were converted before calling this function ... (alphaNumericAUC(c) != 0)) 1259 1260 if (isChinese(c) || isAlphaNumeric(c)) 1261 { cArr[destPos++] = cArr[sourcePos++]; continue; } 1262 1263 sourcePos++; 1264 } 1265 1266 return s; 1267 } 1268 1269 /** 1270 * Deletes all punctuation & non-character symbols from a {@code String} of PinYin. 1271 * The returned {@code String} will have the same length as it originally did, but the 1272 * locations where punctuation existed will have been replaced with a space character. 1273 * 1274 * <BR /><BR /><B>NOTE:</B> {@code '.'} and {@code ','} (periods and commas) between 1275 * number/digits are not removed! 
1276 * 1277 * @param s An input {@code String} in 罗马拼音 1278 * 1279 * @return A {@code String} that is the same as the input {@code String} - after skipping 1280 * characters as follows: 1281 * 1282 * <BR /><DIV CLASS="SNIP">{@code 1283 * if (isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue; 1284 * (else) s = StringParse.setChar(s, chr, ' '); 1285 * }</DIV> 1286 */ 1287 public static String delAllPunctuationPINYIN(String s) 1288 { 1289 char[] cArr = s.toCharArray(); 1290 1291 // This loop cnverts all non-AlphaNumeric unicode to a space 1292 for (int i = 0; i < cArr.length; i++) 1293 { 1294 char c = cArr[i]; 1295 1296 if (isAlphaNumeric(c) || (alphaNumericAUC(c) != 0)) continue; 1297 1298 // Check for things like 5.0 or 1,120,987 - SPECIFICALLY Comma's and Period's 1299 // situated directly between 2 numbers. 1300 1301 if ( ((c == '.') || (c == ',')) 1302 && (((i-1) == -1) || isNumber(cArr[i-1])) 1303 && (((i+1) == s.length()) || isNumber(cArr[i+1])) 1304 ) 1305 continue; 1306 1307 cArr[i] = ' '; 1308 } 1309 1310 return new String(cArr); 1311 } 1312 1313 // **************************************************************************************** 1314 // Constants 1315 // **************************************************************************************** 1316 1317 /** Special Quotation Mark, left-side */ 1318 public static final char CONSTSpecialQuoteLeft = (char) 0x201C; 1319 1320 /** Special Quotation Mark, right-side */ 1321 public static final char CONSTSpecialQuoteRight = (char) 0x201D; 1322 1323 /** 1324 * <B STYLE="color: red;">GTPPE: Google Translate Punctuation Pronunciation Equivalent</B> 1325 * This searches through a {@code String} to find the location of the "equivalent punctuation 1326 * mark" 1327 * 1328 * @param s The input {@code String}, expected to be the result of a <B>GCS TS</B> query. This 1329 * function is totally useless for any {@code Pronunciation String} that hasn't been obtained 1330 * from <B>GCS TS</B>. 
1331 * 1332 * <BR /><BR /><B>NOTE:</B> The input {@code String} is intended to be in "PinYin" (罗马拼音) 1333 * 1334 * @param c The original punctuation character to look for... Generally, this is used to 1335 * search for higher-level <B>UTF-8 {@code chars}</B> that have been "down-converted" by <B>GCS 1336 * TS</B> 1337 * 1338 * @return the {@code indexOf()} of the character in the original input String. The actual 1339 * character is not looked for, BUT RATHER, the <B>Google Cloud Server Transation Services</B> 1340 * equivalent character. Specifically, {@code GCSTS} has a "substitute punctuation" for many 1341 * higher-level <B>UTF-8</B> and <B>UniCode</B> chars. There are 5 different versions of a 1342 * quote... 1343 */ 1344 public static int GTPPEIndexOf(String s, char c) 1345 { 1346 int cc = (int) c; 1347 1348 // if (c == '∶') return s.indexOf(c); 1349 if (cc == 0x2236) return s.indexOf(c); 1350 // if (c == ':') return s.indexOf(':'); 1351 if (cc == 0xFF1A) return s.indexOf(':'); // (0x003A); 1352 // if (c == ':') return s.indexOf(c); // Natural colon 1353 if (cc == 0x003A) return s.indexOf(c); 1354 1355 // commas 1356 // if (c == '、') return s.indexOf(','); 1357 if (cc == 0x3001) return s.indexOf(','); // (0x002C); 1358 // if (c == ',') return s.indexOf(','); 1359 if (cc == 0xFF0C) return s.indexOf(','); // (0x002C); 1360 // if (c == ',') return s.indexOf(c); // natural comma 1361 if (cc == 0x002C) return s.indexOf(c); 1362 1363 // periods 1364 // if (c == '。') return s.indexOf('.'); 1365 if (cc == 0x3002) return s.indexOf('.'); // (0x002E); 1366 // if (c == '○') return s.indexOf(c); 1367 if (cc == 0x25CB) return s.indexOf(c); 1368 // if (c == '●') return s.indexOf(c); 1369 if (cc == 0x25CF) return s.indexOf(c); 1370 // if (c == '.') return s.indexOf('.'); 1371 if (cc == 0xFF0E) return s.indexOf('.'); // (0x002E); 1372 // if (c == '.') return s.indexOf(c); // natural period 1373 if (cc == 0x002E) return s.indexOf(c); 1374 1375 1376 // Exclamation & Question 
1377 // if (c == '?') return s.indexOf(c); // natural question-mark 1378 if (cc == 0x003F) return s.indexOf(c); 1379 // if (c == '?') return s.indexOf('?'); 1380 if (cc == 0xFF1F) return s.indexOf('?'); // (0x003F); 1381 // if (c == '!') return s.indexOf('!'); 1382 if (cc == 0xFF01) return s.indexOf('!'); // (0x0021); 1383 // if (c == '!') return s.indexOf(c); // natural exclamation 1384 if (cc == 0x0021) return s.indexOf(c); 1385 1386 // single-quotes 1387 // if (c == '‘') return s.indexOf(c); 1388 if (cc == 0x2018) return s.indexOf(c); 1389 // if (c == '’') return s.indexOf(c); 1390 if (cc == 0x2019) return s.indexOf(c); 1391 // if (c == '′') return s.indexOf(c); 1392 if (cc == 0x2032) return s.indexOf(c); 1393 // if (c == ''') return s.indexOf('\''); 1394 if (cc == 0xFF07) return s.indexOf('\''); // (0x0027); 1395 // if (c == '`') return s.indexOf('`'); 1396 if (cc == 0xFF40) return s.indexOf('`'); // (0x0060); 1397 // if (c == '\'') return s.indexOf(c); // natural single-quotes 1398 if (cc == 0x0027) return s.indexOf(c); 1399 1400 1401 // NOT DETECTED RIGHT NOW.. 
1402 // if (c == '《') return s.indexOf('“'); 1403 if (cc == 0x300A) return s.indexOf(CONSTSpecialQuoteLeft); 1404 // if (c == '》') return s.indexOf('”'); 1405 if (cc == 0x300B) return s.indexOf(CONSTSpecialQuoteRight); 1406 1407 // double-quotes 1408 // if (c == '¨') return s.indexOf(c); 1409 if (cc == 0x00A8) return s.indexOf(c); 1410 // if (c == '〃') return s.indexOf(c); 1411 if (cc == 0x3003) return s.indexOf(c); 1412 // if (c == '“') return s.indexOf(c); 1413 if (cc == 0x201C) return s.indexOf(c); 1414 // if (c == '”') return s.indexOf(c); 1415 if (cc == 0x201D) return s.indexOf(c); 1416 // if (c == '″') return s.indexOf(c); 1417 if (cc == 0x2033) return s.indexOf(c); 1418 // if (c == '"') return s.indexOf('\"'); 1419 if (cc == 0xFF02) return s.indexOf('\"'); // (0x0022); 1420 // if (c == '\"') return s.indexOf(c); // natural double quotes 1421 if (cc == 0x0022) return s.indexOf(c); 1422 1423 1424 // Brackets 1425 // if (c == '[') return s.indexOf(c); 1426 if (cc == 0x005B) return s.indexOf(c); 1427 // if (c == ']') return s.indexOf(c); 1428 if (cc == 0x005D) return s.indexOf(c); 1429 // if (c == '[') return s.indexOf('['); 1430 if (cc == 0xFF3B) return s.indexOf('['); // (0x005B); 1431 // if (c == ']') return s.indexOf(']'); 1432 if (cc == 0xFF3D) return s.indexOf(']'); // (0x005D); 1433 // if (c == '【') return s.indexOf('['); 1434 if (cc == 0x3010) return s.indexOf('['); // (0x005B); 1435 // if (c == '】') return s.indexOf(']'); 1436 if (cc == 0x3011) return s.indexOf(']'); // (0x005D); 1437 // if (c == '〖') return s.indexOf(c); 1438 if (cc == 0x3016) return s.indexOf(c); 1439 // if (c == '〗') return s.indexOf(c); 1440 if (cc == 0x3017) return s.indexOf(c); 1441 // if (c == '『') return s.indexOf('“'); 1442 if (cc == 0x300E) return s.indexOf(CONSTSpecialQuoteLeft); 1443 // if (c == '』') return s.indexOf('”'); 1444 if (cc == 0x300F) return s.indexOf(CONSTSpecialQuoteRight); 1445 // if (c == '「') return s.indexOf('`'); 1446 if (cc == 0x300C) return 
s.indexOf('`'); // (0x0060); 1447 // if (c == '」') return s.indexOf('\''); 1448 if (cc == 0x300D) return s.indexOf('\''); // (0x0027); 1449 1450 1451 // Parenthesis 1452 // if (c == '(') return s.indexOf(c); 1453 if (cc == 0x0028) return s.indexOf(c); 1454 // if (c == ')') return s.indexOf(c); 1455 if (cc == 0x0029) return s.indexOf(c); 1456 // if (c == '(') return s.indexOf('('); 1457 if (cc == 0xFF08) return s.indexOf('('); // (0x0028); 1458 // if (c == ')') return s.indexOf(')'); 1459 if (cc == 0xFF09) return s.indexOf(')'); // (0x0029); 1460 // if (c == '〔') return s.indexOf(c); 1461 if (cc == 0x3014) return s.indexOf(c); 1462 // if (c == '〕') return s.indexOf(c); 1463 if (cc == 0x3015) return s.indexOf(c); 1464 1465 System.out.println("character not found: \'" + c + "\'\nZH.GTPPEIndexOf(String s, char c)"); 1466 System.exit(0); 1467 return 0; 1468 } 1469}