001package Torello.Languages; 002 003import java.util.*; 004 005import Torello.Java.*; 006 007/** 008 * Some simple String Utilities for helping parse (Español) Spanish <CODE>String's</CODE>. 009 * 010 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ES> 011 */ 012public class ES 013{ 014 private ES() { } 015 016 /** 017 * GRAVE & ACCUTE are the "first bit" of this mask, if that bit is '0', then the mask is 018 * ACCUTE 019 */ 020 public static final int GRAVE = 0b0001; 021 022 /** 023 * UPPER & LOWER CASE are the "second bit" of this mask, if that bit is '0', then he mask 024 * is LOWER-CASE 025 */ 026 public static final int UPPERCASE = 0b0010; 027 028 /** 029 * This is intended to produce an accented vowel 'on request' from the method invocation. The 030 * complete list of characters that may be returned by this function are listed below. 031 * 032 * <BR /><BR /><TABLE BORDER='1'><TBODY> 033 * <TR><TH>Upper, Grave</TH><TH>Upper, Acute</TH><TH>Lower, Grave</TH><TH>Lower, Acute</TH></TR> 034 * <TR><TD>À (192)</TD><TD>Á (193)</TD><TD>à (224)</TD><TD>á (225)</TD></TR> 035 * <TR><TD>È (200)</TD><TD>É (201)</TD><TD>è (232)</TD><TD>é (233)</TD></TR> 036 * <TR><TD>Ì (204)</TD><TD>Í (205)</TD><TD>ì (236)</TD><TD>í (237)</TD></TR> 037 * <TR><TD>Ò (210)</TD><TD>Ó (211)</TD><TD>ò (242)</TD><TD>ó (243)</TD></TR> 038 * <TR><TD>Ù (217)</TD><TD>Ú (218)</TD><TD>ù (249)</TD><TD>ú (250)</TD></TR> <BR /> 039 * </TBODY></TABLE> 040 * 041 * @param vowel Any vowel: [A, E, I, O, U] or [a, e, i, o, u] 042 * <BR /><BR />If 'vowel' is not one of these 10 choices, then other characters will be 043 * ignored, and this method will just return (char) 0. 044 * 045 * @param flags The following values can be OR'D (masked): Helper.GRAVE or Helper.UPPERCASE 046 * 047 * <BR /> <BR />In total, there are 4 possible versions: Upper-Case/Lower-Case output, and 048 * Accute/Grave output. 049 * 050 * <BR /><BR /><UL CLASS=JDUL> 051 * <LI> If Helper.GRAVE is not masked (binary-bit 0), 052 * then an "accute" accented vowel is returned (accute is "the default"). 053 * </LI> 054 * <LI> If Helper.UPPERCASE is not masked (binary-bit 1), 055 * then a lower-case vowel is returned (lower-case is "the default"). 056 * </LI> 057 * </UL> 058 * 059 * @return With correct input: one of ten listed vowels above - and if not, then ASCII 0 is 060 * returned. 061 */ 062 public static char getAccentedVowel(char vowel, int flags) 063 { 064 int i = 0; 065 066 if ((vowel == 'a') || (vowel == 'A')) i = 192; 067 else if ((vowel == 'e') || (vowel == 'E')) i = 200; 068 else if ((vowel == 'i') || (vowel == 'I')) i = 204; 069 else if ((vowel == 'o') || (vowel == 'O')) i = 210; 070 else if ((vowel == 'u') || (vowel == 'U')) i = 217; 071 else return (char) 0; 072 073 // À (192)È (200)Ì (204)Ò (210)Ù (217) 074 if ( ((flags & UPPERCASE) > 0) 075 && ((flags & GRAVE) > 0) 076 ) 077 return (char) (i + 0); 078 079 // Á (193)É (201)Í (205)Ó (211)Ú (218) 080 else if ((flags & UPPERCASE) > 0) return (char) (i + 1); 081 082 // à (224)è (232)ì (236)ò (242)ù (249) 083 else if ((flags & GRAVE) > 0) return (char) (i + 32); 084 085 // á (225)é (233)í (237)ó (243)ú (250) 086 else return (char) (i + 33); 087 } 088 089 090 /** 091 * This converts all Spanish-Accented characters into a lower-case, and non-accented 092 * equivalent. Also, upper-case regular characters are down-cased. If specifically 093 * requested, case can be preserved. 094 * 095 * <BR /><BR /><TABLE> 096 * <TR><TD>A (65) ... Z (90) </TD><TD>⇒ a .. z </TD></TR> 097 * <TR><TD>À (192), Á (193), à (224), á (225) </TD><TD>⇒ A or a </TD></TR> 098 * <TR><TD>È (200), É (201), è (232), é (233) </TD><TD>⇒ E or e </TD></TR> 099 * <TR><TD>Ì (204), Í (205), ì (236), í (237) </TD><TD>⇒ I or i </TD></TR> 100 * <TR><TD>Ò (210), Ó (211), ò (242), ó (243) </TD><TD>⇒ O or o </TD></TR> 101 * <TR><TD>Ù (217), Ú (218), ù (249), ú (250) </TD><TD>⇒ U or u </TD></TR> 102 * <TR><TD>Ñ (209), ñ (241) </TD><TD>⇒ N or n </TD></TR> 103 * <TR><TD>Ü (220), ü (252) </TD><TD>⇒ U or u </TD></TR> 104 * <TR><TD>Ý (221), ý (253) </TD><TD>⇒ Y or y </TD></TR> 105 * </TABLE> 106 * 107 * @param c Any ASCII/UniCode character 108 * 109 * @param preserveCase If this is TRUE, then accented capital letters remain capitlized. If 110 * this is FALSE, then all letters are converted to lowercase. 111 * 112 * @return If this character contained an accent, it will be removed. It will also be in 113 * lower-case form, unless preserveCase is TRUE. 114 */ 115 public static char toNonAccented(char c, boolean preserveCase) 116 { 117 if ((c == 224) || (c == 225)) return 'a'; 118 if ((c == 232) || (c == 233)) return 'e'; 119 if ((c == 236) || (c == 237)) return 'i'; 120 if ((c == 242) || (c == 243)) return 'o'; 121 if ((c == 249) || (c == 250)) return 'u'; 122 if (c == 241) return 'n'; 123 if (c == 252) return 'u'; 124 if (c == 253) return 'y'; 125 126 if ((c == 192) || (c == 193)) return (preserveCase ? 'A' : 'a'); 127 if ((c == 200) || (c == 201)) return (preserveCase ? 'E' : 'e'); 128 if ((c == 204) || (c == 205)) return (preserveCase ? 'I' : 'i'); 129 if ((c == 210) || (c == 211)) return (preserveCase ? 'O' : 'o'); 130 if ((c == 217) || (c == 218)) return (preserveCase ? 'U' : 'u'); 131 if (c == 209) return (preserveCase ? 'N' : 'n'); 132 if (c == 220) return (preserveCase ? 'U' : 'u'); 133 if (c == 221) return (preserveCase ? 'Y' : 'y'); 134 135 if ((c >= 'A') && (c <= 'Z')) return (char) (preserveCase ? c : (c -'A' + 'a')); 136 137 return c; 138 } 139 140 /** 141 * Removes Spanish-Accent Characters from all characters in a string. 142 * 143 * @return a new String, one where toNonAccented(s.charAt(i), preserveCase) has been 144 * called for each character in the String. This is just a small for-loop over a String. 145 * 146 * @see #toNonAccented(char, boolean) 147 */ 148 public static String toNonAccented(String s, boolean preserveCase) 149 { 150 StringBuilder sb = new StringBuilder(); 151 int len = s.length(); 152 153 for (int i=0; i < len; i++) sb.append(toNonAccented(s.charAt(i), preserveCase)); 154 155 return sb.toString(); 156 } 157 158 /** 159 * Produces a <I>lower-case Spanish Character</I> - if and only if the input-parameter 160 * is an <I>upper-case Spanish Character</I>. 161 * This is almost identifical to the usual String function toLowerCase(char), but it 162 * also includes Spanish vowels and consonants with: 163 * 164 * <BR /><BR /><UL CLASS=JDUL> 165 * <LI>accent marks: À, Á, à, and á ... etc.</LI> 166 * <LI>umlaut's: Ü and ü</LI> 167 * <LI>tildes: Ñ and ñ</LI> 168 * </UL> 169 * 170 * <BR /><B>NOTE:</B> The 'accute' and 'grave' accent marks are not so prevalently used anymore 171 * as in the time of "Don Quijote de la Mancha" - however, they are included here, just in case. 172 * Mostly the 'acute' accent mark (from top-right-corner to the lower-left-corner) is used in 173 * newspapers around here (Dallas, Texas). 174 * 175 * @param c Any ASCII or UniCode {@code char} 176 * 177 * @return Uppercase letters 'A' .. 'Z' are converted to 'a' .. 'z' <BR /> 178 * AND: 179 * 180 * <BR /><BR /><TABLE> 181 * <TR><TD>À (192), Á (193) </TD><TD>⇒ à (224), á (225) </TD></TR> 182 * <TR><TD>È (200), É (201) </TD><TD>⇒ è (232), é (233) </TD></TR> 183 * <TR><TD>Ì (204), Í (205) </TD><TD>⇒ ì (236), í (237) </TD></TR> 184 * <TR><TD>Ò (210), Ó (211) </TD><TD>⇒ ò (242), ó (243) </TD></TR> 185 * <TR><TD>Ù (217), Ú (218) </TD><TD>⇒ ù (249), ú (250) </TD></TR> 186 * <TR><TD>Ñ (209) </TD><TD>⇒ ñ (241) </TD></TR> 187 * <TR><TD>Ý (221) </TD><TD>⇒ ý (253) </TD></TR> 188 * <TR><TD>Ü (220) </TD><TD>⇒ ü (252) </TD></TR> 189 * </TABLE> 190 * 191 * @see #toUpperCaseSpanish(char) 192 * @see #toLowerCaseSpanish(String) 193 */ 194 public static char toLowerCaseSpanish(char c) 195 { 196 if ((c >= 'A') && (c <= 'Z')) return (char) (c + 'a' - 'A'); 197 198 else if ( 199 (c == 192) || (c == 193) || (c == 200) || (c == 201) 200 || (c == 204) || (c == 205) || (c == 210) || (c == 211) 201 || (c == 217) || (c == 218) || (c == 209) || (c == 220) 202 || (c == 221) 203 ) 204 return (char) (c + 32); 205 206 return c; 207 } 208 209 /** 210 * This cycles through an input-String parameter, and converts any/all letters that are 211 * uppercase - including ones with accent marks, tildes, and umlaut's, and returns a 212 * {@code String} n which all characters are lower-case, but have their punctuation preserved. 213 * 214 * @return a new string in which Helper.toLowerCaseSpanish(char) has been invoked on each 215 * character. 216 * 217 * @see #toLowerCaseSpanish(char) 218 */ 219 public static String toLowerCaseSpanish(String s) 220 { 221 StringBuilder ret = new StringBuilder(); 222 for (int i=0; i < s.length(); i++) ret.append(toLowerCaseSpanish(s.charAt(i))); 223 return ret.toString(); 224 } 225 226 227 228 229 /** 230 * Produces an <I>upper-case Spanish Character</I> - if and only if the input-parameter 231 * is a <I>lower-case Spanish Character</I>. See toLowerCaseSpanish(char) for more notes! 232 * 233 * @param c Any ASCII or UniCode char 234 * 235 * @return Lowercase letters {@code 'a' .. 'z'} are converted to {@code 'A' .. 'Z'} 236 * 237 * <BR /><BR />AND: 238 * 239 * <BR /><BR /><TABLE> 240 * <TR><TD>à (224), á (225) </TD><TD>⇒ À (192), Á (193)</TD></TR> 241 * <TR><TD>è (232), é (233) </TD><TD>⇒ È (200), É (201)</TD></TR> 242 * <TR><TD>ì (236), í (237) </TD><TD>⇒ Ì (204), Í (205)</TD></TR> 243 * <TR><TD>ò (242), ó (243) </TD><TD>⇒ Ò (210), Ó (211)</TD></TR> 244 * <TR><TD>ù (249), ú (250) </TD><TD>⇒ Ù (217), Ú (218)</TD></TR> 245 * <TR><TD>ñ (241) </TD><TD>⇒ Ñ (209) </TD></TR> 246 * <TR><TD>ý (253) </TD><TD>⇒ Ý (221) </TD></TR> 247 * <TR><TD>ü (252) </TD><TD>⇒ Ü (220) </TD></TR> 248 * </TABLE> 249 * 250 * @see #toLowerCaseSpanish(char) 251 * @see #toUpperCaseSpanish(String) 252 */ 253 public static char toUpperCaseSpanish(char c) 254 { 255 if ((c >= 'a') && (c <= 'z')) 256 return (char) (c + 'A' - 'a'); 257 258 else if ( (c == 224) || (c == 225) || (c == 232) || (c == 233) 259 || (c == 236) || (c == 237) || (c == 242) || (c == 243) 260 || (c == 249) || (c == 250) || (c == 241) || (c == 253) 261 || (c == 252) 262 ) 263 return (char) (c - 32); 264 265 return c; 266 } 267 268 /** 269 * This cycles through an input-String parameter, and converts any/all letters 270 * that are lower-case, including ones with accent marks, tildes, and umlaut's, and 271 * returns a String in which all characters are upper-case, but have their punctuation 272 * preserved. 273 * 274 * @return a new string in which Helper.toUpperCaseSpanish(char) has been invoked on each 275 * character. 276 * 277 * @see #toUpperCaseSpanish(char) 278 */ 279 public static String toUpperCaseSpanish(String s) 280 { 281 StringBuilder ret = new StringBuilder(); 282 for (int i=0; i < s.length(); i++) ret.append(toLowerCaseSpanish(s.charAt(i))); 283 return ret.toString(); 284 } 285 286 287 /** 288 * Checks if this character could be a Spanish Language Character 289 * 290 * @param c Any ASCII or Uni-Code Character 291 * 292 * @return <B>TRUE:</B> If and only if 'c' is one of the following char-sets: 293 * 294 * <BR /><BR /><UL CLASS=JDUL> 295 * <LI>a ... z</LI> 296 * <LI>A ... Z</LI> 297 * <LI>Á (193), É (201), Í (205), Ó (211), Ú (218), Ý (221), Ü (220), Ñ (209)</LI> 298 * <LI>á (225), é (233), í (237), ó (243), ú (250), ý (253), ü (252), ñ (241)</LI> 299 * </UL> 300 * 301 * <BR />and <B>FALSE</B> otherwise... 302 */ 303 public static boolean isLanguageChar(char c) 304 { 305 if ((c >= 'a') && (c <= 'z')) return true; 306 if ((c >= 'A') && (c <= 'Z')) return true; 307 308 // Á 193, É 201, Í 205, Ó 211, Ú 218, Ý 221, Ü 220, Ñ 209 309 if ( (c == 193) || (c == 201) || (c == 205) || (c == 211) || (c == 218) || (c == 221) 310 || (c == 220) || (c == 209)) 311 return true; 312 313 // á 225, é 233, í 237, ó 243, ú 250, ý 253, ü 252, ñ 241 314 if ( (c == 225) || (c == 233) || (c == 237) || (c == 243) || (c == 250) || (c == 253) 315 || (c == 252) || (c == 241)) 316 return true; 317 318 return false; 319 } 320 321 /** 322 * Checks if a {@code String} contains non-Spanish-Language Characters. Utilizes 323 * {@link #isLanguageChar(char)} 324 * 325 * @param s Any {@code String} consisting of ASCII & UniCode Characters 326 * 327 * @return {@code TRUE} only if {@code isLanguageChar(s.charAt(i))} returns {@code TRUE} for 328 * ever integer {@code i}, and <B>FALSE</B> otherwise. 329 * 330 * @see #isLanguageChar(char) 331 */ 332 public static boolean onlyLanguageChars(String s) 333 { 334 for (int i=0; i < s.length(); i++) if (! isLanguageChar(s.charAt(i))) return false; 335 return true; 336 } 337 338 /** 339 * This is a function which identifies Spanish Language Infinitive Form Verbs. 340 * 341 * @param s Any String consisting of ASCII & UniCode Characters 342 * 343 * @return {@code TRUE} if and only if: 344 * <BR />input-parameter {@code 's'} ends with: ar, er, ir, arse, erse, irse, ír, írse 345 * <BR />{@code 's'} passes the {@link #onlyLanguageChars(String)} boolean test 346 * <BR /><B>FALSE</B> otherwise 347 * 348 * @see #onlyLanguageChars(String) 349 */ 350 public static boolean isSpanishVerbInfinitive(String s) 351 { 352 s = toLowerCaseSpanish(s); 353 354 if (onlyLanguageChars(s)) 355 if ( s.endsWith("ar") || s.endsWith("er") || s.endsWith("ir") 356 || s.endsWith("arse") || s.endsWith("erse") || s.endsWith("irse") 357 || s.endsWith("ír") || s.endsWith("írse")) 358 return true; 359 360 return false; 361 } 362 363 private static final String[] ESC_STRS = 364 { 365 "á", "é", "í", "ó", "ú", "Á", "É", 366 "Í", "Ó", "Ú", "ñ", "«", "»", "—", "ü", 367 "ï", "¡", "¿", """ 368 }; 369 370 private static final char[] REPL_CHARS = 371 { 372 'á', 'é', 'í', 'ó', 'ú', 'Á', 'É', 'Í', 'F', 'Ú', 'ñ', '«', '»', '-', 'ü', 'ï', '¡', 373 '¿', '\"' 374 }; 375 376 /** 377 * This function is somewhat redundant, as a complete HTML-Character Escape-Sequence class is 378 * included in the Torello.HTML package. There is a link provided to these methods at the end 379 * of this comment. This method was written much earlier, and functions well, but it can only 380 * convert HTML-Escape-Sequences that are used in Spanish - rather than all HTML-Character 381 * Escape-Sequences. Here is the complete list: 382 * 383 * <BR /><BR /><TABLE> 384 * <TR> 385 * <TD>&aacute;</TD><TD>⇒ á</TD></TR><TR><TD>&eacute;</TD> 386 * <TD>⇒ é</TD> 387 * </TR> 388 * <TR> 389 * <TD>&iacute;</TD><TD>⇒ í</TD></TR><TR><TD>&oacute;</TD> 390 * <TD>⇒ ó</TD> 391 * </TR> 392 * <TR> 393 * <TD>&uacute;</TD><TD>⇒ ú</TD></TR><TR><TD>&Aacute;</TD> 394 * <TD>⇒ Á</TD> 395 * </TR> 396 * <TR> 397 * <TD>&Eacute;</TD><TD>⇒ É</TD></TR><TR><TD>&Iacute;</TD> 398 * <TD>⇒ Í</TD> 399 * </TR> 400 * <TR> 401 * <TD>&Oacute;</TD><TD>⇒ Ó</TD></TR><TR><TD>&Uacute;</TD> 402 * <TD>⇒ Ú</TD> 403 * </TR> 404 * <TR> 405 * <TD>&ntilde;</TD><TD>⇒ ñ</TD></TR><TR><TD>&laquo;</TD> 406 * <TD>⇒ «</TD> 407 * </TR> 408 * <TR> 409 * <TD>&raquo; </TD><TD>⇒ »</TD></TR><TR><TD>&mdash;</TD> 410 * <TD>⇒ -</TD> 411 * </TR> 412 * <TR> 413 * <TD>&uuml; </TD><TD>⇒ ü</TD></TR><TR><TD>&iuml;</TD> 414 * <TD>⇒ ï</TD> 415 * </TR> 416 * <TR> 417 * <TD>&iexcl; </TD><TD>⇒ ¡</TD></TR><TR><TD>&iquest;</TD> 418 * <TD>⇒ ¿</TD> 419 * </TR> 420 * <TR> 421 * <TD>&quot;</TD> 422 * <TD>⇒ "</TD> 423 * </TR> 424 * </TABLE> 425 * 426 * @param s Any ASCII/UniCode String, which ostensibly ought to (possibly) contain 427 * Spanish-Language HTML-Escaped characters within them. 428 * 429 * @return A string where all HTML escape-sequences have been converted to their actual 430 * character equivalent. 431 * 432 * @see Torello.HTML.Escape#escHTMLToChar(String) 433 * @see Torello.HTML.Escape#htmlEsc(char) 434 * @see StrReplace#r(String, String[], char[]) 435 */ 436 public static String convertHTML_TO_UTF8(String s) 437 { return StrReplace.r(s, ESC_STRS, REPL_CHARS); } 438 439 440 //********************************************************************************************* 441 //********************************************************************************************* 442 443 /** 444 * This is some "list processing" stuff - used to grep "DictCC". It's an internally used 445 * list. 446 */ 447 private static Vector<String> removeList = null; 448 449 /** 450 * This just stores a list of "words", and they are removed from certain texts/articles. This 451 * program currently uses it to remove certain extremely commonly used words, so they are not 452 * repeatedly searched for in the dictionary. It is <I>kind of</I> a hack. 453 * 454 * @param wordList An array of Strings. It is expected to be a list of words that may be 455 * removed from Spanish Texts, but it can be any list of words. It is checked to see if 100% 456 * of the characters in each word are alphabetic, and throws an IllegalArgumentException if they 457 * are not. 458 * 459 * @throws IllegalArgumentException if the wordList parameter contains strings with invalid 460 * non-word characters. 461 */ 462 public static void setRemoveWordsArr(String[] wordList) 463 { 464 removeList = new Vector<String>(); 465 466 for (int i=0; i < wordList.length; i++) 467 { 468 String word = wordList[i]; 469 470 for (int j=0; j < word.length(); j++) 471 472 if (! isLanguageChar(word.charAt(j))) throw new IllegalArgumentException( 473 "Contains word:" + word + " which has invalid, non-word, language-characters"); 474 475 removeList.addElement(word); 476 } 477 } 478 479 /** 480 * This function references the words in the "removeList" and removes every occurence of each 481 * word that is present in the "removeList" {@code Vector<String>} 482 * 483 * @param s A String of Spanish Words. 484 * 485 * @return The same string with each instance of each word that is listed in the "removeList" 486 * {@code Vector} removed from the {@code String} 487 * 488 * @see #setRemoveWordsArr(String[]) 489 */ 490 public static String removeWords(String s) 491 { 492 // boolean printIt = false; 493 // int tpos = s.indexOf(" a "); 494 // if (tpos != -1) if (s.indexOf(" a ", tpos + 3) != -1) printIt = true; 495 // if (printIt) System.out.println(s + ":"); 496 497 Enumeration<String> e = removeList.elements(); 498 // System.out.println("CLEANING: [" + s + "]"); 499 500 while (e.hasMoreElements()) 501 { 502 String lc = toLowerCaseSpanish(s); 503 504 // System.out.print(" <" + lc + ">"); 505 String word = e.nextElement(); 506 507 // System.out.print(" {" + word + "}"); 508 509 int pos = 0; 510 while ((pos = lc.indexOf(word, pos)) != -1) 511 { 512 int startPos = pos; 513 int endPos = pos + word.length(); 514 boolean leftEnd = (startPos == 0); 515 boolean rightEnd = (endPos == lc.length()); 516 char leftChar = leftEnd ? 0 : lc.charAt(startPos - 1); 517 char rightChar = rightEnd ? 0 : lc.charAt(endPos); 518 519 // if (printIt) System.out.print("(" + leftChar + "," + rightChar + "," + leftEnd + 520 // "," + rightEnd + "," + startPos + "," + endPos + ") "); 521 522 if (isLanguageChar(leftChar)) { pos = endPos; continue; } 523 if (isLanguageChar(rightChar)) { pos = endPos; continue; } 524 525 // System.out.print("(" + startPos + "," + endPos + ")" ); 526 boolean leftSpace = (leftChar == ' '); 527 boolean rightSpace = (rightChar == ' '); 528 529 if (leftSpace && rightSpace) startPos--; 530 else if (leftSpace && rightEnd) startPos--; 531 else if (leftEnd && rightSpace) endPos++; 532 533 s = (leftEnd ? "" : s.substring(0, startPos)) + 534 (rightEnd ? "" : s.substring(endPos)); 535 536 // if (printIt) System.out.print("[" + s + "] "); 537 lc = toLowerCaseSpanish(s); 538 } 539 } 540 541 // if (printIt) System.out.println("\n"); 542 return s; 543 } 544}