// ES.java.html

package Torello.Languages;

import java.util.*;

import Torello.Java.*;

/**
 * Some simple String Utilities for helping parse (Español) Spanish <CODE>String's</CODE>.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ES>
 */
public class ES
{
    // This class only exposes static members; the private constructor keeps it from being
    // instantiated from outside the class.
    private ES() { }

    /**
     * Accent-direction flag, occupying the lowest bit (bit 0) of a {@code flags} mask.  When this
     * bit is set, a <B>grave</B> accent is requested; when it is clear, an <B>acute</B> accent is
     * requested.
     */
    public static final int GRAVE = 1 << 0;

    /**
     * Letter-case flag, occupying the second bit (bit 1) of a {@code flags} mask.  When this bit
     * is set, an <B>upper-case</B> letter is requested; when it is clear, a <B>lower-case</B>
     * letter is requested.
     */
    public static final int UPPERCASE = 1 << 1;

    /**
     * Builds the accented form of a plain vowel.  Every character that this method can return is
     * listed below, with its character-code in parentheses.
     *
     * <BR /><BR /><TABLE BORDER='1'><TBODY>
     * <TR><TH>Upper, Grave</TH><TH>Upper, Acute</TH><TH>Lower, Grave</TH><TH>Lower, Acute</TH></TR>
     * <TR><TD>&Agrave; (192)</TD><TD>&Aacute; (193)</TD>
     *     <TD>&agrave; (224)</TD><TD>&aacute; (225)</TD></TR>
     * <TR><TD>&Egrave; (200)</TD><TD>&Eacute; (201)</TD>
     *     <TD>&egrave; (232)</TD><TD>&eacute; (233)</TD></TR>
     * <TR><TD>&Igrave; (204)</TD><TD>&Iacute; (205)</TD>
     *     <TD>&igrave; (236)</TD><TD>&iacute; (237)</TD></TR>
     * <TR><TD>&Ograve; (210)</TD><TD>&Oacute; (211)</TD>
     *     <TD>&ograve; (242)</TD><TD>&oacute; (243)</TD></TR>
     * <TR><TD>&Ugrave; (217)</TD><TD>&Uacute; (218)</TD>
     *     <TD>&ugrave; (249)</TD><TD>&uacute; (250)</TD></TR>
     * </TBODY></TABLE>
     *
     * @param vowel Any vowel: [A, E, I, O, U] or [a, e, i, o, u].  The case of this argument is
     * ignored; the case of the result is chosen by {@code 'flags'}.
     * <BR /><BR />If {@code 'vowel'} is not one of these 10 choices, this method returns
     * {@code (char) 0}.
     *
     * @param flags Any bitwise-OR combination of {@link #GRAVE} and {@link #UPPERCASE}.
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> If {@link #GRAVE} is not set (bit 0), an acute-accented vowel is returned
     *      (acute is "the default").
     *      </LI>
     * <LI> If {@link #UPPERCASE} is not set (bit 1), a lower-case vowel is returned
     *      (lower-case is "the default").
     *      </LI>
     * </UL>
     *
     * @return For a valid vowel, one of the twenty characters in the table above; otherwise
     * {@code (char) 0}.
     */
    public static char getAccentedVowel(char vowel, int flags)
    {
        // Character-code of the UPPER-CASE, GRAVE form of the requested vowel:
        // 192, 200, 204, 210 and 217 for A, E, I, O and U respectively.
        final int upperGrave;

        switch (vowel)
        {
            case 'a': case 'A': upperGrave = 192; break;
            case 'e': case 'E': upperGrave = 200; break;
            case 'i': case 'I': upperGrave = 204; break;
            case 'o': case 'O': upperGrave = 210; break;
            case 'u': case 'U': upperGrave = 217; break;
            default:            return (char) 0;
        }

        final boolean upper = (flags & UPPERCASE) != 0;
        final boolean grave = (flags & GRAVE) != 0;

        // In the table above, each lower-case form is 32 above its upper-case form, and each
        // acute form is 1 above its grave form.
        final int caseOffset    = upper ? 0 : 32;
        final int accentOffset  = grave ? 0 : 1;

        return (char) (upperGrave + caseOffset + accentOffset);
    }


    /**
     * Strips the accent (or tilde / diaeresis) from a single Spanish character.  By default the
     * result is also lower-cased; pass {@code TRUE} to {@code 'preserveCase'} to keep upper-case
     * input in upper-case.
     *
     * <BR /><BR /><TABLE>
     * <TR><TD>A (65) ... Z (90)</TD><TD>&rArr; a .. z (or unchanged)</TD></TR>
     * <TR><TD>&Agrave; (192), &Aacute; (193), &agrave; (224), &aacute; (225)</TD>
     *     <TD>&rArr; A or a</TD></TR>
     * <TR><TD>&Egrave; (200), &Eacute; (201), &egrave; (232), &eacute; (233)</TD>
     *     <TD>&rArr; E or e</TD></TR>
     * <TR><TD>&Igrave; (204), &Iacute; (205), &igrave; (236), &iacute; (237)</TD>
     *     <TD>&rArr; I or i</TD></TR>
     * <TR><TD>&Ograve; (210), &Oacute; (211), &ograve; (242), &oacute; (243)</TD>
     *     <TD>&rArr; O or o</TD></TR>
     * <TR><TD>&Ugrave; (217), &Uacute; (218), &ugrave; (249), &uacute; (250)</TD>
     *     <TD>&rArr; U or u</TD></TR>
     * <TR><TD>&Ntilde; (209), &ntilde; (241)</TD><TD>&rArr; N or n</TD></TR>
     * <TR><TD>&Uuml; (220), &uuml; (252)</TD><TD>&rArr; U or u</TD></TR>
     * <TR><TD>&Yacute; (221), &yacute; (253)</TD><TD>&rArr; Y or y</TD></TR>
     * </TABLE>
     *
     * @param c Any ASCII/UniCode character
     *
     * @param preserveCase When {@code TRUE}, upper-case input (plain or accented) produces an
     * upper-case result.  When {@code FALSE}, the result is always lower-case.  Lower-case input
     * is never up-cased.
     *
     * @return The non-accented equivalent of {@code 'c'}.  Characters that are neither listed in
     * the table above nor in the range {@code 'A' .. 'Z'} are returned unchanged.
     */
    public static char toNonAccented(char c, boolean preserveCase)
    {
        switch (c)
        {
            // Lower-case accented letters: the answer does not depend on 'preserveCase'
            case 224: case 225:             return 'a';
            case 232: case 233:             return 'e';
            case 236: case 237:             return 'i';
            case 242: case 243:             return 'o';
            case 249: case 250: case 252:   return 'u';
            case 241:                       return 'n';
            case 253:                       return 'y';

            // Upper-case accented letters: stay upper-case only when asked to
            case 192: case 193:             return preserveCase ? 'A' : 'a';
            case 200: case 201:             return preserveCase ? 'E' : 'e';
            case 204: case 205:             return preserveCase ? 'I' : 'i';
            case 210: case 211:             return preserveCase ? 'O' : 'o';
            case 217: case 218: case 220:   return preserveCase ? 'U' : 'u';
            case 209:                       return preserveCase ? 'N' : 'n';
            case 221:                       return preserveCase ? 'Y' : 'y';

            default:                        break;
        }

        // Anything left is either a plain upper-case letter that must be down-cased, or a
        // character that passes through untouched.
        final boolean plainUpper = (c >= 'A') && (c <= 'Z');

        if (plainUpper && (! preserveCase)) return (char) (c - 'A' + 'a');

        return c;
    }

    /**
     * Strips Spanish accents from every character of a {@code String}.
     *
     * @param s Any {@code String}
     *
     * @param preserveCase Forwarded, unchanged, to {@link #toNonAccented(char, boolean)} for each
     * character.
     *
     * @return A new {@code String} of the same length, in which each character of {@code 's'} has
     * been replaced by the result of {@code toNonAccented(s.charAt(i), preserveCase)}.
     *
     * @see #toNonAccented(char, boolean)
     */
    public static String toNonAccented(String s, boolean preserveCase)
    {
        StringBuilder out = new StringBuilder();

        for (char ch : s.toCharArray()) out.append(toNonAccented(ch, preserveCase));

        return out.toString();
    }

    /**
     * Converts an <I>upper-case Spanish Character</I> to its <I>lower-case</I> form, and returns
     * every other character unchanged.  In addition to the plain letters {@code 'A' .. 'Z'}, this
     * method handles Spanish vowels and consonants that carry:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>accent marks:    &Agrave;, &Aacute; ... etc.</LI>
     * <LI>a diaeresis:     &Uuml;</LI>
     * <LI>a tilde:         &Ntilde;</LI>
     * </UL>
     *
     * <BR /><B>NOTE:</B> Grave accents are not commonly seen in present-day Spanish text, where
     * the acute accent is the usual one.  They are handled here for completeness.
     *
     * @param c Any ASCII or UniCode {@code char}
     *
     * @return Upper-case letters {@code 'A' .. 'Z'} are converted to {@code 'a' .. 'z'}
     *
     * <BR /><BR />AND:
     *
     * <BR /><BR /><TABLE>
     * <TR><TD>&Agrave; (192), &Aacute; (193)</TD><TD>&rArr; &agrave; (224), &aacute; (225)</TD></TR>
     * <TR><TD>&Egrave; (200), &Eacute; (201)</TD><TD>&rArr; &egrave; (232), &eacute; (233)</TD></TR>
     * <TR><TD>&Igrave; (204), &Iacute; (205)</TD><TD>&rArr; &igrave; (236), &iacute; (237)</TD></TR>
     * <TR><TD>&Ograve; (210), &Oacute; (211)</TD><TD>&rArr; &ograve; (242), &oacute; (243)</TD></TR>
     * <TR><TD>&Ugrave; (217), &Uacute; (218)</TD><TD>&rArr; &ugrave; (249), &uacute; (250)</TD></TR>
     * <TR><TD>&Ntilde; (209)</TD><TD>&rArr; &ntilde; (241)</TD></TR>
     * <TR><TD>&Yacute; (221)</TD><TD>&rArr; &yacute; (253)</TD></TR>
     * <TR><TD>&Uuml; (220)</TD><TD>&rArr; &uuml; (252)</TD></TR>
     * </TABLE>
     *
     * <BR />Any other character is returned as-is.
     *
     * @see #toUpperCaseSpanish(char)
     * @see #toLowerCaseSpanish(String)
     */
    public static char toLowerCaseSpanish(char c)
    {
        switch (c)
        {
            case 192: case 193:     // A with grave / acute
            case 200: case 201:     // E with grave / acute
            case 204: case 205:     // I with grave / acute
            case 210: case 211:     // O with grave / acute
            case 217: case 218:     // U with grave / acute
            case 209:               // N with tilde
            case 220:               // U with diaeresis
            case 221:               // Y with acute
                // Each of these has its lower-case form exactly 32 positions higher
                return (char) (c + 32);

            default:
                return ((c >= 'A') && (c <= 'Z')) ? (char) (c + ('a' - 'A')) : c;
        }
    }

    /**
     * Lower-cases a whole {@code String}, including upper-case letters that carry accent marks,
     * tildes or a diaeresis.  Characters that have no lower-case form in
     * {@link #toLowerCaseSpanish(char)} (punctuation, digits, white-space, etc.) are copied
     * unchanged.
     *
     * @param s Any {@code String}
     *
     * @return A new {@code String}, of the same length as {@code 's'}, in which
     * {@link #toLowerCaseSpanish(char)} has been applied to each character.
     *
     * @see #toLowerCaseSpanish(char)
     */
    public static String toLowerCaseSpanish(String s)
    {
        final int       len     = s.length();
        final char[]    chars   = new char[len];

        for (int i=0; i < len; i++) chars[i] = toLowerCaseSpanish(s.charAt(i));

        return new String(chars);
    }




    /**
     * Converts a <I>lower-case Spanish Character</I> to its <I>upper-case</I> form, and returns
     * every other character unchanged.  This is the inverse of {@code toLowerCaseSpanish(char)}.
     *
     * @param c Any ASCII or UniCode char
     *
     * @return Lower-case letters {@code 'a' .. 'z'} are converted to {@code 'A' .. 'Z'}
     *
     * <BR /><BR />AND:
     *
     * <BR /><BR /><TABLE>
     * <TR><TD>&agrave; (224), &aacute; (225)</TD><TD>&rArr; &Agrave; (192), &Aacute; (193)</TD></TR>
     * <TR><TD>&egrave; (232), &eacute; (233)</TD><TD>&rArr; &Egrave; (200), &Eacute; (201)</TD></TR>
     * <TR><TD>&igrave; (236), &iacute; (237)</TD><TD>&rArr; &Igrave; (204), &Iacute; (205)</TD></TR>
     * <TR><TD>&ograve; (242), &oacute; (243)</TD><TD>&rArr; &Ograve; (210), &Oacute; (211)</TD></TR>
     * <TR><TD>&ugrave; (249), &uacute; (250)</TD><TD>&rArr; &Ugrave; (217), &Uacute; (218)</TD></TR>
     * <TR><TD>&ntilde; (241)</TD><TD>&rArr; &Ntilde; (209)</TD></TR>
     * <TR><TD>&yacute; (253)</TD><TD>&rArr; &Yacute; (221)</TD></TR>
     * <TR><TD>&uuml; (252)</TD><TD>&rArr; &Uuml; (220)</TD></TR>
     * </TABLE>
     *
     * <BR />Any other character is returned as-is.
     *
     * @see #toLowerCaseSpanish(char)
     * @see #toUpperCaseSpanish(String)
     */
    public static char toUpperCaseSpanish(char c)
    {
        switch (c)
        {
            case 224: case 225:     // a with grave / acute
            case 232: case 233:     // e with grave / acute
            case 236: case 237:     // i with grave / acute
            case 242: case 243:     // o with grave / acute
            case 249: case 250:     // u with grave / acute
            case 241:               // n with tilde
            case 252:               // u with diaeresis
            case 253:               // y with acute
                // Each of these has its upper-case form exactly 32 positions lower
                return (char) (c - 32);

            default:
                return ((c >= 'a') && (c <= 'z')) ? (char) (c - ('a' - 'A')) : c;
        }
    }

    /**
     * Upper-cases a whole {@code String}, including lower-case letters that carry accent marks,
     * tildes or a diaeresis.  Characters that have no upper-case form in
     * {@link #toUpperCaseSpanish(char)} (punctuation, digits, white-space, etc.) are copied
     * unchanged.
     *
     * @param s Any {@code String}
     *
     * @return A new {@code String}, of the same length as {@code 's'}, in which
     * {@link #toUpperCaseSpanish(char)} has been applied to each character.
     *
     * @see #toUpperCaseSpanish(char)
     */
    public static String toUpperCaseSpanish(String s)
    {
        final int       len = s.length();
        StringBuilder   ret = new StringBuilder(len);

        // This must be the UPPER-case converter.  Invoking the lower-case converter here would
        // make this method return a lower-cased String.
        for (int i=0; i < len; i++) ret.append(toUpperCaseSpanish(s.charAt(i)));

        return ret.toString();
    }


    /**
     * Checks whether a character is one of the letters this class treats as belonging to the
     * Spanish Language.
     *
     * @param c Any ASCII or Uni-Code Character
     *
     * @return <B>TRUE</B> if and only if {@code 'c'} belongs to one of the following char-sets:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>a ... z</LI>
     * <LI>A ... Z</LI>
     * <LI>&Aacute; (193), &Eacute; (201), &Iacute; (205), &Oacute; (211), &Uacute; (218),
     *     &Yacute; (221), &Uuml; (220), &Ntilde; (209)</LI>
     * <LI>&aacute; (225), &eacute; (233), &iacute; (237), &oacute; (243), &uacute; (250),
     *     &yacute; (253), &uuml; (252), &ntilde; (241)</LI>
     * </UL>
     *
     * <BR />and <B>FALSE</B> otherwise.  Note that grave-accented vowels are <B>not</B> in this
     * list.
     */
    public static boolean isLanguageChar(char c)
    {
        switch (c)
        {
            // Upper-case: acute vowels, Y-acute, U-diaeresis, N-tilde
            case 193: case 201: case 205: case 211: case 218: case 221: case 220: case 209:

            // Lower-case: acute vowels, y-acute, u-diaeresis, n-tilde
            case 225: case 233: case 237: case 243: case 250: case 253: case 252: case 241:
                return true;

            default:
                return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
        }
    }
    
    /**
     * Checks that a {@code String} is made up exclusively of Spanish-Language Characters, as
     * decided by {@link #isLanguageChar(char)}.
     *
     * @param s Any {@code String} consisting of ASCII &amp; UniCode Characters
     *
     * @return {@code TRUE} only if {@code isLanguageChar(s.charAt(i))} returns {@code TRUE} for
     * every index {@code i} of {@code 's'}, and <B>FALSE</B> otherwise.  An empty {@code String}
     * yields {@code TRUE}.
     *
     * @see #isLanguageChar(char)
     */
    public static boolean onlyLanguageChars(String s)
    {
        for (char ch : s.toCharArray())
            if (! isLanguageChar(ch))
                return false;

        return true;
    }

    /**
     * Identifies words that have the shape of a Spanish Language Infinitive-Form Verb.  The test
     * is purely textual: the word is lower-cased with {@link #toLowerCaseSpanish(String)}, and
     * then inspected.
     *
     * @param s Any String consisting of ASCII &amp; UniCode Characters
     *
     * @return {@code TRUE} if and only if <B>both</B> of these hold:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code 's'} passes the {@link #onlyLanguageChars(String)} boolean test</LI>
     * <LI>{@code 's'}, ignoring case, ends with one of: ar, er, ir, arse, erse, irse,
     *     &iacute;r, &iacute;rse</LI>
     * </UL>
     *
     * <BR /><B>FALSE</B> otherwise
     *
     * @see #onlyLanguageChars(String)
     */
    public static boolean isSpanishVerbInfinitive(String s)
    {
        final String lower = toLowerCaseSpanish(s);

        // A single non-letter (space, digit, punctuation ...) disqualifies the word
        if (! onlyLanguageChars(lower)) return false;

        return  lower.endsWith("ar")    || lower.endsWith("er")     || lower.endsWith("ir")
            ||  lower.endsWith("arse")  || lower.endsWith("erse")   || lower.endsWith("irse")
            ||  lower.endsWith("ír")    || lower.endsWith("írse");
    }

    // The HTML escape-sequences that convertHTML_TO_UTF8(String) looks for.  This array is passed
    // to StrReplace.r(...) together with REPL_CHARS.  The Javadoc of convertHTML_TO_UTF8 documents
    // the mapping position-by-position, so the two arrays are expected to have the same length
    // and ordering - keep them in sync when editing either one.
    private static final String[] ESC_STRS =
    {
        "&aacute;", "&eacute;", "&iacute;", "&oacute;", "&uacute;", "&Aacute;", "&Eacute;",
        "&Iacute;", "&Oacute;", "&Uacute;", "&ntilde;", "&laquo;", "&raquo;", "&mdash;", "&uuml;",
        "&iuml;", "&iexcl;", "&iquest;", "&quot;"
    };

    // The replacement characters that convertHTML_TO_UTF8(String) passes to StrReplace.r(...),
    // listed in the same order as the escape-sequences of ESC_STRS (the matching escape-sequence
    // is named beside each entry).  Non-ASCII characters are written as Unicode escapes so that
    // the values do not depend on the encoding used to compile this file.
    private static final char[] REPL_CHARS =
    {
        '\u00E1',   // &aacute;
        '\u00E9',   // &eacute;
        '\u00ED',   // &iacute;
        '\u00F3',   // &oacute;
        '\u00FA',   // &uacute;
        '\u00C1',   // &Aacute;
        '\u00C9',   // &Eacute;
        '\u00CD',   // &Iacute;
        '\u00D3',   // &Oacute;  (this entry used to be the letter 'F', which was a mistake)
        '\u00DA',   // &Uacute;
        '\u00F1',   // &ntilde;
        '\u00AB',   // &laquo;
        '\u00BB',   // &raquo;
        '-',        // &mdash;   (deliberately replaced by a plain ASCII hyphen)
        '\u00FC',   // &uuml;
        '\u00EF',   // &iuml;
        '\u00A1',   // &iexcl;
        '\u00BF',   // &iquest;
        '\"'        // &quot;
    };

    /**
     * Replaces the HTML Escape-Sequences that commonly occur in Spanish text with the characters
     * they stand for.  This method is somewhat redundant, since a complete HTML-Character
     * Escape-Sequence class is included in the Torello.HTML package (see the links at the end of
     * this comment).  It was written much earlier, and only knows about the escape-sequences
     * listed here:
     *
     * <BR /><BR /><TABLE>
     * <TR><TD>&amp;aacute;</TD><TD>&rArr; &aacute;</TD></TR>
     * <TR><TD>&amp;eacute;</TD><TD>&rArr; &eacute;</TD></TR>
     * <TR><TD>&amp;iacute;</TD><TD>&rArr; &iacute;</TD></TR>
     * <TR><TD>&amp;oacute;</TD><TD>&rArr; &oacute;</TD></TR>
     * <TR><TD>&amp;uacute;</TD><TD>&rArr; &uacute;</TD></TR>
     * <TR><TD>&amp;Aacute;</TD><TD>&rArr; &Aacute;</TD></TR>
     * <TR><TD>&amp;Eacute;</TD><TD>&rArr; &Eacute;</TD></TR>
     * <TR><TD>&amp;Iacute;</TD><TD>&rArr; &Iacute;</TD></TR>
     * <TR><TD>&amp;Oacute;</TD><TD>&rArr; &Oacute;</TD></TR>
     * <TR><TD>&amp;Uacute;</TD><TD>&rArr; &Uacute;</TD></TR>
     * <TR><TD>&amp;ntilde;</TD><TD>&rArr; &ntilde;</TD></TR>
     * <TR><TD>&amp;laquo;</TD><TD>&rArr; &laquo;</TD></TR>
     * <TR><TD>&amp;raquo;</TD><TD>&rArr; &raquo;</TD></TR>
     * <TR><TD>&amp;mdash;</TD><TD>&rArr; - (a plain hyphen)</TD></TR>
     * <TR><TD>&amp;uuml;</TD><TD>&rArr; &uuml;</TD></TR>
     * <TR><TD>&amp;iuml;</TD><TD>&rArr; &iuml;</TD></TR>
     * <TR><TD>&amp;iexcl;</TD><TD>&rArr; &iexcl;</TD></TR>
     * <TR><TD>&amp;iquest;</TD><TD>&rArr; &iquest;</TD></TR>
     * <TR><TD>&amp;quot;</TD><TD>&rArr; &quot;</TD></TR>
     * </TABLE>
     *
     * @param s Any ASCII/UniCode String, which may contain Spanish-Language HTML-Escaped
     * characters.
     *
     * @return The result of handing {@code 's'}, the list of escape-sequences and the list of
     * replacement characters to {@code StrReplace.r}: a {@code String} in which the
     * escape-sequences above have been converted to their actual character equivalents.
     *
     * @see Torello.HTML.Escape#escHTMLToChar(String)
     * @see Torello.HTML.Escape#htmlEsc(char)
     * @see StrReplace#r(String, String[], char[])
     */
    public static String convertHTML_TO_UTF8(String s)
    {
        return StrReplace.r(s, ESC_STRS, REPL_CHARS);
    }


    //*********************************************************************************************
    //*********************************************************************************************

    /**
     * The internally used list of words that {@link #removeWords(String)} strips out of its
     * input.  This is some "list processing" stuff - used to grep "DictCC".  The field is
     * {@code null} until {@link #setRemoveWordsArr(String[])} assigns it.
     */
    private static Vector<String> removeList = null;

    /**
     * Installs the list of "words" that {@link #removeWords(String)} will strip out of texts and
     * articles.  It is currently used to drop certain extremely common words, so that they are
     * not repeatedly searched for in the dictionary.  It is <I>kind of</I> a hack.
     *
     * <BR /><BR />The list is validated in full <B>before</B> it is installed: if any word is
     * rejected, the previously installed list (if any) is left exactly as it was.
     *
     * @param wordList An array of Strings.  It is expected to be a list of words that may be
     * removed from Spanish Texts, but it can be any list of words.  Every character of every
     * word must pass the {@link #isLanguageChar(char)} test.
     *
     * @throws IllegalArgumentException if the wordList parameter contains a word having at least
     * one character that is not a language-character.
     */
    public static void setRemoveWordsArr(String[] wordList)
    {
        // Collect into a local list first, so that a validation failure half-way through the
        // array cannot leave a partially-filled list installed.
        Vector<String> validated = new Vector<String>();

        for (String word : wordList)
        {
            for (int j=0; j < word.length(); j++)

                if (! isLanguageChar(word.charAt(j))) throw new IllegalArgumentException(
                    "Contains word:" + word + " which has invalid, non-word, language-characters");

            validated.addElement(word);
        }

        // Every word passed: publish the new list.
        removeList = validated;
    }

    /**
     * Removes, from a {@code String}, every whole-word occurrence of each word in the list
     * installed by {@link #setRemoveWordsArr(String[])}.
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> Matching ignores case: both the text and the listed words are lower-cased with
     *      {@link #toLowerCaseSpanish(String)} before they are compared.  The characters that
     *      remain in the result keep their original case.
     *      </LI>
     * <LI> An occurrence only counts as a "whole word" when neither the character before it nor
     *      the character after it passes {@link #isLanguageChar(char)}.  The beginning and the
     *      end of the {@code String} count as word boundaries.
     *      </LI>
     * <LI> One adjacent space is removed along with the word, so that no doubled or dangling
     *      space is left behind: the space to the <I>left</I> when the word is surrounded by
     *      spaces, or is preceded by a space and ends the {@code String}; the space to the
     *      <I>right</I> when the word begins the {@code String} and is followed by a space.
     *      Next to any other neighbor (punctuation, for instance) only the word itself is
     *      removed.
     *      </LI>
     * <LI> Empty entries in the word-list are ignored.</LI>
     * </UL>
     *
     * @param s A String of Spanish Words.
     *
     * @return The same string with each instance of each listed word removed.  If no word-list
     * has been installed yet, or if {@code 's'} is null, {@code 's'} itself is returned.
     *
     * @see #setRemoveWordsArr(String[])
     */
    public static String removeWords(String s)
    {
        // Read the field once, so one consistent list is used for the whole call
        final List<String> words = removeList;

        // Nothing configured (or nothing to clean): hand the input straight back
        if ((words == null) || (s == null)) return s;

        // 'text' is the String being edited, and 'lower' is its lower-case twin, used for
        // searching.  Lower-casing never changes the length of a String, so an index into one is
        // always valid for the other, provided both receive the same deletions.
        final StringBuilder text    = new StringBuilder(s);
        final StringBuilder lower   = new StringBuilder(toLowerCaseSpanish(s));

        for (String entry : words)
        {
            // The text is searched in lower-case, so the word must be lower-case as well.
            final String word = toLowerCaseSpanish(entry);

            // An empty word "matches" at every position and removes nothing, which would keep
            // the search loop below from ever finishing.
            if (word.isEmpty()) continue;

            int pos = 0;

            while ((pos = lower.indexOf(word, pos)) != -1)
            {
                int             startPos    = pos;
                int             endPos      = pos + word.length();
                final boolean   leftEnd     = (startPos == 0);
                final boolean   rightEnd    = (endPos == lower.length());
                final char      leftChar    = leftEnd ? 0 : lower.charAt(startPos - 1);
                final char      rightChar   = rightEnd ? 0 : lower.charAt(endPos);

                // The match sits inside a longer word: skip past it
                if (isLanguageChar(leftChar) || isLanguageChar(rightChar))
                {
                    pos = endPos;
                    continue;
                }

                final boolean leftSpace     = (leftChar == ' ');
                final boolean rightSpace    = (rightChar == ' ');

                // Take exactly one neighboring space out together with the word
                if (leftSpace && (rightSpace || rightEnd))  startPos--;
                else if (leftEnd && rightSpace)             endPos++;

                text.delete(startPos, endPos);
                lower.delete(startPos, endPos);

                // 'pos' is intentionally left alone: the search resumes where the removed word
                // used to begin.
            }
        }

        return text.toString();
    }
}