// Verbs.java.html

package Torello.Languages;

import Torello.Java.*;
import Torello.HTML.*;

import Torello.Java.Additional.RemoveUnsupportedIterator;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/**
 * Conjugating Verbs (Spanish).
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=V>
 */
@SuppressWarnings("unchecked")
public class Verbs
{
    /*
     * Package-private diagnostic driver.  Loads all four data-files, then writes the irregular
     * infinitives, the infinitives, and finally the conjugations text to standard output.
     */
    static void main(String[] argv)
    {
        loadDefinitions();
        loadConjugations();
        loadInfinitives();
        loadIrregularInfinitives();

        // Build both comma-separated listings first, then emit them with a single print call.
        final StringBuilder listing = new StringBuilder();

        for (String verb : irregularInfinitives) listing.append(verb).append(", ");
        for (String verb : infinitives) listing.append(verb).append(", ");

        System.out.print(listing);
        System.out.println("\n" + conjugations);
    }

    // Not instantiable: the only constructor is private, and every member of this class is static.
    private Verbs() { }

    // In-memory copies of the data-files.
    //   conjugations:         null until loadConjugations() assigns it; searched by getInfinitive
    //   irregularInfinitives: starts empty; replaced by loadIrregularInfinitives()
    //   infinitives:          starts empty; replaced by loadInfinitives()
    //   definitions:          starts empty; replaced by loadDefinitions(); read by getDefinition
    // Each "release" method sets its field back to null.
    private static String                   conjugations            = null;
    private static TreeSet<String>          irregularInfinitives    = new TreeSet<>();
    private static TreeSet<String>          infinitives             = new TreeSet<>();
    private static TreeMap<String, String>  definitions             = new TreeMap<>();

    // Data-file locations.  R is the directory prefix shared by all of them.  Each name is passed,
    // together with Verbs.class, to one of the LFEC "..._JAR" loader methods.
    //   CONJUGATIONS, IRREG_INFINITIVES, INFINITIVES, DEFINITIONS: used by the load methods above
    //   DEFINITIONS_JS, POPUP_WIN_JS, IRREG_CONJ_JS, POPUP_WIN_CSS: used by nested class WebFiles
    private static final String R                   = "data-files/Spanish/";
    private static final String CONJUGATIONS        = R + "CONJUGATIONS.tmdat"; 
    private static final String IRREG_INFINITIVES   = R + "IRREGULAR_INFINITIVES.tsdat";
    private static final String INFINITIVES         = R + "INFINITIVES.tsdat";
    private static final String DEFINITIONS         = R + "DEFINITIONS.tmdat";
    private static final String DEFINITIONS_JS      = R + "definitions.sdat";
    private static final String POPUP_WIN_JS        = R + "verbs.sdat";
    private static final String IRREG_CONJ_JS       = R + "ir.sdat";
    private static final String POPUP_WIN_CSS       = R + "css.sdat";

    // Cached Runtime handle; fetched on the first call to GC().
    private static Runtime rt = null;

    // Requests a garbage-collection pass.  Used by the "release" methods after they drop a
    // reference to one of the data structures.
    private static void GC()
    {
        if (rt == null)
        {
            rt = Runtime.getRuntime();
        }

        rt.gc();
    }

    /**
     * Reads the conjugations data-file from the JAR and stores its contents in the
     * {@code private static String conjugations} field.  {@link #getInfinitive(String)} searches
     * that {@code String}, so this must be in memory before working with Verb-Spans.
     * 
     * @see LFEC#loadFile_JAR(Class, String)
     */
    public static void loadConjugations()
    {
        final String fileText = LFEC.loadFile_JAR(Verbs.class, CONJUGATIONS);

        conjugations = fileText;
    }

    /**
     * Drops the reference to the (rather large) Java-{@code String} containing the verb
     * conjugations, and then requests a garbage-collection pass via
     * <CODE>Runtime&#46;gc()</CODE>.
     */
    public static void releaseConjugations()
    {
        conjugations = null;

        GC();
    }

    /**
     * Reads the {@code TreeSet} of Irregular Infinitives from the JAR, replacing whatever set
     * this class was holding before.  {@link #isIrregular(String)} consults this set, so it needs
     * to be loaded into memory before working with Verb-Spans.
     * 
     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
     */
    public static void loadIrregularInfinitives()
    {
        final TreeSet<String> loaded = (TreeSet<String>) LFEC.readObjectFromFile_JAR
            (Verbs.class, IRREG_INFINITIVES, true, TreeSet.class);

        irregularInfinitives = loaded;
    }

    /**
     * Releases the memory for the (rather large) <CODE>java&#46;util&#46;TreeSet</CODE> of
     * Irregular Infinitives, and then requests a garbage-collection pass via
     * <CODE>Runtime&#46;gc()</CODE>.
     * 
     * <BR /><BR />Calling this method more than once (or before the set has been re-loaded) is
     * harmless: if the set has already been released, only the garbage-collection request is
     * made.
     */
    public static void releaseIrregularInfinitives()
    {
        final TreeSet<String> released = irregularInfinitives;

        irregularInfinitives = null;

        // The field is already null after an earlier release; there is nothing to clear then.
        if (released != null) released.clear();

        GC();
    }

    /**
     * Reads the complete {@code TreeSet<String>} of known infinitives from the JAR, replacing
     * whatever set this class was holding before.
     * 
     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
     */
    public static void loadInfinitives()
    {
        final TreeSet<String> loaded = (TreeSet<String>) LFEC.readObjectFromFile_JAR
            (Verbs.class, INFINITIVES, true, TreeSet.class);

        infinitives = loaded;
    }

    /**
     * Releases the memory for the {@code TreeSet} of infinitives, and then requests a
     * garbage-collection pass via <CODE>Runtime&#46;gc()</CODE>.
     * 
     * <BR /><BR />Calling this method more than once (or before the set has been re-loaded) is
     * harmless: if the set has already been released, only the garbage-collection request is
     * made.
     */
    public static void releaseInfinitives()
    {
        final TreeSet<String> released = infinitives;

        infinitives = null;

        // The field is already null after an earlier release; there is nothing to clear then.
        if (released != null) released.clear();

        GC();
    }

    /**
     * Reads the definitions data-file - which is typed as a {@code TreeMap<String, String>} -
     * from the JAR, replacing whatever map this class was holding before.
     * {@link #getDefinition(String)} looks verbs up in this map.
     * 
     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
     */
    public static void loadDefinitions()
    {
        final TreeMap<String, String> loaded = (TreeMap<String, String>)
            LFEC.readObjectFromFile_JAR(Verbs.class, DEFINITIONS, true, TreeMap.class);

        definitions = loaded;
    }

    /**
     * Releases the memory for the {@code TreeMap} of definitions, and then requests a
     * garbage-collection pass via <CODE>Runtime&#46;gc()</CODE>.
     * 
     * <BR /><BR />Calling this method more than once (or before the map has been re-loaded) is
     * harmless: if the map has already been released, only the garbage-collection request is
     * made.
     */
    public static void releaseDefinitions()
    {
        final TreeMap<String, String> released = definitions;

        definitions = null;

        // The field is already null after an earlier release; there is nothing to clear then.
        if (released != null) released.clear();

        GC();
    }

    // ********************************************************************************************
    // ******************* Java-Script & CSS Web-Files stored inside the JAR **********************
    // ********************************************************************************************

    /**
     * Retrieves the Java-Script &amp; CSS Web-Files needed with the Spanish Verb Conjugation
     * popup-windows.  Nothing is cached by this class: each method reads its file from the JAR at
     * the moment it is called.
     * 
     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=WEBFJS>
     */
    @Torello.JavaDoc.StaticFunctional
    public static class WebFiles
    {
        private WebFiles() { }

        // All four public accessors perform this same read; only the data-file name differs.
        private static String readStringFromJAR(String dataFileName)
        { return LFEC.readObjectFromFile_JAR(Verbs.class, dataFileName, true, String.class); }

        /**
         * Extracts the <B>"Spanish Verb Definitions"</B> Java-Script Arrays from the Java-HTML
         * JAR File, and returns those arrays as a <CODE>java&#46;lang&#46;String</CODE>.
         * 
         * @return The Java-Script file, as a Java-{@code String}, that contains all verb
         * definitions.  Save this file, transmit it, convert it.  Generally it can be used to
         * make definition pop-up windows in JS.
         * 
         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
         */
        public static String JSgetDefinitionsCode()
        { return readStringFromJAR(DEFINITIONS_JS); }

        /**
         * Extracts the <B>"Verb Conjugations Pop-Up Window"</B> Java-Script Functions from the
         * Java-HTML JAR File, and returns those functions as a
         * <CODE>java&#46;lang&#46;String</CODE>.
         * 
         * @return The Java-Script file (as a {@code String}) that contains the "Verb Popup
         * Window" code.  Save this file, and store it in an accessible directory when you use
         * the {@code 'addSpanishVerbSpans'} method.
         * 
         * <BR /><BR />Save this file to disk, and put it in your HTML
         * {@code <HEADER>...<SCRIPT TYPE="text/javascript">} section!
         * 
         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
         */
        public static String JSgetPopupCode()
        { return readStringFromJAR(POPUP_WIN_JS); }

        /**
         * Extracts the needed <B>CSS Declarations</B> from the Java-HTML JAR File, and returns
         * that CSS as a <CODE>java&#46;lang&#46;String</CODE>.
         * 
         * @return The CSS file (as a {@code String}) for formatting the "Verb Popup Window."
         * If a call to method {@code 'addSpanishVerbSpans'} is used, and the four Web-Files
         * provided by this class (3 Java-Script, and 1 CSS) are included with the output page,
         * then the verb-conjugation pop-up windows are intended to be functioning.
         * 
         * <BR /><BR />Save this file to disk, and include it as a style-sheet of your HTML page.
         * 
         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
         */
        public static String CSSgetCode()
        { return readStringFromJAR(POPUP_WIN_CSS); }

        /**
         * Extracts the <B>"Irregular Spanish Verbs"</B> Java-Script Arrays from the Java-HTML
         * JAR File, and returns those arrays as a <CODE>java&#46;lang&#46;String</CODE>.
         * 
         * @return The last Java-Script file you will need to put a "Verbs Pop-up Window" on your
         * Spanish HTML documents.
         * 
         * <BR /><BR />Save this file to disk, and put it in your HTML
         * {@code <HEADER>...<SCRIPT TYPE="text/javascript">} section!
         * 
         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
         */
        public static String JSgetIrregulars()
        { return readStringFromJAR(IRREG_CONJ_JS); }
    }


    // ********************************************************************************************
    // ********************************************************************************************
 	// View and Inspect the data in the data-files
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Provides read-only iteration over the Spanish Verb Infinitives.  The returned iterator is
     * wrapped in a {@link RemoveUnsupportedIterator}, so items may not be removed via its
     * {@code 'remove()'} method.
     * 
     * @return An iterator of all Spanish Verbs currently held in the infinitives {@code TreeSet}.
     * @see RemoveUnsupportedIterator
     */
    public static Iterator<String> infinitives()
    {
        final Iterator<String> underlying = infinitives.iterator();

        return new RemoveUnsupportedIterator<String>(underlying);
    }

    /**
     * Provides read-only iteration over the Spanish Irregular-Verbs, in Infinitive Form.  The
     * returned iterator is wrapped in a {@link RemoveUnsupportedIterator}, so items may not be
     * removed via its {@code 'remove()'} method.
     * 
     * @return An iterator of all Irregular Spanish Verbs currently held in the
     * irregular-infinitives {@code TreeSet}.
     * 
     * @see RemoveUnsupportedIterator
     */
    public static Iterator<String> irregularInfinitives()
    {
        final Iterator<String> underlying = irregularInfinitives.iterator();

        return new RemoveUnsupportedIterator<String>(underlying);
    }

    /**
     * Looks up the quick-definition of a Spanish Verb.<BR />
     * <B>EXPECTATIONS:</B>
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>The "definitions" data file must already be loaded into memory</LI>
     * <LI>To be precise, {@link #loadDefinitions()} needs to have been called!</LI>
     * 
     * <LI> word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results might be
     *      inaccurate!
     *      </LI>
     * 
     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure.</LI>
     * </UL>
     * 
     * @param infinitiveInLowerCase This may be any Spanish Verb - <B><I>as long as it is in the
     * infinitive form.</I></B>
     * 
     * @return The {@code String} stored as the value for this key in the
     * {@code TreeMap<String, String>} <I>definitions</I>, or null if this infinitive is not
     * found in the dictionary.
     * 
     * @see ES#toLowerCaseSpanish(String)
     */
    public static String getDefinition(String infinitiveInLowerCase)
    {
        final String definition = definitions.get(infinitiveInLowerCase);

        return definition;
    }

    /**
     * Get the infinitive form of a Verb-{@code String}.<BR />
     * <B>EXPECTATIONS:</B>
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>The "conjugations" data file must already be loaded into memory</LI>
     * <LI>To be precise, {@link #loadConjugations()} needs to have been called!</LI>
     * 
     * <LI> word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results might be
     *      inaccurate!
     *      </LI>
     * 
     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure.</LI>
     * </UL>
     * 
     * <BR /><B>HOW THE SEARCH WORKS:</B> The conjugations {@code String} is searched for the
     * text {@code " word,"}.  If that is not found, and the word ends with {@code 'r'}, it is
     * searched for a line that begins with {@code "word:"}.  On a match, the text between the
     * beginning of the matching line and the first {@code ':'} on that line is returned.
     * 
     * @param wordInLowerCase This can be any word (in Spanish... or any language for that matter).
     * 
     * <BR /><BR />It is expected to be a conjugated form of a Spanish verb.  <I>If it
     * is...</I> The original infinitive form of that verb will be returned.
     * 
     * @return <UL CLASS=JDUL>
     * 
     * <LI> Returns the Infinitive of a verb - if the word passed is a direct conjugation of that
     *      verb.
     *      </LI>
     * 
     * <LI> Returns null if there are no matching verbs conjugations in {@code private static
     *      String conjugations}
     *      </LI>
     * 
     * <LI> Returns null if {@code 'wordInLowerCase'} is null or the empty {@code String}</LI>
     * 
     * </UL>
     * 
     * @throws NullPointerException If the conjugations data has not been loaded, or has been
     * released.
     */
    public static String getInfinitive(String wordInLowerCase)
    {
        // An empty word cannot be a verb.  (Asking for its last character used to throw.)
        if ((wordInLowerCase == null) || wordInLowerCase.isEmpty()) return null;

        // GREP through the conjugations data file (stored in String: conjugations)
        int pos = conjugations.indexOf(" " + wordInLowerCase + ",");

        // Not listed as a conjugated form: the word may itself be an infinitive, in which case
        // it starts a line and is followed by a colon.
        if ((pos == -1) && wordInLowerCase.endsWith("r"))
        {
            final String lineHeader = wordInLowerCase + ":";

            // The very first line of the data has no '\n' in front of it.
            if (conjugations.startsWith(lineHeader)) return wordInLowerCase;

            pos = conjugations.indexOf("\n" + lineHeader);
        }

        if (pos == -1) return null;

        // There *WAS* a match in the conjugations data file. - get infinitive and return
        //
        // 'pos' is either the space in front of a conjugated form, or the '\n' in front of an
        // infinitive.  Searching backwards (starting AT 'pos') finds the '\n' which precedes the
        // matching line.  When the match is on the first line, lastIndexOf returns -1, and the
        // line therefore begins at index 0 - so the first letter is no longer chopped off.
        final int lineStart = conjugations.lastIndexOf('\n', pos) + 1;
        final int colon     = conjugations.indexOf(':', lineStart);

        return (colon == -1) ? null : conjugations.substring(lineStart, colon);
    }

    /**
     * Reports whether an infinitive is listed as an irregular verb.
     *
     * <BR /><BR /><B>EXPECTATIONS:</B>
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * 
     * <LI>The "irregular infinitives" data file must already be loaded into memory</LI>
     * <LI>To be precise, {@link #loadIrregularInfinitives()} needs to have been called!</LI>
     *
     * <LI> word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results will be 
     *      inaccurate!
     *      </LI>
     *
     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure</LI>
     * </UL>
     * 
     * @param infinitiveInLowerCase This may be any Spanish Verb - as long as it is in the
     * infinitive form.  This word must have been converted <I>to lower case</I>, and if not, it
     * will likely not be found (and <I>FALSE</I> is returned).
     * 
     * @return <I>TRUE</I> if this verb is contained by the set of irregular-verbs, and
     * <I>FALSE</I> otherwise.
     * 
     * @see ES#toLowerCaseSpanish(String)
     */
    public static boolean isIrregular(String infinitiveInLowerCase)
    {
        final boolean listed = irregularInfinitives.contains(infinitiveInLowerCase);

        return listed;
    }


 	// *************** HTML TOKEN AND PUNCTUATION REMOVAL from Vocabulary Strings *****************
    // The pattern is assembled from two character classes, shown here WITHOUT the extra
    // back-slashes and quote-escapes that a Java String-literal requires:
    //
    //     PUNCTUATION: [,-:;'"¿?¡!“”&@\.\?\$\(\)\s]*
    //     LANGUAGE:    [a-zA-ZáéíóúüñýÁÉÍÓÚÜÑÝ]*?         (reluctant quantifier)
    //
    // NOTE: Inside a character class, ",-:" is a RANGE (comma through colon).  In addition to
    //       ',' and ':' it therefore also accepts '-', '.', '/' and the digits '0' through '9'.
    // ********************************************************************************************
    // Match Groups:
    //     Group 1: The ORIGINAL String - the whole input, since the pattern is anchored by ^ and $
    //     Group 2: The "Prefix" - a.k.a. the "stuff" before the Spanish-Text
    //     Group 3: The Spanish-text part (expected to be a Spanish Word - but not guaranteed)
    //     Group 4: The "Suffix" - a.k.a. the "stuff" after the Spanish-Word/Text-token
    //
    // An input with a non-LANGUAGE character sitting between two letters does not match at all.
    // ********************************************************************************************
    private static final String     PUNCTUATION = "([,-:;\'\"¿?¡!“”&@\\.\\?\\$\\(\\)\\s]*)";
    private static final String     LANGUAGE    = "([a-zA-ZáéíóúüñýÁÉÍÓÚÜÑÝ]*?)"; 
    private static final String     RE1			= "(^" + PUNCTUATION + LANGUAGE + PUNCTUATION + "$)";
    private static final Pattern    P1			= Pattern.compile(RE1, Pattern.CASE_INSENSITIVE);
    // ********************************************************************************************

    /**
     * Words that are meant to be passed over, rather than identified as verbs.
     * 
     * <BR /><BR />This software is not perfect - Human Language is a new order of issues.  The
     * words in this array are extremely common in Spanish and look just like conjugated verbs,
     * but usually, in about 80% to 90% of cases, aren't verbs.  A "Lexical Analysis" could
     * probably figure out much better when a word is guaranteed to be a verb, but for now the
     * lower-cased form of each word is simply compared against this list by
     * {@link #addSpanishVerbSpans(String, TreeSet, TreeSet, TreeSet)}.
     * 
     * <BR /><BR /><B>NOTE:</B> You may change this at your discretion, just re-assign the array.
     * The entries are compared using {@code String.equals}, so they should be in lower-case.
     */
    // When these words are found in the newspaper, don't include them at all!
    public static String[] skip =
    { "como", "casa", "para", "uno", "una", "cosa", "nada", "entre", "dallas" };

    /**
     * Runs {@link #addSpanishVerbSpans(String, TreeSet, TreeSet, TreeSet)} on the text of each
     * {@code TextNode} found in the page {@code Vector}, and replaces that node - in place - with
     * the nodes which that method returns.
     * 
     * @param page The vectorized-html page to modify.  Nodes which are not instances of
     * {@code TextNode} are left untouched.
     * 
     * @param regularVerbsFound If this parameter isn't null, than any and all regular verbs found
     * within  the text will be added to this {@code TreeSet}.  If this parameter is null, it will
     * be ignored.
     * 
     * @param irregularVerbsFound If this parameter isn't null, than any irregular-verbs found in
     * this text will be added to this {@code TreeSet}.  If this parameter is null, it will be
     * ignored.
     * 
     * @param wordsNotFound All words that are found, and aren't verbs are entered into this
     * {@code TreeSet}, if this  parameter is not null.  If this parameter is null, it will be
     * ignored.
     * 
     * @see #addSpanishVerbSpans(String, TreeSet, TreeSet, TreeSet)
     */
    public static void addSpanishVerbSpans( 
            Vector<HTMLNode> page,
            TreeSet<String> regularVerbsFound,
            TreeSet<String> irregularVerbsFound,
            TreeSet<String> wordsNotFound
        )
    {
        int index = 0;

        while (index < page.size())
        {
            final HTMLNode node = page.elementAt(index);

            // Only text is inspected; every other kind of node is stepped over.
            if (! (node instanceof TextNode)) { index++; continue; }

            final Vector<HTMLNode> replacement = addSpanishVerbSpans
                (node.str, regularVerbsFound, irregularVerbsFound, wordsNotFound);

            // Swap the single text-node for the (possibly longer) list of nodes built from it
            page.removeElementAt(index);
            page.addAll(index, replacement);

            // Resume with the first node AFTER the ones that were just inserted, so that the
            // inserted nodes are never themselves re-processed.
            index += replacement.size();
        }
    }

    /**
     * The purpose of this method is to go through the Spanish Verbs in a piece of text, and
     * surround them with HTML {@code <SPAN>} elements to facilitate Verb-Conjugation
     * Popup-Windows.
     * 
     * <BR /><BR />The text is split on the space-character, and each resulting token is
     * inspected on its own.  A token that is identified as a conjugated verb is re-written as
     * {@code <SPAN CLASS="RV" DATA-V="infinitive">word</SPAN>} (regular verbs) or
     * {@code <SPAN CLASS="IV" DATA-V="infinitive">word</SPAN>} (irregular verbs), with its
     * leading and trailing punctuation kept outside of the {@code <SPAN>}.  Every other token -
     * including the words listed in {@link #skip} - is copied to the output unchanged.  A single
     * new-line is appended to the end of the text.
     * 
     * @param text The text to search for Spanish Verbs.
     * 
     * @param regularVerbsFound If this parameter isn't null, than any and all regular verbs found
     * within  the text will be added to this {@code TreeSet}.  If this parameter is null, it will
     * be ignored.
     * 
     * @param irregularVerbsFound If this parameter isn't null, than any irregular-verbs found in
     * this text  will be added to this {@code TreeSet}.  If this parameter is null, it will be
     * ignored.
     * 
     * @param wordsNotFound All words that are found, and aren't verbs are entered (in lower-case)
     * into this {@code TreeSet}, if this  parameter is not null.  If this parameter is null, it
     * will be ignored.  Words that are listed in {@link #skip} are not entered.
     * 
     * @return An html sub-page (as a {@code Vector}) where each found Spanish-Verb has been
     * surrounded by an HTML {@code <SPAN>} element  that indicates the regularity of the verb,
     * and its infinitive-form conjugation.
     * 
     * @see ES#onlyLanguageChars(String)
     * @see ES#toLowerCaseSpanish(String)
     * @see HTMLPage#getPageTokens(CharSequence, boolean)
     */
    public static Vector<HTMLNode> addSpanishVerbSpans(
            String          text,
            TreeSet<String> regularVerbsFound,
            TreeSet<String> irregularVerbsFound,
            TreeSet<String> wordsNotFound
        )
    {
        // Keep list of found regular-verbs in the tree-set
        boolean keepRV = regularVerbsFound != null;

        // Keep list of found irregular-verbs in the tree set
        boolean keepIV = irregularVerbsFound != null;

        // Keep list of words that weren't verbs in the tree-set
        boolean keepNV = wordsNotFound != null;

        // 'skip' is public and may be re-assigned by the user - possibly to null.
        String[] skipWords = (skip != null) ? skip : new String[0];

        StringBuilder outSB = new StringBuilder();

        // Splits the string by spaces
        String[] words = text.split(" ");

        for (int j=0; j < words.length; j++)
        {
            // Put back the single space which 'split' removed from BETWEEN two tokens.  Nothing
            // is inserted in front of the first token, since nothing was removed from there.
            if (j > 0) outSB.append(' ');

            // Sometimes it is the empty string or just white-space
            String trim = words[j].trim();

            if (trim.length() == 0)
                { outSB.append(words[j]); continue; }

            // Separates leading and trailing punctuation from the word itself
            Matcher m = P1.matcher(trim);

            if (! m.find())
                { outSB.append(words[j]); continue; }

            String pre  = m.group(2);
            String word = m.group(3);
            String post = m.group(4);

            if (word == null)
                { outSB.append(words[j]); continue; }

            // Diagnostic output only.  This is done AFTER the null-check on 'word'.
            if (! ES.onlyLanguageChars(word)) System.out.println
                ("ORIG: [" + words[j] + "], " + pre + ", " + word + ", " + post);

            if (pre == null)    pre = "";
            if (post == null)   post = "";

            if (word.length() == 0)
                { outSB.append(words[j]); continue; }

            String lc = ES.toLowerCaseSpanish(word);

            // Skip the "ultra-common" non-verbs that look just like verbs.  A flag is needed:
            // a 'continue' placed inside this inner loop would only continue the inner loop.
            boolean isSkipWord = false;

            for (String w : skipWords)
                if (lc.equals(w)) { isSkipWord = true; break; }

            if (isSkipWord)
                { outSB.append(words[j]); continue; }

            String infinitive = getInfinitive(lc);

            // Not a verb: remember it (if requested), and copy the token to the output as-is.
            if (infinitive == null)
            {
                if (keepNV) wordsNotFound.add(lc);
                outSB.append(words[j]);
                continue;
            }

            // Each verb is recorded in exactly one of the two sets.
            boolean irregular = isIrregular(infinitive);

            if (irregular)  { if (keepIV) irregularVerbsFound.add(infinitive); }
            else            { if (keepRV) regularVerbsFound.add(infinitive); }

            outSB.append(pre + "<SPAN CLASS=\"");
            outSB.append(irregular ? 'I' : 'R');
            outSB.append("V\" DATA-V=\"" + infinitive + "\">" + word + "</SPAN>" + post);
        }

        outSB.append('\n');

        return HTMLPage.getPageTokens(outSB, false);
    }
}