// RegExFiles.java.html

package Torello.Java;

import java.util.*;
import java.io.*;
import java.util.regex.*;
import java.util.zip.*;

/**
 * A utility for saving Regular-Expressions in a text-file that may be lazily-loaded at run-time
 * when needed.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=REGEX_FILES>
 */
@Torello.JavaDoc.StaticFunctional
public class RegExFiles
{
    // Static-utility class; the private constructor prevents instantiation.
    private RegExFiles() { }

    /**
     * Loads a regular expression text file.  Each line is interpreted as a new Regular
     * Expression {@code Pattern}.
     *
     * <BR /><BR />This method expects the <B><I>entire regular expression to fit on a single
     * line</I></B>, and therefore, each new line containing text-data (without a starting
     * <B>{@code '#'}</B>) will be compiled into a new regular expression.  Use the {@code '\n'}
     * escape within the expression to match newlines.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Some Syntax Rules:</B>
     * 
     * <BR /><UL CLASS=JDUL>
     * <LI> <B>Comment lines</B> are lines beginning with the <I>POUND</I> <B>({@code '#'})</B>
     *      sign.
     *      </LI>
     * <LI> <B>Blank lines</B> are ignored by the file-parse completely.</LI>
     * <LI> Lines with <B>only white-space</B> are considered blank.</LI>
     * <LI> <B>Flag Lines</B> are lines that begin with two, successive, <I>POUND</I>
     *      <B>({@code '##'})</B> signs.  The flags named on such a line are applied to the next
     *      Regular-Expression line only.
     *      </LI>
     * <LI> All non-comment, non-blank and non-flag lines are converted into Regular-Expression
     *      {@code Pattern's}
     *      </LI>
     * </UL>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>LFEC Note:</B>
     * 
     * <BR />This method will <I>halt program execution</I> if any exceptions occur when loading a
     * Regular-Expression text file!  This is the primary-purpose of all {@code 'LFEC'} - Load File
     * Exception Catch methods.
     *
     * @param f Filename for a Regular Expression file.
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
     * &amp; blank lines will all be ignored.
     *
     * @see java.util.regex.Pattern
     * @see #generateFlags(String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC(String f)
    {
        // NOTE(review): the 'false' argument presumably requests lines without their trailing
        // newline characters (see the 'file' parameter of parse) - confirm against FileRW.
        try
            { return parse(FileRW.loadFileToVector(f, false), f); }

        catch (Throwable t)
        {
            LFEC.ERROR_EXIT(t, "Attempt to load Regular Expression file: [" + f + "], failed.\n");
        }

        return null; // Should NOT be possible to reach this statement...
    }

    /**
     * This does the <B><I>exact same thing</I></B> as {@link #LFEC(String)}, but loads the file
     * into a {@code Vector} using the "JAR File" information included here.  In this case,
     * parameter {@code f} indicates a jar-file class-loader pointer.  It will not load from the
     * standard file-system.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Java's <CODE>getResourceAsStream</CODE>:</B>
     * 
     * <BR />The JAR implies that Java's "load resource as stream" features are being used in place
     * of standard file i/o routines.  Specifically, this loads from a JAR file, as seen below:
     *
     * <DIV CLASS=SNIP>{@code
     * BufferedReader br =
     *     new BufferedReader(new InputStreamReader(c.getResourceAsStream(f)));
     * }</DIV>
     *
     * <BR /><BR />If the resource cannot be found, the failure is reported through
     * {@code LFEC.ERROR_EXIT} just like any other load error.
     *
     * @param c This contains the class that is loading the file.  It is not too important to use
     * the "exact class" - the only reason a class is needed is that the "Class Loader" employs
     * the "Package Name" of the class for figuring out the directory / sub-directory where the
     * data-file is stored.  This variable may not be null.
     *
     * <BR /><BR /><B>EXAMPLE:</B> If you wanted to load a "Regular Expressions.txt" file that
     * was in the same BASH/Debian/etc...  directory as the following class - the following call
     * to {@code 'RegExFiles'} would load the text-file "Regular Expressions.txt" into memory
     * quickly.  The primary purpose being that text files are <B><I>much easier to read than
     * 'double-escaped' Java {@code String's}.</I></B>
     *
     * <BR /><BR /><B>NOTE:</B> It might be important to read the Java Doc's about the
     * {@code 'getResourceAsStream(String)'} method for retrieving data that was stored to a JAR
     * file instead of a UNIX/BASH/MS-DOS system file.  Oracle's Java 8 would help.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
     *
     * @param f This is a file-pointer to a file stored inside a Java JAR file.
     *
     * @return A Vector containing one compiled regular expression per line.  Comment lines &amp;
     * blank lines will all be ignored.
     *
     * @see #LFEC(String)
     * @see #parse(Vector, String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC_JAR(Class<?> c, String f)
    {
        // NOTE: the InputStreamReader is built without an explicit charset, so the text is
        // decoded using the platform's default charset.
        try (
            InputStream     is = openResource(c, f);
            BufferedReader  br = new BufferedReader(new InputStreamReader(is));
        )
        {
            Vector<String>  file = new Vector<>();
            String          s;

            // readLine() strips the line-terminator, so each element is exactly one line of text
            while ((s = br.readLine()) != null) file.addElement(s);

            return parse(file, f);
        }

        catch (Throwable t)
        { 
            LFEC.ERROR_EXIT(
                t,
                "Attempted to load Regular Expression file: [" + f + "]\n" +
                "From jar-file using class: [" + className(c) + "]\n" +
                "Did not load successfully."
            );
        }

        // Should NOT be possible to reach this statement...
        // Compiler does not recognize LFEC.ERROR_EXIT

        return null;
    }

    /**
     * This is identical to {@code LFEC_JAR}, except that it presumes the file was compressed
     * before saving.  The resource is read as a GZIP-compressed, serialized
     * {@code java.lang.String} whose lines are separated by {@code '\n'} (a {@code '\r'}
     * immediately preceding the line-break is discarded).  A final line that is not terminated
     * by a newline is still parsed.
     *
     * @param c This contains the class that is loading the file.  It is not too important to use
     * the "exact class" - the only reason a class is needed is that the "Class Loader" employs
     * the "Package Name" of the class for figuring out the directory / sub-directory where the
     * data-file is stored.  This variable may not be null.  Again, the class-loader looks in the
     * directory of the package that contains this class!
     *
     * <BR /><BR /><B>NOTE:</B> The method {@link #LFEC_JAR(Class, String)} has a more detailed
     * look at the particular use of this parameter.  The easy way to understand is: just pass the
     * class that is doing the actual loading of the regular-expression <B><I>(presuming the
     * regex.dat file is in the same directory as the {@code '.class'} file!)</I></B>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
     *
     * @param f This is a file-pointer to a file stored inside a Java JAR file.
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment
     * lines &amp; blank lines will all be ignored.
     *
     * @see #LFEC_JAR(Class, String)
     * @see #parse(Vector, String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC_JAR_ZIP(Class<?> c, String f)
    {
        // NOTE(review): this uses Java-native deserialization.  That is only appropriate as long
        // as the resource is bundled with the application itself - confirm that no untrusted
        // data can reach this method.
        try (
            InputStream         is      = openResource(c, f);
            GZIPInputStream     gzip    = new GZIPInputStream(is);
            ObjectInputStream   ois     = new ObjectInputStream(gzip);
        )
        {
            // A ClassCastException (the object is not a String) or a NullPointerException (the
            // object is null) is caught below, and reported like any other load failure.
            String fileStr = (String) ois.readObject();

            return parse(splitLines(fileStr), f);
        }

        catch (Throwable t)
        {
            LFEC.ERROR_EXIT(t,
                "Attempted to load Regular Expression file: [" + f + "]\n" +
                "From jar-file using class: [" + className(c) + "]\n" +
                "Content was zipped, but failed to load."
            );
        }

        return null; // Should NOT be possible to reach this statement...
    }

    /**
     * This does the <B><I>exact same thing</I></B> as {@link #LFEC(String)}, but takes a
     * "pre-loaded file" as a {@code Vector}.  This is an internal method - used to ensure that
     * the methods {@code LFEC}, {@code LFEC_JAR} and {@code LFEC_JAR_ZIP} all do the exact same
     * thing.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Parsing Rules:</B>
     *
     * <BR /><UL CLASS=JDUL>
     * <LI> Lines that are empty, or contain only white-space, are skipped.</LI>
     * <LI> Lines whose <B>very first character</B> is {@code '#'} are never compiled.  A line
     *      with white-space before the {@code '#'} is <B>not</B> a comment; it is compiled as a
     *      Regular-Expression.
     *      </LI>
     * <LI> Lines starting with {@code '##'} are passed to {@link #generateFlags(String)}.  The
     *      resulting flags replace any flags from an earlier flag-line, are used for the next
     *      compiled line, and are then reset to zero.
     *      </LI>
     * </UL>
     *
     * <BR /><BR />As with the other methods in this class, any exception thrown while parsing is
     * handed to {@code LFEC.ERROR_EXIT}.
     *
     * @param file This presumes that the regular-expression text-file has been loaded into a
     * {@code Vector<String>} (w/out the "include newlines" option!)
     *
     * @param name The name of the file being loaded.  It is only used to make the error message
     * more informative.
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
     * &amp; blank lines will all be ignored.
     *
     * @see #LFEC(String)
     * @see #generateFlags(String)
     */
    protected static Vector<Pattern> parse(Vector<String> file, String name)
    {
        try
        {
            Vector<Pattern> ret     = new Vector<>();
            int             flags   = 0;

            for (String line : file)
            {
                // Blank (or white-space only) lines
                if (line.trim().length() == 0) continue;

                // Comment lines & flag lines - neither is ever compiled
                if (line.charAt(0) == '#')
                {
                    if (line.startsWith("##")) flags = generateFlags(line);
                    continue;
                }

                // A flags value of zero is identical to the single-argument Pattern.compile
                ret.add(Pattern.compile(line, flags));

                // Flags only apply to the one expression that follows the flag-line
                flags = 0;
            }

            return ret;
        }

        catch (Throwable t)
            { LFEC.ERROR_EXIT(t, "error parsing regular expression file: " + name); }

        return null; // Should NOT be possible to reach this statement...
    }

    /**
     * Converts the flag-names found on a "Flag Line" into the corresponding bit-mask of
     * {@code java.util.regex.Pattern} constants, so that a user may name these flags inside a
     * regular expression file.
     *
     * <BR /><BR />Every flag-constant defined by {@code Pattern} is recognized:
     * {@code CANON_EQ}, {@code CASE_INSENSITIVE}, {@code COMMENTS}, {@code DOTALL},
     * {@code LITERAL}, {@code MULTILINE}, {@code UNICODE_CASE}, {@code UNICODE_CHARACTER_CLASS}
     * and {@code UNIX_LINES}.  A flag is selected whenever its name occurs <I>anywhere</I> in the
     * line; separators and any other text on the line are ignored.
     *
     * <BR /><BR /><B>NOTE:</B> The regular expression loader will only load regular expressions
     * that fit on a single line of text.  Other than lines that begin with a comment, each line
     * is intended/interpreted as an independent Regular Expression.
     *
     * @param line A line of text from a regular expression file.  This may not be null.
     *
     * @return The bit-wise OR of every {@code Pattern} flag named in {@code 'line'}, or zero if
     * no flag-name is present.
     *
     * @see java.util.regex.Pattern
     */
    protected static int generateFlags(String line)
    {
        int mask = 0;

        if (line.contains("CANON_EQ"))                  mask |= Pattern.CANON_EQ;
        if (line.contains("CASE_INSENSITIVE"))          mask |= Pattern.CASE_INSENSITIVE;
        if (line.contains("DOTALL"))                    mask |= Pattern.DOTALL;
        if (line.contains("COMMENTS"))                  mask |= Pattern.COMMENTS;
        if (line.contains("LITERAL"))                   mask |= Pattern.LITERAL;
        if (line.contains("MULTILINE"))                 mask |= Pattern.MULTILINE;
        if (line.contains("UNICODE_CASE"))              mask |= Pattern.UNICODE_CASE;
        if (line.contains("UNICODE_CHARACTER_CLASS"))   mask |= Pattern.UNICODE_CHARACTER_CLASS;
        if (line.contains("UNIX_LINES"))                mask |= Pattern.UNIX_LINES;

        return mask;
    }

    // Opens a class-path resource.  getResourceAsStream returns null (rather than throwing) when
    // the resource is missing, so that case is converted into a descriptive exception here.
    private static InputStream openResource(Class<?> c, String f) throws IOException
    {
        if (c == null) throw new NullPointerException("Class parameter 'c' may not be null.");

        InputStream is = c.getResourceAsStream(f);

        if (is == null) throw new FileNotFoundException(
            "Resource [" + f + "] could not be found using class [" + c.getName() + "]"
        );

        return is;
    }

    // Null-safe class name for error messages, so that building the message can never throw.
    private static String className(Class<?> c)
    { return (c == null) ? "null" : c.getCanonicalName(); }

    // Splits text into lines on '\n', dropping a '\r' that directly precedes the line-break.
    // Text after the last '\n' is kept as a final line, unless it is empty.
    private static Vector<String> splitLines(String text)
    {
        Vector<String>  lines   = new Vector<>();
        int             start   = 0;
        int             nl;

        // Scanning forward from 'start' avoids re-copying the remaining text for every line
        while ((nl = text.indexOf('\n', start)) != -1)
        {
            lines.addElement(stripTrailingCR(text.substring(start, nl)));
            start = nl + 1;
        }

        if (start < text.length()) lines.addElement(stripTrailingCR(text.substring(start)));

        return lines;
    }

    // Removes one trailing carriage-return, if present.
    private static String stripTrailingCR(String line)
    {
        int len = line.length();

        return ((len > 0) && (line.charAt(len - 1) == '\r'))
            ? line.substring(0, len - 1)
            : line;
    }
}