001package Torello.Java;
002
003import java.util.*;
004import java.io.*;
005import java.util.regex.*;
006import java.util.zip.*;
007
008/**
009 * A utility for saving Regular-Expressions in a text-file that may be lazily-loaded at run-time
010 * when needed.
011 * 
012 * <EMBED CLASS='external-html' DATA-FILE-ID=REGEX_FILES>
013 */
014@Torello.JavaDoc.StaticFunctional
015public class RegExFiles
016{
017    private RegExFiles() { }
018
019    /**
020     * This loads a regular expression text file.  Each line is interpreted as a new Regular
021     * Expression {@code Pattern}.
022     *
023     * <BR /><BR />This method expects the <B><I>entire regular expression to fit on a single
024     * line</I></B>, and therefore, each new line containing text-data (without a starting
025     * <B>{@code '#'}</B>) will be compile into a new regular expression.  Use the {@code '\n'}
026     * within the expression to generated newlines.
027     *
028     * <BR /><BR /><B CLASS=JDDescLabel>Some Syntax Rules:</B>
029     * 
030     * <BR /><UL CLASS=JDUL>
031     * <LI> <B>Comment lines</B> are lines beginning with the <I>POUND</I> <B>({@code '#'})</B>
032     *      sign.
033     *      </LI>
034     * <LI> <B>Blank lines</B> are ignored by the file-parse completely.</LI>
035     * <LI> Lines with <B>only white-space</B> are considered blank.</LI>
036     * <LI> <B>Flag Lines</B> are lines that begin with two, successive, <I>POUND</I>
037     *      <B>({@code '##'})</B> signs.
038     *      </LI>
039     * <LI> All non-comment, non-blank and non-flag lines are converted into Regular-Expression
040     *      {@code Pattern's}
041     *      </LI>
042     * </UL>
043     *
044     * <BR /><BR /><B CLASS=JDDescLabel>LFEC Note:</B>
045     * 
046     * <BR />This method will <I>halt program execution</I> if any exceptions occur when loading a
047     * Regular-Expression text file!  This is the primary-purpose of all {@code 'LFEC'} - Load File
048     * Exception Catch methods.
049     *
050     * @param f Filename for a Regular Expression
051     *
052     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
053     * &amp; blank lines will all be ignored.
054     *
055     * @see java.util.regex.Pattern
056     * @see #generateFlags(String)
057     * @see LFEC#ERROR_EXIT(Throwable, String)
058     */
059    public static Vector<Pattern> LFEC(String f)
060    {
061        try
062            { return parse(FileRW.loadFileToVector(f, false), f); }
063
064        catch (Throwable t)
065        {
066            LFEC.ERROR_EXIT(t, "Attempt to load Regular Expression file: [" + f + "], failed.\n");
067        }
068
069        return null; // Should NOT be possible to reach this statement...
070    }
071
072    /**
073     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but loads the file into a
074     * {@code Vector} using the "JAR File" information included here.  In this case, parameter
075     * {@code f} indicates a jar-file class-loader pointer.  It will not load from the standard
076     * file-system.
077     *
078     * <BR /><BR /><B CLASS=JDDescLabel>Java's <CODE>getResourceAsStream</CODE>:</B>
079     * 
080     * <BR />The JAR implies that Java's "load resource as stream" features are being used in place
081     * of standard file i/o routines.  Specifically, this loads from a JAR file, as seen below:
082     *
083     * <DIV CLASS=SNIP>{@code
084     * BufferedReader br =
085     *     new BufferedReader(new InputStreamReader(c.getResourceAsStream(f)));
086     * }</DIV>
087     *
088     * @param c This contains the class that is loading the file.  It is not too important to use
089     * the "exact class" - since the only reason the class doing the loading is because the
090     * "Class Loader" employs the exact "Package Name" of the class for figuring out the
091     * directory / sub-directory where the data-file is stored.  This variable may not be null.
092     *
093     * <BR /><BR /><B>EXAMPLE:</B> If you wanted to load a "Regular Expressions.txt" file that
094     * was in the same BASH/Debian/etc...  directory as the following class - the following call
095     * to {@code 'RegExFiles'} would load the text-file "Regular Expressions.txt" into memory
096     * quickly.  The primary purpose being that text files are <B><I>much easier to read than
097     * 'double-escaped' Java {@code String's}.</I></B>
098     *
099     * <BR /><BR /><B>NOTE:</B> It might be important to read the Java Doc's about the
100     * {@code 'getResourceAsStream(String)'} method for retrieving data that was stored to a JAR
101     * file instead of a UNIX/BASH/MS-DOS system file.  Oracle's Java 8 would help.
102     *
103     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
104     *
105     * @param f This is a file-pointer to a file stored inside a Java JAR file.
106     *
107     * @return A Vector containing one compiled regular expression per line.  Comment lines &amp;
108     * blank lines will all be ignored.
109     *
110     * @see #LFEC(String)
111     * @see #parse(Vector, String)
112     * @see LFEC#ERROR_EXIT(Throwable, String)
113     */
114    public static Vector<Pattern> LFEC_JAR(Class<?> c, String f)
115    {
116        try (
117            InputStream     is = c.getResourceAsStream(f);
118            BufferedReader  br = new BufferedReader(new InputStreamReader(is));
119        )
120        {
121            String          s       = "";
122            StringBuilder   sb      = new StringBuilder();
123            Vector<String>  file    = new Vector<String>();
124
125            while ((s = br.readLine()) != null) file.addElement(s);
126
127            return parse(file, f);
128        }
129
130        catch (Throwable t)
131        { 
132            LFEC.ERROR_EXIT(
133                t,
134                "Attempted to load Regular Expression file: [" + f + "]\n" +
135                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
136                "Did not load successfully."
137            );
138        }
139
140        // Should NOT be possible to reach this statement...
141        // Compiler does not recognize LFEC.ERROR_EXIT
142
143        return null;
144    }
145
146    /**
147     * This is identical to {@code LFEC_JAR}, except that it presumes the file was compressed
148     * before saving.
149     *
150     * @param c This contains the class that is loading the file.  It is not too important to use
151     * the "exact class" - since the only reason the class doing the loading is because the "Class
152     * Loader" employs the exact "Package Name" of the class for figuring out the directory /
153     * sub-directory where the data-file is stored.  This variable may not be null.  Again, the
154     * class-loader looks in the directory of the package that contains this class!
155     *
156     * <BR /><BR /><B>NOTE:</B> The method {@code public static Vector<Pattern> 
157     * LFEC_JAR(Class, String;)} has a more detailed look at the particular use of this parameter.
158     * The easy way to understand is: just pass the class that is doing the actual loading of the
159     * regular-expression <B><I>(presuming the regex.dat file is in the same directory as the 
160     * {@code '.class'} file!)</I></B>
161     *
162     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
163     *
164     * @param f This is a file-pointer to a file stored inside a Java JAR file.
165     *
166     * @return A {@code Vector} containing one compiled regular expression per line.  Comment
167     * lines &amp; blank lines will all be ignored.
168     *
169     * @see #LFEC_JAR(Class, String)
170     * @see #parse(Vector, String)
171     * @see LFEC#ERROR_EXIT(Throwable, String)
172     */
173    public static Vector<Pattern> LFEC_JAR_ZIP(Class<?> c, String f)
174    {
175        try (
176            InputStream         is      = c.getResourceAsStream(f);
177            GZIPInputStream     gzip    = new GZIPInputStream(is);
178            ObjectInputStream   ois     = new ObjectInputStream(gzip);
179        )
180        {
181            Object              ret         = ois.readObject();
182            String              fileStr     = (String) ret;
183            Vector<String>      file        = new Vector<>();
184            int                 newLinePos  = 0;
185
186            while ((newLinePos = fileStr.indexOf('\n')) != -1)
187            {
188                file.addElement(fileStr.substring(0, newLinePos));
189                fileStr = fileStr.substring(newLinePos + 1);
190            }
191
192            return parse(file, f);
193
194        }
195
196        catch (Throwable t)
197        {
198            LFEC.ERROR_EXIT(t,
199                "Attempted to load Regular Expression file: [" + f + "]\n" +
200                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
201                "Content was zipped, but failed to load."
202            );
203        }
204
205        return null; // Should NOT be possible to reach this statement...
206    }
207
208    /**
209     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but takes a "pre-loaded file"
210     * as a {@code Vector}.  This is an internal class - used to ensure that the methods:
211     * {@code LFEC_JAR} and {@code LFEC} do the exact same thing.
212     *
213     * @param file This presumes that the regular-expression text-file has been loaded into a
214     * {@code Vector<String>} (w/out the "include newlines" option!)
215     *
216     * @param name The name of the file loading is required so that error-printing-information is
217     * easier.
218     *
219     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
220     * &amp; blank lines will all be ignored.
221     *
222     * @see #LFEC(String)
223     */
224    protected static Vector<Pattern> parse(Vector<String> file, String name)
225    {
226        try
227        {
228            Vector<Pattern> ret     = new Vector<Pattern>();
229            int             flags   = 0;
230
231            for (String line : file)
232            {
233                if (line.trim().length() == 0) continue;
234
235                if (line.charAt(0) == '#')
236                {
237                    if (line.length() > 1) if (line.charAt(1) == '#') flags = generateFlags(line);
238                    continue;
239                }
240
241                if (flags != 0)                 ret.add(Pattern.compile(line, flags));
242                else                            ret.add(Pattern.compile(line));
243
244                flags = 0;
245            }
246
247            return ret;
248        }
249
250        catch (Throwable t)
251            { LFEC.ERROR_EXIT(t, "error parsing regular expression file: " + name); }
252
253        return null; // Should NOT be possible to reach this statement...
254    }
255
256    /**
257     * This information has been copied from Java's regular expression: {@code Pattern}. This is a
258     * Helper function as it converts the text-{@code String's} into their constants, so that a
259     * user may include these text {@code String's} in a regular expression file.
260     *
261     * <BR /><BR /><B>NOTE:</B> The regular expression loader will only load regular expressions
262     * that fit on a single line of text.  Other than lines that begin with a comment, each line
263     * is intended/interpreted as an independent Regular Expression.
264     *
265     * @see java.util.regex.Pattern
266     */
267    protected static int generateFlags(String line)
268    {
269        int mask = 0;
270
271        if (line.contains("CANON_EQ"))          mask |= Pattern.CANON_EQ;
272        if (line.contains("CASE_INSENSITIVE"))  mask |= Pattern.CASE_INSENSITIVE;
273        if (line.contains("DOTALL"))            mask |= Pattern.DOTALL;
274        if (line.contains("COMMENTS"))          mask |= Pattern.COMMENTS;
275        if (line.contains("LITERAL"))           mask |= Pattern.LITERAL;
276        if (line.contains("MULTILINE"))         mask |= Pattern.MULTILINE;
277        if (line.contains("UNICODE_CASE"))      mask |= Pattern.UNICODE_CASE;
278
279        return mask;
280    }
281}