package Torello.Java;

import java.util.*;
import java.io.*;
import java.util.regex.*;
import java.util.zip.*;

/**
 * A utility for saving Regular-Expressions in a text-file that may be lazily-loaded at run-time
 * when needed.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=REGEX_FILES>
 */
@Torello.JavaDoc.StaticFunctional
public class RegExFiles
{
    private RegExFiles() { }

    /**
     * This loads a regular expression text file.  Each line is interpreted as a new Regular
     * Expression {@code Pattern}.
     *
     * <BR /><BR />This method expects the <B><I>entire regular expression to fit on a single
     * line</I></B>, and therefore, each new line containing text-data (without a starting
     * <B>{@code '#'}</B>) will be compiled into a new regular expression.  Use the {@code '\n'}
     * within the expression to generate newlines.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Some Syntax Rules:</B>
     *
     * <BR /><UL CLASS=JDUL>
     * <LI> <B>Comment lines</B> are lines beginning with the <I>POUND</I> <B>({@code '#'})</B>
     *      sign.
     *      </LI>
     * <LI> <B>Blank lines</B> are ignored by the file-parse completely.</LI>
     * <LI> Lines with <B>only white-space</B> are considered blank.</LI>
     * <LI> <B>Flag Lines</B> are lines that begin with two, successive, <I>POUND</I>
     *      <B>({@code '##'})</B> signs.
     *      </LI>
     * <LI> All non-comment, non-blank and non-flag lines are converted into Regular-Expression
     *      {@code Pattern's}
     *      </LI>
     * </UL>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>LFEC Note:</B>
     *
     * <BR />This method will <I>halt program execution</I> if any exceptions occur when loading a
     * Regular-Expression text file!  This is the primary-purpose of all {@code 'LFEC'} - Load File
     * Exception Catch methods.
     *
     * @param f Filename for a Regular Expression
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
     * and blank lines will all be ignored.
     *
     * @see java.util.regex.Pattern
     * @see #generateFlags(String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC(String f)
    {
        try
            { return parse(FileRW.loadFileToVector(f, false), f); }

        catch (Throwable t)
        {
            LFEC.ERROR_EXIT(t, "Attempt to load Regular Expression file: [" + f + "], failed.\n");
        }

        // Only reachable if LFEC.ERROR_EXIT returns; the compiler cannot know that it is
        // expected to halt the program.
        return null;
    }

    /**
     * This does the <B><I>exact same thing</I></B> as {@link #LFEC(String)}, but loads the file
     * into a {@code Vector} using the "JAR File" information included here.  In this case,
     * parameter {@code f} indicates a jar-file class-loader pointer.  It will not load from the
     * standard file-system.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Java's <CODE>getResourceAsStream</CODE>:</B>
     *
     * <BR />The JAR implies that Java's "load resource as stream" features are being used in place
     * of standard file i/o routines.  Specifically, this loads from a JAR file, as seen below:
     *
     * <DIV CLASS=SNIP>{@code
     * BufferedReader br =
     *     new BufferedReader(new InputStreamReader(c.getResourceAsStream(f)));
     * }</DIV>
     *
     * <BR /><BR />If the resource cannot be found, a {@code FileNotFoundException} naming the
     * resource is handed to {@code LFEC.ERROR_EXIT}.
     *
     * @param c This contains the class that is loading the file.  It is not too important to use
     * the "exact class" - the class is only needed because the "Class Loader" employs the
     * "Package Name" of the class for figuring out the directory / sub-directory where the
     * data-file is stored.  This variable may not be null.
     *
     * <BR /><BR /><B>EXAMPLE:</B> If you wanted to load a "Regular Expressions.txt" file that
     * was in the same BASH/Debian/etc... directory as the following class - the following call
     * to {@code 'RegExFiles'} would load the text-file "Regular Expressions.txt" into memory
     * quickly.  The primary purpose being that text files are <B><I>much easier to read than
     * 'double-escaped' Java {@code String's}.</I></B>
     *
     * <BR /><BR /><B>NOTE:</B> It might be important to read the Java Doc's about the
     * {@code 'getResourceAsStream(String)'} method for retrieving data that was stored to a JAR
     * file instead of a UNIX/BASH/MS-DOS system file.  Oracle's Java 8 would help.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
     *
     * @param f This is a file-pointer to a file stored inside a Java JAR file.
     *
     * @return A Vector containing one compiled regular expression per line.  Comment lines and
     * blank lines will all be ignored.
     *
     * @see #LFEC(String)
     * @see #parse(Vector, String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC_JAR(Class<?> c, String f)
    {
        try (
            InputStream     is = openResource(c, f);
            BufferedReader  br = new BufferedReader(new InputStreamReader(is));
        )
        {
            Vector<String>  file = new Vector<String>();
            String          s;

            // readLine() strips the line terminator, which is what parse(...) expects.
            while ((s = br.readLine()) != null) file.addElement(s);

            return parse(file, f);
        }

        catch (Throwable t)
        {
            LFEC.ERROR_EXIT(
                t,
                "Attempted to load Regular Expression file: [" + f + "]\n" +
                "From jar-file using class: [" + describe(c) + "]\n" +
                "Did not load successfully."
            );
        }

        // Only reachable if LFEC.ERROR_EXIT returns; the compiler cannot know that it is
        // expected to halt the program.
        return null;
    }

    /**
     * This is identical to {@code LFEC_JAR}, except that it presumes the file was compressed
     * before saving.  The resource is read as a GZIP stream that wraps a Java-serialized
     * {@code String}; that {@code String} is split into lines on {@code '\n'} and handed to
     * {@link #parse(Vector, String)}.
     *
     * <BR /><BR />A final line that is not terminated by {@code '\n'} is included in the parse.
     *
     * @param c This contains the class that is loading the file.  It is not too important to use
     * the "exact class" - the class is only needed because the "Class Loader" employs the
     * "Package Name" of the class for figuring out the directory / sub-directory where the
     * data-file is stored.  This variable may not be null.  Again, the class-loader looks in
     * the directory of the package that contains this class!
     *
     * <BR /><BR /><B>NOTE:</B> The method {@code public static Vector<Pattern>
     * LFEC_JAR(Class, String)} has a more detailed look at the particular use of this parameter.
     * The easy way to understand is: just pass the class that is doing the actual loading of the
     * regular-expression <B><I>(presuming the regex.dat file is in the same directory as the
     * {@code '.class'} file!)</I></B>
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=RAWTYPES>
     *
     * @param f This is a file-pointer to a file stored inside a Java JAR file.
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment
     * lines and blank lines will all be ignored.
     *
     * @see #LFEC_JAR(Class, String)
     * @see #parse(Vector, String)
     * @see LFEC#ERROR_EXIT(Throwable, String)
     */
    public static Vector<Pattern> LFEC_JAR_ZIP(Class<?> c, String f)
    {
        try (
            InputStream         is      = openResource(c, f);
            GZIPInputStream     gzip    = new GZIPInputStream(is);
            ObjectInputStream   ois     = new ObjectInputStream(gzip);
        )
        {
            // A payload that is not a String raises ClassCastException here, which is reported
            // through the catch-block below like any other load failure.
            String fileStr = (String) ois.readObject();

            return parse(splitLines(fileStr), f);
        }

        catch (Throwable t)
        {
            LFEC.ERROR_EXIT(t,
                "Attempted to load Regular Expression file: [" + f + "]\n" +
                "From jar-file using class: [" + describe(c) + "]\n" +
                "Content was zipped, but failed to load."
            );
        }

        // Only reachable if LFEC.ERROR_EXIT returns; the compiler cannot know that it is
        // expected to halt the program.
        return null;
    }

    /**
     * This does the <B><I>exact same thing</I></B> as {@link #LFEC(String)}, but takes a
     * "pre-loaded file" as a {@code Vector}.  This is an internal method - used to ensure that
     * the methods: {@code LFEC_JAR} and {@code LFEC} do the exact same thing.
     *
     * <BR /><BR />A flag-line ({@code '##'}) applies only to the next regular expression line;
     * the flags are reset to zero after each {@code Pattern} is compiled.  If several flag-lines
     * precede an expression, the last one wins.  A line is only treated as a comment or
     * flag-line when {@code '#'} is its <I>very first</I> character - a {@code '#'} that follows
     * leading white-space is compiled as part of a regular expression.
     *
     * @param file This presumes that the regular-expression text-file has been loaded into a
     * {@code Vector<String>} (w/out the "include newlines" option!)
     *
     * @param name The name of the file loading is required so that error-printing-information is
     * easier.
     *
     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
     * and blank lines will all be ignored.
     *
     * @see #LFEC(String)
     * @see #generateFlags(String)
     */
    protected static Vector<Pattern> parse(Vector<String> file, String name)
    {
        try
        {
            Vector<Pattern> ret     = new Vector<Pattern>();
            int             flags   = 0;

            for (String line : file)
            {
                // Blank and white-space-only lines are skipped.
                if (line.trim().length() == 0) continue;

                if (line.charAt(0) == '#')
                {
                    // '##' is a flag-line, a lone '#' is a plain comment.
                    if (line.startsWith("##")) flags = generateFlags(line);
                    continue;
                }

                // Pattern.compile(regex, 0) is equivalent to Pattern.compile(regex).
                ret.add(Pattern.compile(line, flags));

                // Flags only ever apply to the single expression that follows them.
                flags = 0;
            }

            return ret;
        }

        catch (Throwable t)
            { LFEC.ERROR_EXIT(t, "error parsing regular expression file: " + name); }

        // Only reachable if LFEC.ERROR_EXIT returns; the compiler cannot know that it is
        // expected to halt the program.
        return null;
    }

    /**
     * This information has been copied from Java's regular expression: {@code Pattern}.  This is a
     * Helper function as it converts the text-{@code String's} into their constants, so that a
     * user may include these text {@code String's} in a regular expression file.
     *
     * <BR /><BR />Each flag is recognized by a simple sub-string search for the name of the
     * corresponding {@code Pattern} constant: {@code CANON_EQ}, {@code CASE_INSENSITIVE},
     * {@code COMMENTS}, {@code DOTALL}, {@code LITERAL}, {@code MULTILINE}, {@code UNICODE_CASE},
     * {@code UNICODE_CHARACTER_CLASS} and {@code UNIX_LINES}.
     *
     * <BR /><BR /><B>NOTE:</B> The regular expression loader will only load regular expressions
     * that fit on a single line of text.  Other than lines that begin with a comment, each line
     * is intended/interpreted as an independent Regular Expression.
     *
     * @param line A flag-line from a regular expression file.
     *
     * @return The bit-wise OR of every {@code Pattern} flag constant whose name appears in
     * {@code 'line'}, or zero if none do.
     *
     * @see java.util.regex.Pattern
     */
    protected static int generateFlags(String line)
    {
        int mask = 0;

        if (line.contains("CANON_EQ"))                  mask |= Pattern.CANON_EQ;
        if (line.contains("CASE_INSENSITIVE"))          mask |= Pattern.CASE_INSENSITIVE;
        if (line.contains("DOTALL"))                    mask |= Pattern.DOTALL;
        if (line.contains("COMMENTS"))                  mask |= Pattern.COMMENTS;
        if (line.contains("LITERAL"))                   mask |= Pattern.LITERAL;
        if (line.contains("MULTILINE"))                 mask |= Pattern.MULTILINE;
        if (line.contains("UNICODE_CASE"))              mask |= Pattern.UNICODE_CASE;
        if (line.contains("UNICODE_CHARACTER_CLASS"))   mask |= Pattern.UNICODE_CHARACTER_CLASS;
        if (line.contains("UNIX_LINES"))                mask |= Pattern.UNIX_LINES;

        return mask;
    }

    // Opens a class-relative resource, failing with a descriptive exception (rather than a
    // later NullPointerException) when getResourceAsStream reports "not found" by returning null.
    private static InputStream openResource(Class<?> c, String f) throws IOException
    {
        InputStream is = c.getResourceAsStream(f);

        if (is == null) throw new FileNotFoundException(
            "Resource [" + f + "] could not be found relative to class [" + describe(c) + "]"
        );

        return is;
    }

    // Null-safe class name for error messages, so that building the message can never throw.
    private static String describe(Class<?> c)
    { return (c == null) ? "null" : c.getCanonicalName(); }

    // Splits text on '\n' in one pass.  A non-empty remainder after the last '\n' is kept, so
    // that a file lacking a trailing newline does not lose its final line.
    private static Vector<String> splitLines(String text)
    {
        Vector<String>  lines   = new Vector<>();
        int             start   = 0;
        int             nl;

        while ((nl = text.indexOf('\n', start)) != -1)
        {
            lines.addElement(text.substring(start, nl));
            start = nl + 1;
        }

        if (start < text.length()) lines.addElement(text.substring(start));

        return lines;
    }
}