1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | package Torello.HTML; import Torello.Java.FileRW; import java.util.Vector; import java.io.IOException; import java.util.regex.*; final class ParserREInternal { // SAME AS THIS REGULAR-EXPRESSION: [\s>\/] /* private static final char[] validTokenPlusOneChars = { '>', '/', ' ', '\t', '\n', (char) 11, '\f', '\r' }; */ static void getTokens( Vector<HTMLNode> ret, String htmlStr, int sPos, int ePos, boolean eliminateHTMLTags, String matchesFile, String justTextFile ) throws IOException { boolean logMatches = matchesFile != null; boolean logJustText = justTextFile != null; Matcher m = HTMLRegEx.P1.matcher(htmlStr); StringBuffer matches = logMatches ? new StringBuffer() : null; StringBuffer justText = logJustText ? new StringBuffer() : null; int start = sPos; // HTML RegEx Matcher 'start' string-index int end = sPos; // HTML RegEx Matcher 'end' string-index int cursor = sPos; // The "loop counter" final int HTML_EOF = ePos; // The "imagined EOF" (prevents sub-string) final byte MAX_TOK_LEN = HTMLTags.maxTokenLength(); // Longest (currently registered) HTML String-Token Length // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** // Main loop breaks whenever one of the inner while-loops reaches EOF... // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** while (true) { // All (100%) of HTML Elements begin with the less-than '<' symbol. Advance the // cursor until one is found, or we fall off the end of the page. while ( (cursor < HTML_EOF) && (htmlStr.charAt(cursor) != '<')) cursor++; // If we have reached EOF (or sub-page), before finding the '<' STOP IMMEDIATELY. if (cursor == HTML_EOF) break; // Start now holds the position of the next less-than symbol on the page start = cursor; cursor++; if (cursor == HTML_EOF) break; // This may be a "Closing Tag" - if so we have to advance the cursor on extra place TC openOrClosed = TC.OpeningTags; if (htmlStr.charAt(cursor) == '/') { openOrClosed = TC.ClosingTags; cursor++; if (cursor == HTML_EOF) break; } // Whether this HTML Element will go on to match as a "TC.OpeningTags" or // "TC.ClosingTags" the variable 'tokStartPos' now holds the starting string-index of // the HTML Element Tag/Token name int tokStartPos = cursor; // All HTML Elements have Tag/Token Names that may only contain letters or numbers (are // 'Alpha-Numeric'). Keep advancing the cursor until EOF, Token-too-long, or a non // Alpha-Numeric char is found. while ( (cursor < HTML_EOF) && ((cursor - tokStartPos) <= MAX_TOK_LEN) && Character.isLetterOrDigit(htmlStr.charAt(cursor)) ) cursor++; // If EOF was reached first, then exit main loop IMMEDIATELY. if (cursor == HTML_EOF) break; // If the Token String would be too long to match a valid token, start over. if ((cursor - tokStartPos) > MAX_TOK_LEN) continue; // Ensure that the first non-alpha-numeric char that was identified is either a // greater-than symbol '>' or was white-space. If neither, then start over. char charAfterToken = htmlStr.charAt(cursor); if ( (charAfterToken != '>') && (! Character.isWhitespace(charAfterToken)) ) continue; // Eye of Newt, Wool of Bat, Toe of Emoji-Frog, Code-Point (UNICODE) caused a bug once // This solved it -- and unfortunately, the web-address that crashed the parser cannot // be found. if (htmlStr.codePointAt(cursor) > 255) continue; // This is the "potential" HTML Element tag/token name. There is, obviously, a // possibility that it is not actually an HTML Element name. String token = htmlStr.substring(tokStartPos, cursor); // This will verify that the token that was found (was after a less-than '<' symbol) // is actually a valid HTML Element name. If not, the 'hasTag' method will return null TagNode tn = HTMLTags.hasTag(token, openOrClosed); // If this wasn't a valid HTML Element name, then skip it, and start over. if (tn == null) continue; // Set the RegEx Matcher's "Region" (look it up in the JDK JavaDoc's) // So that it will match starting at the first less-than-symbol that was found // earlier in this loop. This '<' symbol position was saved to int 'start' m.region(start, HTML_EOF); // If the RegEx Matcher cannot match the string beginning at 'start', then this // simply cannot be an HTML Element. Review the JDK Doc's for "looking at". It // is convenient... It (basically) asserts that a '^' symbol is included in the RegEx. // (BUT NOT A '$') - if you are familiar with the '^' and '$' meanings in RegEx. if (! m.lookingAt()) continue; // This holds the complete HTML Element (including any attributes) from the // opening '<' to the closing '>' symbols. String htmlTag = m.group(); // Use the "pre-instantiated" TagNode' - UNLESS the particular TagNode in question has // "innerTag" information (like class="..." or HREF="..." or ID="..." - in which case // it is a longer string!) if (tn.str.length() != htmlTag.length()) tn = new TagNode(htmlTag); // Used to be called the "Mis-Match String" (before optimization). This is *ALL* the // text between the PREVIOUS RegEx HTML Element Match, and the start of CURRENT // HTML Element Match. // NOTE! ORDER IS IMPORTANT HERE! String text = htmlStr.substring(end, start); end = m.end(); // MUST COME AFTER PREVIOUS LINE! cursor = end; String trimmedStr = text.trim(); // 'text' would have length greater than zero if there were any character/text that // occurred between the PREVIOUS HTML Element Match, and CURRENT HTML Element Match. if (text.length() != 0) ret.addElement(new TextNode(text)); // One (LEGACY) feature that is being maintained (since it was useful), is to eliminate // all the HTML TagNodes, and only return the TextNodes... if (! eliminateHTMLTags) ret.addElement(tn); // If TextNode-Logging is requested, do not log the empty (white-space only) TextNodes if (trimmedStr.length() != 0) { // The un-trimmed text-line if (logJustText) justText.append(text); // The trimmed text-line if (logMatches) matches.append("TEXT:\t\t[" + trimmedStr + "]\n"); } // EXTREMELY USEFUL (LEGACY / DEBUGGING) Feature, that will be kept / maintained. if (logMatches) matches.append("GROUP():\t[" + htmlTag + "]\n"); } // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** // MAIN WHILE LOOP END // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** // The parser was skipping the very last piece of non-HTML text that was occurring after // the very last HTML-tag on any given page. I hadn't checked this part in ages. // Sometimes, for pages that don't begin and end with <HTML>...</HTML> tag, but rather are // just short blubs of HTML, the last sentence was being "dropped". if (end != HTML_EOF) { // End has the value of the "very last" RegEx-Match End-Pos // If this is not the end of the string, then add last snippet of non-HTML text String text = htmlStr.substring(end, HTML_EOF); String trimmedStr = text.trim(); // NOW Add the text-node. ret.addElement(new TextNode(text)); if (trimmedStr.length() != 0) { // The un-trimmed text-line if (logJustText) justText.append(text); // The trimmed text-line if (logMatches) matches.append("TEXT:\t\t[" + trimmedStr + "]\n"); } } // Write these String Buffers to a file. if (logMatches) FileRW.appendToFile(matches, matchesFile); if (logJustText) FileRW.appendToFile(justText, justTextFile); } } |