1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | package Torello.HTML; import Torello.Java.FileRW; import java.util.Vector; import java.io.IOException; import java.util.regex.*; final class ParserRE { static Vector<HTMLNode> parsePageTokens( CharSequence html, boolean eliminateHTMLTags, String rawHTMLFile, String matchesFile, String justTextFile ) throws IOException { if (rawHTMLFile != null) FileRW.writeFile(html, rawHTMLFile); if (matchesFile != null) FileRW.writeFile("", matchesFile); if (justTextFile != null) FileRW.writeFile("", justTextFile); Vector<HTMLNode> ret = new Vector<>(); String htmlStr = html.toString(); int end = 0; Matcher m = HTMLRegEx.P2.matcher(htmlStr); // P2 FIND'S COMMENT NODES while (m.find()) { CommentNode newCommentNode = new CommentNode(m.group()); int start = m.start(); // The 'Primary' (Core) Parser will append parsed HTMLNode's to the Vector 'ret' // The HTML is 'split' by comment-nodes first! if (start > end) ParserREInternal.getTokens( ret, htmlStr, end /* previous value of end */, start, eliminateHTMLTags, matchesFile, justTextFile ); // NOTICE THE ORDER: Use the "previous value" of 'end', then update end = m.end(); // NOTICE THE ORDER: The HTML *before* the Comment RegEx Matcher is parsed, and // incorporated into the return vector first. Then the Comment that was matched is // added to the Vector. ret.addElement(newCommentNode); // LEGACY FEATURE: Keeping the "matches" file output is very good for debugging, and // error checking. if (matchesFile != null) FileRW.appendToFile("COMMENT:\t[" + newCommentNode.str + "]\n", matchesFile); } // if the last CommentNode had more HTML/TextNode's after it, this text also has to be // parsed. This text is demarcated by [end, htmlStr.length()], where the value of 'end' is // the index of the end of the last/final CommentNode RegEx Match. if (end < htmlStr.length()) ParserREInternal.getTokens( ret, htmlStr, end, htmlStr.length(), eliminateHTMLTags, matchesFile, justTextFile ); return ret; } } |