1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | package Torello.HTML; import java.util.regex.*; final class HTMLRegEx { // Used by class ParserRE to parse comment nodes, and by Torello.HTML.TagNode static final Pattern P1 = Pattern.compile( "<\\/?(\\w{1,127})" + "(?:" + "[\\w-]+=\"[^\"]*?\"" + "|" // attribute="any valid string, without (the same) quote" + "[\\w-]+='[^']*?'" + "|" // attribute='any valid string without (the same) quote' + "[\\w-]+=[\\w-]*" + "|" // attribute=any-valid-string-no-spaces-or-punctuation-etc + "[\\w-]+" + "|" // attribute + "\\s+" + "|" // any white-space + "[^>]+" + ")*" // Any miscellaneous characters ("Junk?"), *EXCEPT* a "greater-than" // (MUST be THE LAST or-option) // NOTE: The above "|" (or-branch), *MUST* be at the end... or else // It will match everything, (except '>'), and miss the whole point. // (Specifically, the first three attribute-value pair clauses are // how to avoid the greater-than-within-tag problem!!! + "\\/?>", // Ending-HTML-Tag symbol is a "greater-than" or "slash-greater-than" Pattern.DOTALL ); // (Package-Local RegEx) Used by class ParserRE and ParserHM to parse comment-nodes. static final Pattern P2 = Pattern.compile("<!--.*?-->", Pattern.DOTALL); } |