1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package Torello.HTML;

import java.util.regex.*;

final class HTMLRegEx
{
    // ---------------------------------------------------------------------------------------
    // Building blocks of the tag expression.  Each fragment is kept in its own constant so
    // that the role of every piece of the final pattern can be read on its own.
    // ---------------------------------------------------------------------------------------

    // '<', an optional '/', then the element name: 1 to 127 word-characters (capture group 1).
    private static final String TAG_START = "<\\/?(\\w{1,127})";

    // name="value" - the value may hold anything except a double-quote (so it may hold '>').
    private static final String ATTR_DOUBLE_QUOTED = "[\\w-]+=\"[^\"]*?\"";

    // name='value' - the value may hold anything except a single-quote (so it may hold '>').
    private static final String ATTR_SINGLE_QUOTED = "[\\w-]+='[^']*?'";

    // name=value - un-quoted value made only of word-characters and dashes (possibly empty).
    private static final String ATTR_UNQUOTED = "[\\w-]+=[\\w-]*";

    // name - an attribute that has no value at all.
    private static final String ATTR_NAME_ONLY = "[\\w-]+";

    // A run of white-space between the pieces of a tag.
    private static final String WHITE_SPACE = "\\s+";

    // "Junk": a run of any characters other than '>'.
    private static final String ANYTHING_BUT_GT = "[^>]+";

    // An optional '/' followed by the '>' that terminates the tag.
    private static final String TAG_END = "\\/?>";

    // Matches a single opening or closing HTML tag, with the element name in capture group 1.
    // (Per the original notes in this file: used by class ParserRE and by
    // Torello.HTML.TagNode.)
    //
    // ORDER MATTERS: the alternatives are tried left-to-right.  The two quoted attribute
    // forms come first so that a '>' sitting inside a quoted value is consumed as part of
    // that value rather than being taken as the end of the tag.  The "anything but '>'"
    // alternative has to stay LAST; placed earlier it would swallow everything up to the
    // first '>' and the quoted-value handling would never get a chance to run.
    //
    // The expression contains no '.', so Pattern.DOTALL does not alter what it matches; the
    // flag is retained so that the compiled Pattern reports the same flags as before.
    //
    // NOTE(review): the repeated group ends with "[^>]+", a quantifier nested inside another
    //               quantifier.  On text that has a '<name' but no closing '>' this shape may
    //               backtrack heavily on some JVM versions - TODO confirm / measure.
    static final Pattern P1 = Pattern.compile(
        TAG_START
            + "(?:"
            + String.join(
                "|",
                ATTR_DOUBLE_QUOTED,
                ATTR_SINGLE_QUOTED,
                ATTR_UNQUOTED,
                ATTR_NAME_ONLY,
                WHITE_SPACE,
                ANYTHING_BUT_GT     // must remain the final alternative
            )
            + ")*"
            + TAG_END,
        Pattern.DOTALL
    );

    // Matches an HTML comment: "<!--", then the shortest possible run of characters, then
    // "-->".  Pattern.DOTALL lets '.' match line terminators, so a comment may span lines.
    // (Per the original notes in this file: package-local, used by class ParserRE and
    // ParserHM to parse comment-nodes.)
    static final Pattern P2 = Pattern.compile("<!--.*?-->", Pattern.DOTALL);
}