1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package Torello.HTML.parse;

import Torello.HTML.*;
import Torello.Java.FileRW;

import java.util.Vector;
import java.io.IOException;
import java.util.regex.*;

/**
 * Entry point that splits an HTML page around its comment nodes and delegates the remaining
 * (non-comment) spans to {@code ParserREInternal.getTokens}.
 */
public final class ParserRE
{
    /**
     * Converts a page of HTML into a {@code Vector} of {@code HTMLNode}.
     *
     * <p>The text is scanned with {@code HTMLRegEx.P2} (the comment pattern).  Every span that
     * lies between two consecutive comment matches, before the first one, or after the last
     * one is handed to {@code ParserREInternal.getTokens}, which receives the result vector
     * and the span boundaries.  Each matched comment is wrapped in a {@code CommentNode} and
     * appended to the result right after the span that precedes it.
     *
     * @param html the HTML to parse; converted once via {@code toString()}.
     *
     * @param eliminateHTMLTags forwarded unchanged to {@code ParserREInternal.getTokens}.
     *
     * @param rawHTMLFile when non-null, {@code html} is written to this file before parsing.
     *
     * @param matchesFile when non-null, an empty string is first written to this file, then
     * one {@code "COMMENT:"} line is appended for each comment found.  The name is also
     * forwarded to {@code ParserREInternal.getTokens}.
     *
     * @param justTextFile when non-null, an empty string is first written to this file.  The
     * name is also forwarded to {@code ParserREInternal.getTokens}.
     *
     * @return the vector holding the nodes in the order they were produced.
     *
     * @throws IOException propagated from the {@code FileRW} calls, or from the internal
     * parser.
     */
    public static Vector<HTMLNode> parsePageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        // Debugging outputs: dump the raw input, and start the two log files with empty content.
        if (rawHTMLFile != null)    FileRW.writeFile(html, rawHTMLFile);
        if (matchesFile != null)    FileRW.writeFile("", matchesFile);
        if (justTextFile != null)   FileRW.writeFile("", justTextFile);

        final Vector<HTMLNode>  nodes           = new Vector<>();
        final String            source          = html.toString();
        final Matcher           commentMatcher  = HTMLRegEx.P2.matcher(source);

        // Index just past the most recent comment match (0 until the first match is seen).
        int cursor = 0;

        while (commentMatcher.find())
        {
            final CommentNode comment = new CommentNode(commentMatcher.group());

            // Whatever sits between the previous comment and this one is parsed first, so
            // that its nodes land in the vector ahead of the comment itself.
            parseSpan(
                nodes, source, cursor, commentMatcher.start(),
                eliminateHTMLTags, matchesFile, justTextFile
            );

            cursor = commentMatcher.end();
            nodes.add(comment);

            // Legacy debugging aid: log each comment that was matched.
            if (matchesFile != null)
                FileRW.appendToFile("COMMENT:\t[" + comment.str + "]\n", matchesFile);
        }

        // Trailing text after the final comment (or the whole page when no comment matched).
        parseSpan(
            nodes, source, cursor, source.length(),
            eliminateHTMLTags, matchesFile, justTextFile
        );

        return nodes;
    }

    // Delegates the span [from, to) of 'source' to the internal parser; empty spans are skipped.
    private static void parseSpan(
            Vector<HTMLNode> nodes, String source, int from, int to,
            boolean eliminateHTMLTags, String matchesFile, String justTextFile
        )
        throws IOException
    {
        if (from >= to) return;

        ParserREInternal.getTokens(
            nodes, source, from, to,
            eliminateHTMLTags, matchesFile, justTextFile
        );
    }
}