// ParserREInternal.java.html

package Torello.HTML.HelperPackages.parse;

import Torello.HTML.*;
import Torello.Java.FileRW;

import java.util.Vector;
import java.io.IOException;
import java.util.regex.*;

/**
 * Package-internal tokenizer that splits a region of an HTML string into text and tag nodes.
 *
 * <p>The scan is a hand-written pre-filter in front of the {@code HTMLRegEx.P1} matcher: it
 * looks for a {@code '<'}, reads the alpha-numeric name that follows it, asks {@code HTMLTags}
 * whether that name is registered, and only then lets the RegEx try to match at that
 * position.</p>
 */
public final class ParserREInternal
{
    /**
     * Tells whether {@code c} may directly follow a tag/token name inside an HTML element.
     *
     * <p>Accepted characters are {@code '>'}, {@code '/'} and any character for which
     * {@link Character#isWhitespace(char)} is {@code true} and whose value is at most 255.
     * This is (a slight super-set of) the character class {@code [\s>\/]}.</p>
     *
     * <p>The forward-slash is accepted so that self-closing elements written without a space,
     * such as {@code <br/>}, are handed to the RegEx instead of being rejected up front.
     * NOTE(review): this presumes {@code HTMLRegEx.P1} and {@code TagNode(String)} cope with
     * that form - confirm against those classes.</p>
     *
     * <p>White-space above 255 (UNICODE space characters) is refused on purpose: letting such
     * characters through once crashed the parser.</p>
     *
     * @param c the first non alpha-numeric character found after the candidate token
     * @return {@code true} if the candidate may still be an HTML element
     */
    private static boolean isValidCharAfterToken(char c)
    {
        if ((c == '>') || (c == '/')) return true;

        return (c <= 255) && Character.isWhitespace(c);
    }

    /**
     * Appends one run of text to whichever log buffers are active.  Text that is empty after
     * {@code trim()} is not logged at all.
     *
     * @param text     the un-trimmed text between two HTML elements (or after the last one)
     * @param justText buffer receiving the un-trimmed text, or {@code null} if not requested
     * @param matches  buffer receiving a {@code TEXT:} line with the trimmed text, or
     *                 {@code null} if not requested
     */
    private static void logText(String text, StringBuffer justText, StringBuffer matches)
    {
        // Nothing to do when neither log was requested; skip the trim altogether.
        if ((justText == null) && (matches == null)) return;

        String trimmedStr = text.trim();

        if (trimmedStr.length() == 0) return;

        // The un-trimmed text-line
        if (justText != null)   justText.append(text);

        // The trimmed text-line
        if (matches != null)    matches.append("TEXT:\t\t[" + trimmedStr + "]\n");
    }

    /**
     * Scans {@code htmlStr} between {@code sPos} and {@code ePos} and appends the nodes that
     * are found to {@code ret}, in document order.
     *
     * <p>Text lying between two matched elements, and any text after the last matched
     * element, is added as a {@code TextNode}.  A matched element is added as a
     * {@code TagNode} unless {@code eliminateHTMLTags} is set.</p>
     *
     * @param ret               receives the nodes; it is only ever appended to
     * @param htmlStr           the HTML text to scan
     * @param sPos              index at which scanning starts
     * @param ePos              index treated as end-of-input; characters at or beyond it are
     *                          never read
     * @param eliminateHTMLTags when {@code true} the {@code TagNode}s are dropped and only
     *                          {@code TextNode}s are added to {@code ret}
     * @param matchesFile       if non-null, a log of every matched element and every
     *                          non-blank text run is appended to this file
     * @param justTextFile      if non-null, every non-blank text run (un-trimmed) is appended
     *                          to this file
     * @throws IOException presumably raised by {@code FileRW.appendToFile} when a log file
     *                     cannot be written
     */
    static void getTokens(
            Vector<HTMLNode> ret,
            String htmlStr, int sPos, int ePos,
            boolean eliminateHTMLTags,
            String matchesFile, String justTextFile
        )
        throws IOException
    {
        final boolean       logMatches  = matchesFile != null;
        final boolean       logJustText = justTextFile != null;

        final Matcher       m           = HTMLRegEx.P1.matcher(htmlStr);
        final StringBuffer  matches     = logMatches    ? new StringBuffer() : null;
        final StringBuffer  justText    = logJustText   ? new StringBuffer() : null;
        int                 end         = sPos; // End of the most recent RegEx match
        int                 cursor      = sPos; // The "loop counter"
        final int           HTML_EOF    = ePos; // The "imagined EOF" (prevents sub-string)
        final byte          MAX_TOK_LEN = HTMLTags.maxTokenLength();
                            // Longest (currently registered) HTML String-Token Length


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Main loop breaks whenever one of the inner while-loops reaches EOF...
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        while (true)
        {
            // Every HTML Element begins with the less-than '<' symbol.  Advance the cursor
            // until one is found, or we fall off the end of the page.

            while (     (cursor < HTML_EOF)
                    &&  (htmlStr.charAt(cursor) != '<'))
                cursor++;

            // If we have reached EOF (or sub-page), before finding the '<' STOP IMMEDIATELY.
            if (cursor == HTML_EOF)                                 break;

            // 'start' holds the position of the less-than symbol that was just found
            final int start = cursor;
            cursor++;
            if (cursor == HTML_EOF)                                 break;

            // This may be a "Closing Tag" - if so, advance the cursor one extra place
            TC openOrClosed = TC.OpeningTags;

            if (htmlStr.charAt(cursor) == '/')
            {
                openOrClosed = TC.ClosingTags;
                cursor++;
                if (cursor == HTML_EOF)                             break;
            }

            // For opening and closing tags alike, 'tokStartPos' is the string-index at which
            // the HTML Element Tag/Token name begins.

            final int tokStartPos = cursor;

            // Tag/Token names are Alpha-Numeric.  Keep advancing the cursor until EOF,
            // Token-too-long, or a non Alpha-Numeric char is found.

            while (     (cursor < HTML_EOF)
                    &&  ((cursor - tokStartPos) <= MAX_TOK_LEN)
                    &&  Character.isLetterOrDigit(htmlStr.charAt(cursor))
                )
                cursor++;

            // If EOF was reached first, then exit main loop IMMEDIATELY.
            if (cursor == HTML_EOF)                                 break;

            // If the Token String would be too long to match a valid token, start over.
            if ((cursor - tokStartPos) > MAX_TOK_LEN)               continue;

            // The first non-alpha-numeric char must be '>', '/' or (non-UNICODE) white-space.
            // Anything else cannot be an HTML Element, so start over.

            if (! isValidCharAfterToken(htmlStr.charAt(cursor)))    continue;

            // This is the "potential" HTML Element tag/token name.  It might not actually be
            // an HTML Element name.

            String token = htmlStr.substring(tokStartPos, cursor);

            // 'hasTag' returns null when the token is not a valid HTML Element name.
            TagNode tn = HTMLTags.hasTag(token, openOrClosed);

            // If this wasn't a valid HTML Element name, then skip it, and start over.
            if (tn == null)                                         continue;

            // Restrict the RegEx Matcher to the region that begins at the '<' saved in
            // 'start' and ends at the imagined EOF.

            m.region(start, HTML_EOF);

            // 'lookingAt' only succeeds if the match begins exactly at the region start (as if
            // the RegEx had a leading '^', but no trailing '$').  If it fails, this simply
            // cannot be an HTML Element.

            if (! m.lookingAt())                                    continue;

            // The complete HTML Element (including any attributes) from the opening '<' to the
            // closing '>' symbols.

            String htmlTag = m.group();

            // Use the "pre-instantiated" TagNode - UNLESS the matched element is a longer
            // string (it has "innerTag" information like class="..." or HREF="...")

            if (tn.str.length() != htmlTag.length()) tn = new TagNode(htmlTag);

            // All of the text between the PREVIOUS RegEx match and the start of the CURRENT
            // one.  ORDER IS IMPORTANT: 'end' must be read before it is overwritten.

            String text = htmlStr.substring(end, start);

            end     = m.end();
            cursor  = end;

            // 'text' is non-empty if any characters occurred between the two matches.
            if (text.length() != 0) ret.addElement(new TextNode(text));

            // One (LEGACY) feature that is being maintained: eliminate all the HTML TagNodes,
            // and only return the TextNodes...

            if (! eliminateHTMLTags) ret.addElement(tn);

            // White-space only text is not logged.
            logText(text, justText, matches);

            // EXTREMELY USEFUL (LEGACY / DEBUGGING) Feature, that will be kept / maintained.
            if (logMatches) matches.append("GROUP():\t[" + htmlTag + "]\n");
        }


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // MAIN WHILE LOOP END
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // 'end' is the end-position of the very last RegEx match.  Whatever lies between it
        // and the imagined EOF is trailing non-HTML text; without this step the last piece of
        // text on a page (or in a short HTML snippet) would be dropped.

        if (end != HTML_EOF)
        {
            String text = htmlStr.substring(end, HTML_EOF);

            ret.addElement(new TextNode(text));

            logText(text, justText, matches);
        }

        // Write these String Buffers to a file.
        if (logMatches)     FileRW.appendToFile(matches,    matchesFile);
        if (logJustText)    FileRW.appendToFile(justText,   justTextFile);
    }
}