// Str.java.html

package Torello.CSS;

import Torello.Java.Additional.ByRef;

import java.util.Vector;
import java.util.function.Consumer;
import java.util.stream.IntStream;

/**
 * CSS-Tokenizer Class for {@code String}-Literals.
 *
 * <BR /><BR />Instances of this class are produced by the {@code static} methods
 * {@code consume} (used while tokenizing a Code-Point array) and {@code build} (used to tokenize
 * a single {@code String}-Literal supplied by the user).  The constructor is {@code private}.
 */
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
public class Str extends CSSToken
    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Public & Final Fields
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * The quotation mark type used to quote this {@code String}-Literal.  The value placed in
     * this Java {@code char} primitive may only be a Single-Quotation Mark, or a Double-Quotation.
     * No other types of quotations are included in this class parser.
     */
    public final char quote;

    /**
     * This is the actual {@code String}-Literal that this {@code CSSToken} represents.  This Java
     * {@code String} will never actually contain the opening and closing quotation marks that were
     * used to create this {@code String}.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Unescaped String:</B>
     * 
     * <BR />If this {@code String} utilized any Escape-Sequences representing Unicode Characters,
     * the Unescaped-Characters are used within this {@code String} to replace the original,
     * escaped, sequences.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Provided Example:</B>
     * 
     * <BR />There is a quoted, {@code String} below, provided by AI.  Note that ChatGPT 
     * initially gave me a slightly different answer written as <CODE>"\u2713 Checkmark"</CODE>
     * (which you may or may not notice has a {@code 'u'} character between the Reverse-Solidus
     * Backslash character and the Hexadecimal Characters {@code '2713'}).
     * 
     * <BR /><BR />After further research, ChatGPT apologized for its mistake saying:
     * 
     * <BR /><BR /><I>"You're correct, and I apologize for the oversight. In CSS, Unicode escape
     * sequences within string literals do not start with the u character. Instead, they consist of
     * a backslash followed by up to six hexadecimal digits, representing the Unicode code point."
     * </I>
     * 
     * <BR /><BR />This crap sort of amazes me.  I really can't believe it.  Anything that Stack
     * Overflow is busy condemning, with a very high likelihood is bound to be pretty good.
     * 
     * <DIV CLASS=CSS>{@code
     * .selector::before
     * {
     *     content: "\2713 Checkmark";
     *     font-family: Arial, sans-serif;
     * }
     * }</DIV>
     * 
     * <BR />The above CSS-{@code String} (which is inside the {@code 'content'} Property-Value)
     * would be stored in the field {@code 'unescaped'} as: <CODE>&#x2713; Checkmark</CODE>.
     * <BR />
     */
    public final String unescaped;


    // ********************************************************************************************
    // ********************************************************************************************
    // Private Constructor, API "is" and "if" Methods
    // ********************************************************************************************
    // ********************************************************************************************


    // Builds a String-Literal token covering css[sPos] (the opening quote) up to ePos.
    // 'b' holds the already-unescaped contents of the literal, without the quotation marks.

    private Str(
            final int[]             css,
            final int               sPos,
            final int               ePos, 
            final IntStream.Builder b
        )
    {
        super(css, sPos, ePos);

        // The code-point at the start position is the opening quotation mark.
        this.quote = (char) css[sPos];

        // This Code-Points array contains the String-Contents.  Note that this array WILL NOT
        // contain the starting and ending quotation-marks.  Also, if there were any escaped 
        // characters / code-points in the String, they will have been unescaped, since there is no
        // longer any need to have them remain escaped.

        final int[] codePoints = b.build().toArray();

        // This does nothing more than convert an int[] Code-Points array to a java.lang.String.
        // This is one of Java's String Constructors.

        this.unescaped = new String(codePoints, 0, codePoints.length);
    }

    @Override 
    public final boolean isStr() { return true; }

    @Override
    public final Str ifStr() { return this; }


    // ********************************************************************************************
    // ********************************************************************************************
    // Tokenizer's "is" Method(s)
    // ********************************************************************************************
    // ********************************************************************************************


    // Returns true when 'codePoint' is a Single-Quote or a Double-Quote, which are the only two
    // code-points that may open a String-Literal in this parser.

    static boolean is(final int codePoint)
    { return (codePoint == '\'') || (codePoint == '"'); }


    // ********************************************************************************************
    // ********************************************************************************************
    // User's Constructor: a static "build" method
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=stringLiteral>
     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
     * @param stringLiteral <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
     */
    @SuppressWarnings("unchecked")
    public static Str build(final String stringLiteral)
    { return (Str) CSSToken.build(stringLiteral, INPUT_CHECKER, Str::consume); }

    // Rejects any input whose first code-point is not a quotation mark.  The length check makes
    // sure that an empty code-point array produces a TokenizeException, rather than an
    // ArrayIndexOutOfBoundsException from the 'css[0]' access.

    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
    {
        if ((css.length == 0) || (! is(css[0]))) throw new TokenizeException
            ("Input String does not start with a valid CSS String-Literal");
    };


    // ********************************************************************************************
    // ********************************************************************************************
    // CONSUME
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // COPIED FROM:
    // https://drafts.csswg.org/css-syntax-3/#consume-a-string-token
    //
    // COPIED ON:
    // March 26th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.5. Consume a string token
    //
    // This section describes how to consume a string token from a stream of code points. It
    // returns either a <string-token> or <bad-string-token>.
    // 
    // This algorithm may be called with an ending code point, which denotes the code point that
    // ends the string. If an ending code point is not specified, the current input code point is
    // used.
    // 
    // Initially create a <string-token> with its value set to the empty string.
    // [I interpret this as follows:]
    //      final int               quote   = tr.css[tr.pos];
    //      final IntStream.Builder b       = IntStream.builder();
    //
    // Repeatedly consume the next input code point from the stream:
    // [This is fancy W3C Pseudo-Code Talk for a `use a for-loop`]
    // [Below is a For-Loop Body, written in W3C Pseudo-Code]
    // 
    // ** ending code point [NOTE: This says: The Matching-and-Closing Quotation-Mark]
    //      ==> Return the <string-token>.
    //
    // ** EOF
    //      ==> This is a parse error. Return the <string-token>.
    // 
    // ** newline
    //      ==> This is a parse error. Reconsume the current input code point, create a
    //          <bad-string-token>, and return it.
    // 
    // ** U+005C REVERSE SOLIDUS (\)
    //      ==> 1) If the next input code point is EOF, do nothing.
    //          2) Otherwise, if the next input code point is a newline, consume it.
    //          3) Otherwise, (the stream starts with a valid escape) consume an escaped code point
    //             and append the returned code point to the <string-token>’s value.
    // 
    // ** anything else
    //      ==> Append the current input code point to the <string-token>’s value.
    //          [Which I do using an IntStream.Builder as: b.accept(c);]
    //
    // A Bug?  I don't know if I'm misunderstanding, but I think there is a minor bug in
    // the above Pseudo-Code.  For the character after the Reverse-Solidus Option, Option #2 says
    // Consume the next-newline, but it doesn't seem to skip all tabs and spaces after the newline
    // has been consumed.  This method skips all ' ' and '\t' characters after consuming the
    // newline.
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code String}-Literal from the
     * input Code-Point Array.
     * 
     * <EMBED CLASS=defs DATA-TOK=String-Literal DATA-URL=consume-string-token DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOKEN>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOK_SVG>
     * 
     * @param css The CSS input, as an array of Code-Points.
     * 
     * @param POS On entry, {@code POS.f} must be the index of the opening quotation mark.  On
     * return, {@code POS.f} has been set to the index at which consumption stopped: one past the
     * closing quotation mark, the index of an un-escaped new-line, or the end of the input.
     * 
     * @param returnParsedToken Receives the token that was built.  This is a {@code Str} when
     * the matching, closing quotation mark was found, and a {@code BadStr} otherwise.
     * 
     * @param errorEncountered Receives a {@code TokenizeError} when an un-escaped new-line or
     * the end of the input is reached before the closing quotation mark.
     */
    public static void consume(                                 // When invoked from 'CSSTokenizer'
            final int[]                     css,                // C, int[] css
            final ByRef<Integer>            POS,                // P, array-pos loop-variable
            final Consumer<CSSToken>        returnParsedToken,  // T, Vector<CSSToken>.add
            final Consumer<TokenizeError>   errorEncountered    // E, Vector<TokenizeError>.add
        )
    {
        final int               quote   = css[POS.f];
        final IntStream.Builder b       = IntStream.builder();

        int pos = POS.f + 1;
        int c;

        WHILE_LOOP:
        while (pos < css.length) switch (c = css[pos])
        {
            case '\'':
            case '"':

                // This if-statement is asking whether or not the quotation that was just
                // encountered is the same quotation that was used to start this string-literal.

                if (c == quote)
                {
                    returnParsedToken.accept(new Str(css, POS.f, ++pos, b));
                    POS.f = pos;
                    return;
                }

                // Again, the IntStream is there to help build the "Un-Escaped" Version of the
                // String.  Note that this IntStream.Builder would be completely unnecessary if it
                // weren't for the Escaped-Characters.  If there are not any Escaped-Chars, one
                // could simply build the String using the beginning and ending indices from the
                // int[] css array.
                //
                // ALSO: 
                //
                // Since Java doesn't allow the variable `quote` inside of a `case` branch, BOTH
                // '\'' and '"' have to be in the case.  For that reason alone, we also have to do
                // this `else` part, otherwise, this `else` branch would be handled automatically
                // by this Switch-Statement's `default` clause.
                //
                // FINALLY:
                //
                // This 'c' is just a Double-Quote that has harmlessly been placed within a String
                // that uses Single-Quotes...  **OR** a harmless Single-Quote that was placed 
                // within a String-Literal that was specified using Double-Quotes.

                b.accept(c);

                pos++;
                continue WHILE_LOOP;

            case '\n':
            case '\r':
            case '\f':

                // Note that, here, a `\r\n` is irrelevant, because this character will be
                // "Re-Consumed" by the loop that called this method.  If this is a 2-character
                // New-Line, it will be handled by whatever surrounding method called this.

                returnParsedToken.accept(new BadStr(css, POS.f, pos, b));

                errorEncountered.accept(
                    new TokenizeError(
                        css, POS.f, pos, Str.class,
                        "A String-Literal's Open-Quotation was found but a new-line character " +
                        "has been encountered before the Matching, Closing Quotation-Mark was " +
                        "identified."
                    ));

                POS.f = pos;
                return;

            // From the Pseudo-Code at the top of this method, this is the case:
            // U+005C REVERSE SOLIDUS (\)

            case '\\':

                // EOF-Reached: there was a "REVERSE SOLIDUS" at the very last character of the
                // CSS-File or String.  The loop ends right after this, and a 'BadStr' is built.
                //
                // NOTE(review): the Pseudo-Code quoted above says to "do nothing" in this case,
                // whereas this code appends a U+FFFD REPLACEMENT CHARACTER.  This is kept as-is
                // so that the contents of the resulting 'BadStr' do not change.

                if ((++pos) == css.length) b.accept(0xFFFD);

                // Here, if there is a `\r\n` it needs to be "consumed" immediately.  It cannot be
                // ignored here.  Note that the original Pseudo-Code that was copied from:
                //
                // https://drafts.csswg.org/css-syntax-3/#consume-an-escaped-code-point
                //
                // There, it states that there should be a CSS "Pre-Processor Phase" that
                // eliminates all '\r' and '\r\n' characters and replaces them with '\n'
                //
                // In Java-HTML, it is somewhat imperative that an HTML File can be perfectly
                // re-constructed from a Vector<HTMLNode>.  Therefore the top-level design decision
                // has been made such that any Vector<CSSToken> can be used to perfectly
                // reconstruct a CSS File
                // 
                // As a result, running the CSS Pre-Processor suggested by the Web-Site is being
                // ignored.  This means there are a few small places that need to handle
                // "New-Lines" slightly more carefully.  This is one of them.

                else if (((c = css[pos]) == '\n') || (c == '\f'))
                {
                    // Step over the single-character new-line, and then over any spaces or tabs
                    // that begin the next line of text.

                    pos = skipSpacesAndTabs(css, pos + 1);
                }

                else if (c == '\r')
                {
                    // There was a properly-escaped new-line, but unfortunately, the next line was
                    // empty.  This will be turned into a 'BadStr', at the very end of this method.

                    if (++pos == css.length) break WHILE_LOOP;

                    // This is how `\r\n` is handled:  SKIP the subsequent `\n` after the `\r`
                    if (css[pos] == '\n') pos++;

                    // After the Reverse-Solidus (BackSlash), and after the single or double
                    // character new-line, then skip any spaces or tabs on the next line of text.

                    pos = skipSpacesAndTabs(css, pos);
                }

                else pos = CSSUtil.consumeEscapedUnicode(css, pos, b);

                continue WHILE_LOOP;

            default:
                b.accept(c);
                pos++;
        }

        // These statements are only reachable if the above loop was terminated due to the value
        // of 'pos' reaching the end of the css code-point array.  If these statements are
        // reached, this is guaranteed to be an error.

        returnParsedToken.accept(new BadStr(css, POS.f, css.length, b));

        errorEncountered.accept(
            new TokenizeError(
                css, POS.f, pos, Str.class,
                "A String Literal's Open-Quotation was found, but unfortunately EOF was reached " +
                "before identifying the Matching, Closing-Quotation mark"
            ));

        POS.f = pos;
    }

    // Returns the first index, at or after 'pos', whose code-point is neither a space nor a tab.
    // Returns 'css.length' when the remainder of the array contains only spaces and tabs.
    //
    // The bounds-check guards BOTH character comparisons.  The loops this helper replaces were
    // written as `(inBounds && isSpace) || isTab`, which kept iterating past the end of the array
    // whenever the last code-point that had been read was a tab.

    private static int skipSpacesAndTabs(final int[] css, int pos)
    {
        while ((pos < css.length) && ((css[pos] == ' ') || (css[pos] == '\t'))) pos++;
        return pos;
    }
}