// Identifier.java.html

package Torello.CSS;

import Torello.Java.Additional.ByRef;

import java.util.Vector;
import java.util.stream.IntStream;
import java.util.function.Consumer;

@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
public class Identifier extends CSSToken
    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
{
    // Serialization version number for this class (the class declares java.io.Serializable).
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Public & Final Fields
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This contains the <B STYLE='color: red;'><I>unescaped</I></B> text that constitutes
     * this identifier.  Identifiers are permitted to use Escaped-Unicode Sequences.  If any
     * characters were escaped, this {@code String} will have the unescaped variant of the
     * {@code String} stored here.
     */
    public final String identifier;


    // ********************************************************************************************
    // ********************************************************************************************
    // Private Constructor, API "is" and "if" Methods
    // ********************************************************************************************
    // ********************************************************************************************


    // Package-private constructor.  The code-point array and the start / end positions are
    // forwarded, unchanged, to the CSSToken super-constructor; the identifier text is stored in
    // the public, final field 'identifier'.
    Identifier(final int[] css, final int sPos, final int ePos, final String identifier)
    {
        super(css, sPos, ePos);

        this.identifier = identifier;
    }

    // An instance of this class always answers 'true' here.
    @Override
    public final boolean isIdentifier()
    {
        return true;
    }

    // An instance of this class always hands back itself here.
    @Override
    public final Identifier ifIdentifier()
    {
        return this;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // User's Constructor: a static "build" method
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS=defs DATA-TOK=Identifier DATA-P=identStr>
     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
     * @param identStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
     */
    @SuppressWarnings("unchecked")
    public static Identifier build(final String identStr)
    {
        // The input-checker and the private "consume" method of this class are both handed to
        // CSSToken.build; whatever that call returns is cast to 'Identifier'.
        return (Identifier) CSSToken.build
            (identStr, INPUT_CHECKER, Identifier::_PRIVATE_CONSUME);
    }

    // Input-Checker which the "build" method passes to CSSToken.build.  It throws a
    // TokenizeException for an empty code-point array, and for any array whose first code-points
    // do not start an identifier sequence (as decided by 'startsIdentSequence').
    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
    {
        if (css.length == 0)
            throw new TokenizeException(Identifier.class);

        if (startsIdentSequence(css, 0)) return;

        throw new TokenizeException
            ("String-text beginning does not constitute a valid CSS Identifier-Token");
    };


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Note: This is an "Extra Consume" Method.  Class Identifier has no straight-forward consumer
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    //
    // Static-Builder Methods really aren't 1/100th as nice and direct as an actual constructor.
    // As I explained in the class CSSToken, there is not really any way around this - unless I
    // were to completely clobber the CSS-Working-Group's Pseudo-Code and extra provided-diagrams.
    //
    // These "Static Build" methods aren't that bad, but since "Class Identifier" is the second
    // biggest of the Data-Classes / Parse-Classes, this "build" method is slightly more
    // complicated
    //
    // This method is a PRIVATE-METHOD which is only invoked by the "build"-Method that is directly
    // above!  The code below is DOING ABSOLUTELY-NOTHING but adding and removing "wrappers".
    // Wrappers inside of code usually really are a bit ugly (at least to me)...
    //
    // This stuff has been thoroughly tested, and it does work.  I ran this through a bunch of
    // different edge-cases.

    private static final void _PRIVATE_CONSUME(
            final int[]                 css,
            final ByRef<Integer>        POS,
            final Consumer<CSSToken>    returnParsedToken 
        )
    {
        // This method only adds and removes "wrappers" (ByRef / Consumer), so that its
        // parameter-list has the same shape as the "consume" methods of the other token classes.
        // It is private, and within this class it is referenced only by the "build" method.
        //
        // 'consumeIdentSequence' writes the parsed text into the ByRef, and returns the
        // array-index at which it stopped consuming.

        final int           sPos    = POS.f;
        final ByRef<String> text    = new ByRef<>(null);
        final int           ePos    = Identifier.consumeIdentSequence(css, sPos, text);
        final Identifier    token   = new Identifier(css, sPos, ePos, text.f);

        returnParsedToken.accept(token);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Tokenizer's "is" Method(s)
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from: 
    // https://drafts.csswg.org/css-syntax-3/#ident-code-point
    // March 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // ** ident code point
    //      ==> An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-).
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    // Answers 'true' for an ident-start code point, an ASCII digit, or a hyphen-minus.
    static boolean isIdentCodePoint(int codePoint)
    {
        final boolean asciiDigit = ('0' <= codePoint) && (codePoint <= '9');

        return isIdentStartCodePoint(codePoint) || asciiDigit || (codePoint == '-');
    }


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from: 
    // https://drafts.csswg.org/css-syntax-3/#ident-start-code-point
    // March 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // ** ident-start code point
    //      ==> A letter, a non-ASCII ident code point, or U+005F LOW LINE (_).
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    // Answers 'true' for an ASCII letter (either case), a low-line / underscore, or a non-ASCII
    // ident code point.
    static boolean isIdentStartCodePoint(int codePoint)
    {
        final boolean lowerCase = ('a' <= codePoint) && (codePoint <= 'z');
        final boolean upperCase = ('A' <= codePoint) && (codePoint <= 'Z');

        return      lowerCase
                ||  upperCase
                ||  (codePoint == '_')
                ||  isNonASCIIIdentCodePoint(codePoint);
    }


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from: 
    // https://drafts.csswg.org/css-syntax-3/#non-ascii-ident-code-point
    // March 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // ** non-ASCII ident code point
    //      ==> A code point whose value is any of:
    //          * U+00B7
    //          * between U+00C0 and U+00D6
    //          * between U+00D8 and U+00F6
    //          * between U+00F8 and U+037D
    //          * between U+037F and U+1FFF
    //          * U+200C, U+200D, U+203F, U+2040
    //          * between U+2070 and U+218F
    //          * between U+2C00 and U+2FEF
    //          * between U+3001 and U+D7FF
    //          * between U+F900 and U+FDCF
    //          * between U+FDF0 and U+FFFD
    //          * greater than or equal to U+10000
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    // Answers 'true' when 'c' is one of the "non-ASCII ident code points":
    //
    //      U+00B7, U+00C0-U+00D6, U+00D8-U+00F6, U+00F8-U+037D, U+037F-U+1FFF,
    //      U+200C, U+200D, U+203F, U+2040, U+2070-U+218F, U+2C00-U+2FEF, U+3001-U+D7FF,
    //      U+F900-U+FDCF, U+FDF0-U+FFFD, and anything greater than or equal to U+10000
    //
    // FIXED: The lower bound of the U+3001-U+D7FF range used to be written as 0x2001, which
    // wrongly accepted every code point in U+2001 through U+3000 (for instance U+2028
    // LINE SEPARATOR, and U+3000 IDEOGRAPHIC SPACE).

    static boolean isNonASCIIIdentCodePoint(int c)
    {
        if (c == 0x00B7) return true;

        // between U+00C0 and U+00D6
        // between U+00D8 and U+00F6
        // between U+00F8 and U+037D
        // between U+037F and U+1FFF
        //
        // These four ranges are contiguous, except for three excluded code points

        if ((c >= 0x00C0) && (c <= 0x1FFF))
        {
            return      (c != 0x00D7)
                    &&  (c != 0x00F7)
                    &&  (c != 0x037E);
        }

        // U+200C, U+200D, U+203F, U+2040
        if (c == 0x200C) return true;
        if (c == 0x200D) return true;
        if (c == 0x203F) return true;
        if (c == 0x2040) return true;

        // between U+2070 and U+218F
        if ((c >= 0x2070) && (c <= 0x218F)) return true;

        // between U+2C00 and U+2FEF
        if ((c >= 0x2C00) && (c <= 0x2FEF)) return true;

        // between U+3001 and U+D7FF
        if ((c >= 0x3001) && (c <= 0xD7FF)) return true;

        // between U+F900 and U+FDCF
        if ((c >= 0xF900) && (c <= 0xFDCF)) return true;

        // between U+FDF0 and U+FFFD
        if ((c >= 0xFDF0) && (c <= 0xFFFD)) return true;

        // greater than or equal to U+10000
        return c >= 0x10000;
    }

    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from:
    // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-an-ident-sequence
    // March 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.9. Check if three code points would start an ident sequence
    // 
    // This section describes how to check if three code points would start an ident sequence. The
    // algorithm described here can be called explicitly with three code points, or can be called
    // with the input stream itself. In the latter case, the three code points in question are the
    // current input code point and the next two input code points, in that order.
    // 
    // NOTE: This algorithm will not consume any additional code points.
    // 
    // Look at the first code point:
    // 
    // ** U+002D HYPHEN-MINUS
    //      ==> If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or
    //          the second and third code points are a valid escape, return true. Otherwise, return
    //          false.
    //
    // ** ident-start code point
    //      ==> Return true.
    // 
    // ** U+005C REVERSE SOLIDUS (\)
    //      ==> If the first and second code points are a valid escape, return true.
    //          Otherwise, return false.
    // 
    // ** anything else
    //      ==> Return false.
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    /**
     * Checks whether the code-points located at {@code sPos}, {@code sPos + 1} and
     * {@code sPos + 2} would start an identifier sequence.
     * 
     * <EMBED CLASS=defs DATA-TOK=Identifier-Name
     *      DATA-URL=check-if-three-code-points-would-start-an-ident-sequence DATA-OP=Check>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_IDENT_SEQ_3CP>
     * 
     * @param css CSS-{@code String} as an array of code-points.
     * @param sPos The array-index where the tokenizer is to consume its next token
     * @return {@code TRUE} if and only if the next token in the array is an identifier
     */
    public static boolean startsIdentSequence(final int[] css, final int sPos)
    {
        // Any of the three positions which lies past the end of the array is represented by
        // the code-point zero.

        final int len       = css.length;
        final int first     = ((sPos + 0) < len) ? css[sPos + 0] : 0;
        final int second    = ((sPos + 1) < len) ? css[sPos + 1] : 0;
        final int third     = ((sPos + 2) < len) ? css[sPos + 2] : 0;

        // ** U+002D HYPHEN-MINUS
        //      ==> TRUE when the second code point is an ident-start code point, or is another
        //          HYPHEN-MINUS, or when the second and third code points are a valid escape.

        if (first == '-')
            return      isIdentStartCodePoint(second)
                    ||  (second == '-')
                    ||  CSSUtil.isValidEscape(second, third);

        // ** ident-start code point        ==> TRUE
        // ** first & second are an escape  ==> TRUE
        // ** anything else                 ==> FALSE

        return isIdentStartCodePoint(first) || CSSUtil.isValidEscape(first, second);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // CONSUME
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from:
    // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-sequence
    // March 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.12. Consume an ident sequence
    //
    // This section describes how to consume an ident sequence from a stream of code points. It
    // returns a string containing the largest name that can be formed from adjacent code points in
    // the stream, starting from the first.
    // 
    // NOTE: This algorithm does not do the verification of the first few code points that are
    // necessary to ensure the returned code points would constitute an <ident-token>. If that is
    // the intended use, ensure that the stream starts with an ident sequence before calling this
    // algorithm.
    // 
    // Let result initially be an empty string.
    // 
    // Repeatedly consume the next input code point from the stream:
    // 
    // ** ident code point:
    //      ==> Append the code point to result.
    // 
    // ** the stream starts with a valid escape
    //      ==> Consume an escaped code point. Append the returned code point to result.
    // 
    // ** anything else
    //      ==> Reconsume the current input code point. Return result.
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Sequence
     * from the input Code-Point Array.
     * 
     * <EMBED CLASS=defs DATA-TOK=Identifier-Sequence DATA-URL=consume-an-ident-sequence
     *      DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_SEQUENCE>
     * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_TOK_SVG>
     * 
     * @param css CSS-{@code String} as an array of code-points.
     * @param sPos The array-index at which consuming begins.
     * @param identifier Receives, in its field {@code 'f'}, the {@code String} that is built
     * from the code-points which were collected.
     * @return The array-index at which consuming stopped.
     */
    protected static int consumeIdentSequence(
            final int[]         css,
            final int           sPos,
            final ByRef<String> identifier
        )
    {
        final IntStream.Builder collected   = IntStream.builder();
        int                     cursor      = sPos;

        while (cursor < css.length)
        {
            final int codePoint = css[cursor];

            // ** ident code point ==> Append the code point to result.
            if (isIdentCodePoint(codePoint))
            {
                collected.add(codePoint);
                cursor++;
                continue;
            }

            // ** anything else ==> Stop.  The cursor is left on the code point not consumed.
            if (! CSSUtil.isValidEscape(css, cursor)) break;

            // ** valid escape ==> The escape is consumed, starting one past the cursor.  The
            //    value returned by the helper becomes the new cursor.

            cursor = CSSUtil.consumeEscapedUnicode(css, cursor + 1, collected);
        }

        final int[] codePoints = collected.build().toArray();

        // The resulting String is handed back through the ByRef
        identifier.f = new String(codePoints, 0, codePoints.length);

        return cursor;
    }

    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from:
    // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-like-token
    // March 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    //
    // 4.3.4. Consume an ident-like token
    // 
    // This section describes how to consume an ident-like token from a stream of code points.
    // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>.
    // 
    // Consume an ident sequence, and let **STRING** be the result.
    // 
    // **   If string’s value is an ASCII case-insensitive match for "url", and the next input code
    //      point is U+0028 LEFT PARENTHESIS ((), consume it. While the next two input code points
    //      are whitespace, consume the next input code point. If the next one or two input code
    //      points are:
    //          * U+0022 QUOTATION MARK ("),
    //          * U+0027 APOSTROPHE ('),
    //          * or whitespace
    //      followed by:
    //          * U+0022 QUOTATION MARK (")
    //          * or U+0027 APOSTROPHE ('),
    //
    //      then create a <function-token> with its value set to **STRING** and return it.
    //      Otherwise, consume a url token, and return it.
    // 
    // **   Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
    //      Create a <function-token> with its value set to **STRING** and return it.
    // 
    // **   Otherwise, create an <ident-token> with its value set to **STRING** and return it.

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Token (or 
     * Identifier-Token Subclass) from the input Code-Point Array.
     * 
     * <EMBED CLASS=defs DATA-TOK=Identifier-Like-Token DATA-URL=consume-ident-like-token
     *      DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=IDENT_LIKE_TOKEN>
     * 
     * @param css CSS-{@code String} as an array of code-points.
     * @param POS On entry, holds the array-index where consuming begins.  On exit, holds the
     * array-index which immediately follows the last code-point that was consumed.
     * @param returnParsedToken Receives each token which is created by this method.
     * @param errorEncountered Passed along to the {@code Comment} and {@code URLToken}
     * consumers.
     */
    protected static void consumeIdentLikeSequence(
            final int[]                     css,
            final ByRef<Integer>            POS,
            final Consumer<CSSToken>        returnParsedToken,
            final Consumer<TokenizeError>   errorEncountered
        )
    {
        // Consume an ident sequence, and let string be the result.
        ByRef<String>   identifier  = new ByRef<>();
        int             identEPos   = consumeIdentSequence(css, POS.f, identifier);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FIRST-CASE: Handle the "url(" possibility
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        //
        // If string’s value is an ASCII case-insensitive match for "url", and the next input code
        // point is U+0028 LEFT PARENTHESIS ((), consume it

        if (    identifier.f.equalsIgnoreCase("url")
            &&  (identEPos < css.length)
            &&  (css[identEPos] == '(')
        )
        {
            // Java-HTML isn't doing this in the EXACT-PRECISE order expressed in the Pseudo-Code
            // Here, this is added first.  It makes no difference, because once the
            // left-parenthesis has been identified, this is going to be a "Func" instance.
            // Furthermore the end-pos of "Func(" will be exactly the left-parenthesis.

            returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f));

            POS.f = identEPos + 1;

            // Below is the actual Pseudo-Code Comment from the CSS-WG Document.  Since Comments
            // are still in the "int[] css", a while-loop is actually necessary - AND the "Func"
            // instance needs to be inserted into the output-Consumer FIRST
            //
            // While the next two input code points are whitespace, consume the next input code
            // point.
            //
            // FIXED: The loop must stop at the end of the array.  Input that ends with "url(",
            // or with "url(" followed by only white-space / comments, used to index past the
            // end of 'css' and throw ArrayIndexOutOfBoundsException.

            while (POS.f < css.length)

                if (Whitespace.is(css[POS.f]))
                    Whitespace.consume(css, POS, returnParsedToken);

                else if (Comment.is(css, POS.f))
                    Comment.consume(css, POS, returnParsedToken, errorEncountered);

                else break;

            // End of input reached: there is nothing left which could be a URL.  This check has
            // to come BEFORE the code-point at 'POS.f' is inspected below.

            if (POS.f >= css.length) return;

            // This part is identical to the CSS-WG Pseudo-Code, but again, it is ever-so-slightly
            // out of order.  I don't actually strip out all the comments at the beginning, in the
            // "PRE-PROCESSOR" phase.  This Tokenizer does not employ the pre-processor.  There are
            // only two operations that are performed by the "PRE-PROCESSOR" - one is to strip all
            // comments upon entry, and two is to replace all '\r\n' and '\r' with just '\n'
            //
            // That's all it does!  I don't do either of those!  Which is explained in the JavaDoc
            // Pages - the Sum of all CSSToken's generated must be explicitly identical to the 
            // original CSS Input-String.
            //
            // Below is the original CSS-WG Comment.
            //
            // IT IS SOMEWHAT IMPORTANT TO NOTE/UNDERSTAND that ALL THIS IS SAYING is that if the
            // "url(enclosed_stuff)" - the "enclosed_stuff" is encapsulated inside of quotation
            // marks, then "enclosed_stuff" SHOULD BE IGNORED COMPLETELY, and handled as "Str"
            // token by the MAIN-TOKENIZER LOOP (in class CSSTokenizer).
            //
            // The only way that a CSSToken instance of either "URLToken" or "BadURL" would EVER
            // need to be parsed would be if the URL-Part - "enclosed_stuff" were ONLY encapsulated
            // within parenthesis, and left-off / were-not-wrapped-with any quotation marks at all.
            // 
            // If the next one or two input code points are
            //      * U+0022 QUOTATION MARK ("),
            //      * U+0027 APOSTROPHE ('),
            //      * or whitespace
            //
            // followed by:
            //      * U+0022 QUOTATION MARK (")
            //      * or U+0027 APOSTROPHE ('),
            //
            // A Right-Parenthesis directly at this position is, likewise, not consumed here.

            final int next = css[POS.f];

            if ((next == '\'') || (next == '"') || (next == ')')) return;

            // FINALLY, The "consume" method inside Class "URLToken" will actually generate a 
            // "BadURL" instance - if the URL is, indeed, bad!  There is no need to worry about it
            // here at all.
            //
            // NOTE: The 'false' at the end is to solve a minor problem where Class URLToken's
            //       "build" method is ever-so-slightly different from the "consume" method.
            //       The "consume" method consumes a closing Right-Parenthesis, but the "build"
            //       method DOES NOT PROVIDE SUCH A PARENTHESIS.
            // 
            // In order to differentiate the two different cases/situations, here 'false' is 
            // passed to method "consume", and when Class URLToken.build() is invoked, that
            // method passes 'true' to the last parameter.

            URLToken.consume(css, POS, returnParsedToken, errorEncountered, false);
            return;
        }


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // SECOND-CASE: Handle ALL OTHER "Func(" Possibilities
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        //
        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        // Create a <function-token> with its value set to string and return it.

        else if ((identEPos < css.length) && (css[identEPos] == '('))
        {
            returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f));
            POS.f = identEPos + 1;
        }


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // LAST-CASE: This is just an "identifierStr" - NOT AN "identifierStr(" - No Left-Parenthesis
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        else
        {
            returnParsedToken.accept(new Identifier(css, POS.f, identEPos, identifier.f));
            POS.f = identEPos;
        }
    }
}