Source code

001package Torello.CSS;
002
003import Torello.Java.Additional.ByRef;
004
005import java.util.Vector;
006import java.util.stream.IntStream;
007import java.util.function.Consumer;
008
009@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
010public class Identifier extends CSSToken
011    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
012{
013    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
014    protected static final long serialVersionUID = 1;
015
016
017    // ********************************************************************************************
018    // ********************************************************************************************
019    // Public & Final Fields
020    // ********************************************************************************************
021    // ********************************************************************************************
022
023
024    /**
025     * This contains the <B STYLE='color: red;'><I>unescaped</I></B> text that constitutes
026     * this identifier.  Identifiers are permitted to use Escaped-Unicode Sequences.  If any
027     * characters were escaped, this {@code String} will have the unescaped variant of the
028     * {@code String} stored here.
029     */
030    public final String identifier;
031
032
033    // ********************************************************************************************
034    // ********************************************************************************************
035    // Private Constructor, API "is" and "if" Methods
036    // ********************************************************************************************
037    // ********************************************************************************************
038
039
040    Identifier(
041            final int[]     css,
042            final int       sPos,
043            final int       ePos,
044            final String    identifier
045    )
046    {
047        super(css, sPos, ePos);
048        this.identifier = identifier;
049    }
050
051    @Override 
052    public final boolean isIdentifier() { return true; }
053
054    @Override
055    public final Identifier ifIdentifier() { return this; }
056
057
058    // ********************************************************************************************
059    // ********************************************************************************************
060    // User's Constructor: a static "build" method
061    // ********************************************************************************************
062    // ********************************************************************************************
063
064
065    /**
066     * <EMBED CLASS=defs DATA-TOK=Identifier DATA-P=identStr>
067     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
068     * @param identStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
069     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
070     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
071     */
072    @SuppressWarnings("unchecked")
073    public static Identifier build(final String identStr)
074    { return (Identifier) CSSToken.build(identStr, INPUT_CHECKER, Identifier::_PRIVATE_CONSUME); }
075
076    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
077    {
078        if (css.length < 1) throw new TokenizeException(Identifier.class);
079
080        if (! startsIdentSequence(css, 0)) throw new TokenizeException
081            ("String-text beginning does not constitute a valid CSS Identifier-Token");
082    };
083
084
085    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
086    // Note: This is an "Extra Consume" Method.  Class Identifier has no straight-forward consumer
087    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
088    //
089    // Static-Builder Methods really aren't 1/100th as nice and direct as an actual constructor.
090    // As I explained in the class CSSToken, there is not really any way around this - unless I
091    // were to completely clobber the CSS-Working-Group's Pseudo-Code and extra provided-diagrams.
092    //
093    // These "Static Build" methods aren't that bad, but since "Class Identifier" is the second
094    // biggest of the Data-Classes / Parse-Classes, this "build" method is slightly more
095    // complicated
096    //
097    // This method is a PRIVATE-METHOD which is only invoked by the "build"-Method that is directly
098    // above!  The code below is DOING ABSOLUTELY-NOTHING but adding and removing "wrappers".
099    // Wrappers inside of code usually really are a bit ugly (at least to me)...
100    //
101    // This stuff has been thoroughly tested, and it does work.  I ran this through a bunch of
102    // different edge-cases.
103
104    private static final void _PRIVATE_CONSUME(
105            final int[]                 css,
106            final ByRef<Integer>        POS,
107            final Consumer<CSSToken>    returnParsedToken 
108        )
109    {
110        // ByRef / Wrappers...  wrappers, wrappers, wrappers that is all this is doing...
111        // Every single one of the other classes that have a "consume" method utilize these input
112        // and output parameters so that they can interact with BOTH the parser AND the
113        // User-Constructor via the SAME EXACT "consume" method.
114        // 
115        // Again, Class "Identifier" is the only of the classes that does not actually have a
116        // regular "consume" method, although, that is what this one essentially is.  Since there 
117        // must be Exception-Checks before all "consume" methods, (and since this one doesn't have
118        // one), this method, as mentioned before, is PRIVATE, and only invoked directly above by
119        // "build!"
120
121        final ByRef<String> ident = new ByRef<>(null);
122        final int ePos = Identifier.consumeIdentSequence(css, POS.f, ident);
123        returnParsedToken.accept(new Identifier(css, POS.f, ePos, ident.f));
124    }
125
126
127    // ********************************************************************************************
128    // ********************************************************************************************
129    // Tokenizer's "is" Method(s)
130    // ********************************************************************************************
131    // ********************************************************************************************
132
133
134    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
135    // Copied from: 
136    // https://drafts.csswg.org/css-syntax-3/#ident-code-point
137    // March 27th, 2024
138    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
139    // ** ident code point
140    //      ==> An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-).
141    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
142
143    static boolean isIdentCodePoint(int codePoint)
144    {
145        if (isIdentStartCodePoint(codePoint))           return true;
146        if ((codePoint >= '0') && (codePoint <= '9'))   return true;
147        if (codePoint == '-')                           return true;
148
149        return false;
150    }
151
152
153    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
154    // Copied from: 
155    // https://drafts.csswg.org/css-syntax-3/#ident-start-code-point
156    // March 27th, 2024
157    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
158    // ** ident-start code point
159    //      ==> A letter, a non-ASCII ident code point, or U+005F LOW LINE (_).
160    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
161
162    static boolean isIdentStartCodePoint(int codePoint)
163    {
164        if ((codePoint >= 'a') && (codePoint <= 'z'))   return true;
165        if ((codePoint >= 'A') && (codePoint <= 'Z'))   return true;
166        if (codePoint == '_')                           return true;
167        if (isNonASCIIIdentCodePoint(codePoint))        return true;
168
169        return false;
170    }
171
172
173    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
174    // Copied from: 
175    // https://drafts.csswg.org/css-syntax-3/#non-ascii-ident-code-point
176    // March 27th, 2024
177    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
178    // ** non-ASCII ident code point
179    //      ==> A code point whose value is any of:
180    //          * U+00B7
181    //          * between U+00C0 and U+00D6
182    //          * between U+00D8 and U+00F6
183    //          * between U+00F8 and U+037D
184    //          * between U+037F and U+1FFF
185    //          * U+200C, U+200D, U+203F, U+2040
186    //          * between U+2070 and U+218F
187    //          * between U+2C00 and U+2FEF
188    //          * between U+3001 and U+D7FF
189    //          * between U+F900 and U+FDCF
190    //          * between U+FDF0 and U+FFFD
191    //          * greater than or equal to U+10000
192    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
193
194    static boolean isNonASCIIIdentCodePoint(int c)
195    {
196        if (c == 0x00B7) return true;
197
198        // between U+00C0 and U+00D6
199        // between U+00D8 and U+00F6
200        // between U+00F8 and U+037D
201        // between U+037F and U+1FFF
202
203        if ((c >= 0x00C0) && (c <= 0x1FFF))
204        {
205            return      (c != 0x00D7)
206                    &&  (c != 0x00F7)
207                    &&  (c != 0x037E);
208        }
209
210        // U+200C, U+200D, U+203F, U+2040
211        if (c == 0x200C) return true;
212        if (c == 0x200D) return true;
213        if (c == 0x203F) return true;
214        if (c == 0x2040) return true;
215
216        // between U+2070 and U+218F
217        if ((c >= 0x2070) && (c <= 0x218F)) return true;
218
219        // between U+2C00 and U+2FEF
220        if ((c >= 0x2C00) && (c <= 0x2FEF)) return true;
221
222        // between U+3001 and U+D7FF
223        if ((c >= 0x2001) && (c <= 0xD7FF)) return true;
224
225        // between U+F900 and U+FDCF
226        if ((c >= 0xF900) && (c <= 0xFDCF)) return true;
227
228        // between U+FDF0 and U+FFFD
229        if ((c >= 0xFDF0) && (c <= 0xFFFD)) return true;
230
231        // greater than or equal to U+10000
232        if (c >= 0x10000) return true;
233
234        return false;
235    }
236
237    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
238    // Copied from:
239    // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-an-ident-sequence
240    // March 27th, 2024
241    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
242    // 
243    // 4.3.9. Check if three code points would start an ident sequence
244    // 
245    // This section describes how to check if three code points would start an ident sequence. The
246    // algorithm described here can be called explicitly with three code points, or can be called
247    // with the input stream itself. In the latter case, the three code points in question are the
248    // current input code point and the next two input code points, in that order.
249    // 
250    // NOTE: This algorithm will not consume any additional code points.
251    // 
252    // Look at the first code point:
253    // 
254    // ** U+002D HYPHEN-MINUS
255    //      ==> If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or
256    //          the second and third code points are a valid escape, return true. Otherwise, return
257    //          false.
258    //
259    // ** ident-start code point
260    //      ==> Return true.
261    // 
262    // ** U+005C REVERSE SOLIDUS (\)
263    //      ==> If the first and second code points are a valid escape, return true.
264    //          Otherwise, return false.
265    // 
266    // ** anything else
267    //      ==> Return false.
268    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
269
270    /**
271     * Checks whether or not the next token to consume is one of three available identifier-token,
272     * classes.
273     * 
274     * <EMBED CLASS=defs DATA-TOK=Identifier-Name
275     *      DATA-URL=check-if-three-code-points-would-start-an-ident-sequence DATA-OP=Check>
276     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
277     * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_IDENT_SEQ_3CP>
278     * 
279     * @param css CSS-{@code String} as an array of code-points.
280     * @param sPos The array-index where the tokenizer is to consume its next token
281     * @return {@code TRUE} if and only if the next token in the array  is an identifier
282     */
283    public static boolean startsIdentSequence(final int[] css, final int sPos)
284    {
285        final int c1 = ((sPos + 0) < css.length) ? css[sPos + 0] : 0;
286        final int c2 = ((sPos + 1) < css.length) ? css[sPos + 1] : 0;
287        final int c3 = ((sPos + 2) < css.length) ? css[sPos + 2] : 0;
288
289        // ** U+002D HYPHEN-MINUS
290        //      ==> If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or
291        //          the second and third code points are a valid escape, return true. Otherwise, return
292        //          false.
293
294        if (c1 == '-')
295        {
296            if (isIdentStartCodePoint(c2))      return true;
297            if (c2 == '-')                      return true;
298            if (CSSUtil.isValidEscape(c2, c3))  return true;
299    
300            return false;
301        }
302
303        // ** ident-start code point ==> Return true.
304        if (isIdentStartCodePoint(c1)) return true;
305
306        // ** U+005C REVERSE SOLIDUS (\)
307        //      ==> If the first and second code points are a valid escape, return true.
308        //          Otherwise, return false.
309
310        if (CSSUtil.isValidEscape(c1, c2)) return true;
311
312        // ** anything else ==> Return false.
313        return false;
314    }
315
316
317    // ********************************************************************************************
318    // ********************************************************************************************
319    // CONSUME
320    // ********************************************************************************************
321    // ********************************************************************************************
322
323
324    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
325    // Copied from:
326    // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-sequence
327    // March 27th, 2024
328    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
329    // 
330    // 4.3.12. Consume an ident sequence
331    //
332    // This section describes how to consume an ident sequence from a stream of code points. It
333    // returns a string containing the largest name that can be formed from adjacent code points in
334    // the stream, starting from the first.
335    // 
336    // NOTE: This algorithm does not do the verification of the first few code points that are
337    // necessary to ensure the returned code points would constitute an <ident-token>. If that is
338    // the intended use, ensure that the stream starts with an ident sequence before calling this
339    // algorithm.
340    // 
341    // Let result initially be an empty string.
342    // 
343    // Repeatedly consume the next input code point from the stream:
344    // 
345    // ** ident code point:
346    //      ==> Append the code point to result.
347    // 
348    // ** the stream starts with a valid escape
349    //      ==> Consume an escaped code point. Append the returned code point to result.
350    // 
351    // ** anything else
352    //      ==> Reconsume the current input code point. Return result.
353    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
354
355    /**
356     * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Sequence
357     * from the input Code-Point Array.
358     * 
359     * <EMBED CLASS=defs DATA-TOK=Identifier-Sequence DATA-URL=consume-an-ident-sequence
360     *      DATA-OP=Consume>
361     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
362     * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_SEQUENCE>
363     * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_TOK_SVG>
364     */
365    protected static int consumeIdentSequence(
366            final int[]         css,
367            final int           sPos,
368            final ByRef<String> identifier
369        )
370    {
371        IntStream.Builder   b   = IntStream.builder();
372        int                 c   = 0;
373        int                 pos = sPos;
374
375        while (pos < css.length)
376
377            if (isIdentCodePoint(c = css[pos]))
378                { b.accept(c); pos++; }
379
380            else if (CSSUtil.isValidEscape(css, pos))
381                pos = CSSUtil.consumeEscapedUnicode(css, ++pos, b);
382
383            // break happens before increment
384            else break;
385
386        int[] identifierAsCodePoints = b.build().toArray();
387
388        // Uses the great Pass-Reference Tuple, which is ByRef
389        identifier.f = new String(identifierAsCodePoints, 0, identifierAsCodePoints.length);
390
391        return pos;
392    }
393
394    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
395    // Copied from:
396    // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-like-token
397    // March 2024
398    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
399    //
400    // 4.3.4. Consume an ident-like token
401    // 
402    // This section describes how to consume an ident-like token from a stream of code points.
403    // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>.
404    // 
405    // Consume an ident sequence, and let **STRING** be the result.
406    // 
407    // **   If string’s value is an ASCII case-insensitive match for "url", and the next input code
408    //      point is U+0028 LEFT PARENTHESIS ((), consume it. While the next two input code points
409    //      are whitespace, consume the next input code point. If the next one or two input code
410    //      points are:
411    //          * U+0022 QUOTATION MARK ("),
412    //          * U+0027 APOSTROPHE ('),
413    //          * or whitespace
414    //      followed by:
415    //          * U+0022 QUOTATION MARK (")
416    //          * or U+0027 APOSTROPHE ('),
417    //
418    //      then create a <function-token> with its value set to **STRING** and return it.
419    //      Otherwise, consume a url token, and return it.
420    // 
421    // **   Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
422    //      Create a <function-token> with its value set to **STRING** and return it.
423    // 
424    // **   Otherwise, create an <ident-token> with its value set to **STRING** and return it.
425
426    /**
427     * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Token (or 
428     * Identifier-Token Subclass) from the input Code-Point Array.
429     * 
430     * <EMBED CLASS=defs DATA-TOK=Identifier-Like-Token DATA-URL=consume-ident-like-token
431     *      DATA-OP=Consume>
432     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
433     * <EMBED CLASS=external-html DATA-FILE-ID=IDENT_LIKE_TOKEN>
434     */
435    protected static void consumeIdentLikeSequence(
436            final int[]                     css,
437            final ByRef<Integer>            POS,
438            final Consumer<CSSToken>        returnParsedToken,
439            final Consumer<TokenizeError>   errorEncountered
440        )
441    {
442        // Consume an ident sequence, and let string be the result.
443        ByRef<String>   identifier  = new ByRef<>();
444        int             identEPos   = consumeIdentSequence(css, POS.f, identifier);
445
446
447        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
448        // FIRST-CASE: Handle the "url(" possibility
449        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
450        //
451        // If string’s value is an ASCII case-insensitive match for "url", and the next input code
452        // point is U+0028 LEFT PARENTHESIS ((), consume it
453
454        if (    identifier.f.equalsIgnoreCase("url")
455            &&  (identEPos < css.length)
456            &&  (css[identEPos] == '(')
457        )
458        {
459            // Java-HTML isn't doing this in the EXACT-PRECISE order expresed in the Pseudo-Code
460            // Here, this is added first.  It makes no different, because once the left-parenthesis
461            // has been identified, this is going to be a "Func" instance.  Furthermore the end-pos
462            // of "Func(" will be exactly the left-parenthesis.
463
464            returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f));
465
466            POS.f = identEPos + 1;
467
468            // Below is the actual Pseudo-Code Comment from the CSS-WG Document.  Since Comments
469            // are still in the "int[] css", a while-loop is actually necessary - AND the "Func"
470            // instance needs to be inserted into the output-Consumer FIRST
471            //
472            // While the next two input code points are whitespace, consume the next input code
473            // point.
474
475            while (true)
476
477                if (Whitespace.is(css[POS.f]))
478                    Whitespace.consume(css, POS, returnParsedToken);
479
480                else if (Comment.is(css, POS.f))
481                    Comment.consume(css, POS, returnParsedToken, errorEncountered);
482
483                else break;
484
485            // This part is identical to the CSS-WG Pseudo-Code, but again, it is every-so-slightly
486            // out of order.  I don't actually stip out all the comments at the beginning, in the
487            // "PRE-PROCESSOR" phase.  This Tokenizer does not employ the pre-processor.  There are
488            // only two operations that are performed by the "PRE-PROCESSOR" - one is to stip all
489            // comments upon entry, and two is to replace all '\r\n' and '\r' with just '\n'
490            //
491            // That's all it does!  I don't do either of those!  Which is explained in the JavaDoc
492            // Pages - the Sum of all CSSToken's generated must be explicitly identical to the 
493            // original CSS Input-String.
494            //
495            // Below is the original CSS-WG Comment.
496            //
497            // IT IS SOMEWHAT IMPORTANT TO NOTE/UNDERSTAND that ALL THIS IS SAYING is that if the
498            // "url(enclosed_stuff)" - the "enclosed_stuff" is encapsulated inside of quoation
499            // marks, then "enclosed_stuff" SHOULD BE IGNORED COMPLETELY, and handled as "Str"
500            // token by the MAIN-TOKENIZER LOOP (in class CSSTokenzier).
501            //
502            // The only way that an CSSToken instance of either "URLToken" or "BadURL" would EVER
503            // need to be parsed would be if the URL-Part - "enclosed_stuff" were ONLY encapsulated
504            // within parenthesis, and left-off / were-not-wrapped-with any quotation marks at all.
505            // 
506            // If the next one or two input code points are
507            //      * U+0022 QUOTATION MARK ("),
508            //      * U+0027 APOSTROPHE ('),
509            //      * or whitespace
510            //
511            // followed by:
512            //      * U+0022 QUOTATION MARK (")
513            //      * or U+0027 APOSTROPHE ('),
514            //
515
516            if ((css[POS.f] == '\'') || (css[POS.f] == '"') || (css[POS.f] == ')')) return;
517
518            // FINALLY, The "consume" method inside Class "URLToken" will actually generate a 
519            // "BadURL" instance - if the URL is, indeed, bad!  There is no need to worry about it
520            // here at all.
521
522            if (POS.f < css.length)
523            {
524                // NOTE: The 'false' at the end is to solve a minor problem where Class URLToken's
525                //       "build" method is every-so-slightly different from the "consume" method.
526                //       The "consume" method consumes a closing Right-Parenthesis, but the "build"
527                //       method DOES NOT PROVIDE SUCH A PARENTHESIS.
528                // 
529                // In order to differentiate the two different cases/situations, here 'false' is 
530                // passed to method "consume", and when Class URLToken.build() is invoked, that
531                // method passes 'true' to the last parameter.
532
533                URLToken.consume(css, POS, returnParsedToken, errorEncountered, false);
534                return;
535            }
536
537            return;
538        }
539
540
541        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
542        // SECOND-CASE: Handle ALL OTHER "Func(" Possibilities
543        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
544        //
545        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
546        // Create a <function-token> with its value set to string and return it.
547
548        else if ((identEPos < css.length) && (css[identEPos] == '('))
549        {
550            returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f));
551            POS.f = identEPos + 1;
552        }
553
554
555        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
556        // LAST-CASE: This Just an "identifierStr" - NOT AN "identifierStr(" - No Left-Parenthesis
557        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
558
559        else
560        {
561            returnParsedToken.accept(new Identifier(css, POS.f, identEPos, identifier.f));
562            POS.f = identEPos;
563        }
564    }
565}