1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
package Torello.CSS;

import Torello.Java.Additional.ByRef;

import java.util.Vector;
import java.util.function.Consumer;
import java.util.stream.IntStream;

/**
 * CSS-Tokenizer Class for {@code String}-Literals.
 */
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
public class Str extends CSSToken
    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Public & Final Fields
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * The quotation mark type used to quote this {@code String}-Literal.  The value placed in
     * this Java {@code char} primitive may only be a Single-Quotation Mark, or a Double-Quotation
     * Mark.  No other types of quotations are included in this class parser.
     */
    public final char quote;

    /**
     * This is the actual {@code String}-Literal that this {@code CSSToken} represents.  This Java
     * {@code String} will never actually contain the opening and closing quotation marks that were
     * used to create this {@code String}.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Unescaped String:</B>
     * 
     * <BR />If this {@code String} utilized any Escape-Sequences representing Unicode Characters,
     * the Unescaped-Characters are used within this {@code String} to replace the original,
     * escaped, sequences.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Provided Example:</B>
     * 
     * <BR />There is a quoted {@code String} below, provided by AI.  Note that ChatGPT 
     * initially gave a slightly different answer written as <CODE>"\u2713 Checkmark"</CODE>
     * (which you may or may not notice has a {@code 'u'} character between the Reverse-Solidus
     * Backslash character and the Hexadecimal Characters {@code '2713'}).
     * 
     * <BR /><BR />After further research, ChatGPT corrected its mistake, saying:
     * 
     * <BR /><BR /><I>"You're correct, and I apologize for the oversight. In CSS, Unicode escape
     * sequences within string literals do not start with the u character. Instead, they consist of
     * a backslash followed by up to six hexadecimal digits, representing the Unicode code point."
     * </I>
     * 
     * <DIV CLASS=CSS>{@code
     * .selector::before
     * {
     *     content: "\2713 Checkmark";
     *     font-family: Arial, sans-serif;
     * }
     * }</DIV>
     * 
     * <BR />The above CSS-{@code String} (which is inside the {@code 'content'} Property-Value)
     * would be stored in the field {@code 'unescaped'} as: <CODE>&#x2713; Checkmark</CODE>.
     * <BR />
     */
    public final String unescaped;


    // ********************************************************************************************
    // ********************************************************************************************
    // Private Constructor, API "is" and "if" Methods
    // ********************************************************************************************
    // ********************************************************************************************


    // Builds a token from the code-point range [sPos, ePos) of 'css'.  The code point found at
    // css[sPos] is recorded as the quotation-mark, and the contents of 'b' become the value of
    // the field 'unescaped'.
    private Str(
            final int[]             css,
            final int               sPos,
            final int               ePos, 
            final IntStream.Builder b
        )
    {
        super(css, sPos, ePos);

        this.quote = (char) css[sPos];

        // This Code-Points array contains the String-Contents.  Note that this array WILL NOT
        // contain the starting and ending quotation-marks.  Also, if there were any escaped 
        // characters / code-points in the String, they will have been unescaped, since there is no
        // longer any need to have them remain escaped.

        final int[] codePoints = b.build().toArray();

        // This does nothing more than convert an int[] Code-Points array to a java.lang.String.
        // This is one of Java's String Constructors.

        this.unescaped = new String(codePoints, 0, codePoints.length);
    }

    @Override 
    public final boolean isStr() { return true; }

    @Override
    public final Str ifStr() { return this; }


    // ********************************************************************************************
    // ********************************************************************************************
    // Tokenizer's "is" Method(s)
    // ********************************************************************************************
    // ********************************************************************************************


    // Returns TRUE if, and only if, 'codePoint' is a Single-Quote or a Double-Quote, which are
    // the only two code points that may open a CSS String-Literal.
    static boolean is(final int codePoint)
    { return (codePoint == '\'') || (codePoint == '"'); }


    // ********************************************************************************************
    // ********************************************************************************************
    // User's Constructor: a static "build" method
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=stringLiteral>
     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
     * @param stringLiteral <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
     */
    @SuppressWarnings("unchecked")
    public static Str build(final String stringLiteral)
    { return (Str) CSSToken.build(stringLiteral, INPUT_CHECKER, Str::consume); }

    // Rejects any input whose first code point is not a quotation-mark.  An empty code-point
    // array is rejected with the same exception, rather than failing on the read of css[0].
    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
    {
        if ((css.length == 0) || (! is(css[0]))) throw new TokenizeException
            ("Input String does not start with a valid CSS String-Literal");
    };


    // ********************************************************************************************
    // ********************************************************************************************
    // CONSUME
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // COPIED FROM:
    // https://drafts.csswg.org/css-syntax-3/#consume-a-string-token
    //
    // COPIED ON:
    // March 26th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.5. Consume a string token
    //
    // This section describes how to consume a string token from a stream of code points. It
    // returns either a <string-token> or <bad-string-token>.
    // 
    // This algorithm may be called with an ending code point, which denotes the code point that
    // ends the string. If an ending code point is not specified, the current input code point is
    // used.
    // 
    // Initially create a <string-token> with its value set to the empty string.
    // [I interpret this as follows:]
    //      final int               quote   = tr.css[tr.pos];
    //      final IntStream.Builder b       = IntStream.builder();
    //
    // Repeatedly consume the next input code point from the stream:
    // [This is fancy W3C Pseudo-Code Talk for `use a loop`]
    // [Below is a Loop Body, written in W3C Pseudo-Code]
    // 
    // ** ending code point [NOTE: This says: The Matching-and-Closing Quotation-Mark]
    //      ==> Return the <string-token>.
    //
    // ** EOF
    //      ==> This is a parse error. Return the <string-token>.
    // 
    // ** newline
    //      ==> This is a parse error. Reconsume the current input code point, create a
    //          <bad-string-token>, and return it.
    // 
    // ** U+005C REVERSE SOLIDUS (\)
    //      ==> 1) If the next input code point is EOF, do nothing.
    //          2) Otherwise, if the next input code point is a newline, consume it.
    //          3) Otherwise, (the stream starts with a valid escape) consume an escaped code point
    //             and append the returned code point to the <string-token>'s value.
    // 
    // ** anything else
    //      ==> Append the current input code point to the <string-token>'s value.
    //          [Which I do using an IntStream.Builder as: b.accept(c);]
    //
    // A Bug?  I don't know if I'm misunderstanding, but I think there is a minor bug in
    // the above Pseudo-Code.  For the character after the Reverse-Solidus Option, Option #2 says
    // Consume the next-newline, but it doesn't seem to skip all tabs and spaces after the newline
    // has been consumed.  This method skips all ' ' and '\t' characters after consuming the
    // newline.
    //
    // NOTE(review): Skipping that white-space goes beyond what the Pseudo-Code above asks for.
    // It is kept as-is because callers may rely on it - confirm that this is intended.
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code String}-Literal from the
     * input Code-Point Array.
     * 
     * <EMBED CLASS=defs DATA-TOK=String-Literal DATA-URL=consume-string-token DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOKEN>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOK_SVG>
     * 
     * @param css The CSS input, as an array of code points.
     * 
     * @param POS On entry, {@code POS.f} is the index of the opening quotation-mark.  On exit,
     * {@code POS.f} has been set to the index at which tokenizing should resume.
     * 
     * @param returnParsedToken Receives exactly one token: a {@code Str} when the matching
     * closing quotation-mark is found, otherwise a {@code BadStr}.
     * 
     * @param errorEncountered Receives a {@code TokenizeError} whenever a {@code BadStr} is
     * produced, which is when an un-escaped new-line or the end of the input is reached before
     * the closing quotation-mark.
     */
    public static void consume(                                 // When invoked from 'CSSTokenizer'
            final int[]                     css,                // C, int[] css
            final ByRef<Integer>            POS,                // P, array-pos loop-variable
            final Consumer<CSSToken>        returnParsedToken,  // T, Vector<CSSToken>.add
            final Consumer<TokenizeError>   errorEncountered    // E, Vector<TokenizeError>.add
        )
    {
        final int               quote   = css[POS.f];
        final IntStream.Builder b       = IntStream.builder();

        int pos = POS.f + 1;
        int c;

        WHILE_LOOP:
        while (pos < css.length) switch (c = css[pos])
        {
            case '\'':
            case '"':

                // This if-statement is asking whether or not the quotation that was just
                // encountered is the same quotation that was used to start this string-literal.

                if (c == quote)
                {
                    returnParsedToken.accept(new Str(css, POS.f, ++pos, b));
                    POS.f = pos;
                    return;
                }

                // Again, the IntStream is there to help build the "Un-Escaped" Version of the
                // String.  Note that this IntStream.Builder would be completely unnecessary if it
                // weren't for the Escaped-Characters.  If there are not any Escaped-Chars, one
                // could simply build the String using the beginning and ending indices from the
                // int[] css array.
                //
                // ALSO: 
                //
                // Since Java doesn't allow the variable `quote` inside of a `case` branch, BOTH
                // '\'' and '"' have to be in the case.  For that reason alone, the "other" quote
                // has to be appended here; otherwise it would have been handled automatically by
                // this Switch-Statement's `default` clause.
                //
                // FINALLY:
                //
                // This 'c' is just a Double-Quote that has harmlessly been placed within a String
                // that uses Single-Quotes...  **OR** a harmless Single-Quote that was placed 
                // within a String-Literal that was specified using Double-Quotes.

                b.accept(c);

                pos++;
                continue WHILE_LOOP;

            case '\n':
            case '\r':
            case '\f':

                // Note that, here, a `\r\n` is irrelevant, because this character will be
                // "Re-Consumed" by the loop that called this method.  If this is a 2-character
                // New-Line, it will be handled by whatever surrounding method called this.

                returnParsedToken.accept(new BadStr(css, POS.f, pos, b));

                errorEncountered.accept(
                    new TokenizeError(
                        css, POS.f, pos, Str.class,
                        "A String-Literal's Open-Quotation was found but a new-line character " +
                        "has been encountered before the Matching, Closing Quotation-Mark was " +
                        "identified."
                    ));

                POS.f = pos;
                return;

            // From the Pseudo-Code at the top of this method, this is the case:
            // U+005C REVERSE SOLIDUS (\)

            case '\\':

                // EOF-Reached: there was a "REVERSE SOLIDUS" at the very last character of the
                // CSS-File or String.  The loop-condition fails on the next pass, and the
                // statements after the loop report the 'BadStr'.
                //
                // NOTE(review): The Pseudo-Code above says "do nothing" for this case, whereas
                // a U+FFFD REPLACEMENT CHARACTER is appended here.  Kept as-is - confirm intent.

                if ((++pos) == css.length) b.accept(0xFFFD);

                // Here, if there is a `\r\n` it needs to be "consumed" immediately.  It cannot be
                // ignored here.  Note that the original Pseudo-Code that was copied from:
                //
                // https://drafts.csswg.org/css-syntax-3/#consume-an-escaped-code-point
                //
                // There, it states that there should be a CSS "Pre-Processor Phase" that
                // eliminates all '\r' and '\r\n' characters and replaces them with '\n'
                //
                // In Java-HTML, it is somewhat imperative that an HTML File can be perfectly
                // re-constructed from a Vector<HTMLNode>.  Therefore the top-level design decision
                // has been made such that any Vector<CSSToken> can be used to perfectly
                // reconstruct a CSS File
                // 
                // As a result, running the CSS Pre-Processor suggested by the Web-Site is being
                // ignored.  This means there are a few small places that need to handle
                // "New-Lines" slightly more carefully.  This is one of them.

                else if (((c = css[pos]) == '\n') || (c == '\f'))

                    // Step over the single-character new-line, and then over any spaces or tabs
                    // that begin the next line of text.

                    pos = skipSpacesAndTabs(css, pos + 1);

                else if (c == '\r')
                {
                    // There was a properly-escaped new-line, but unfortunately, the next line was
                    // empty.  This will be turned into a 'BadStr', at the very end of this method.

                    if (++pos == css.length) break WHILE_LOOP;

                    // This is how `\r\n` is handled:  SKIP the subsequent `\n` after the `\r`
                    if (css[pos] == '\n') pos++;

                    // After the Reverse-Solidus (BackSlash), and after the single or double
                    // character new-line, then skip any spaces or tabs on the next line of text.

                    pos = skipSpacesAndTabs(css, pos);
                }

                else pos = CSSUtil.consumeEscapedUnicode(css, pos, b);

                continue WHILE_LOOP;

            default:
                b.accept(c);
                pos++;
        }

        // These statements are only reachable if the above loop was terminated due to the value
        // of 'pos' reaching the end of the css code-point array.  If these statements are
        // reached, this is guaranteed to be an error.

        returnParsedToken.accept(new BadStr(css, POS.f, css.length, b));

        errorEncountered.accept(
            new TokenizeError(
                css, POS.f, pos, Str.class,
                "A String Literal's Open-Quotation was found, but unfortunately EOF was reached " +
                "before identifying the Matching, Closing-Quotation mark"
            ));

        POS.f = pos;
    }

    // Returns the first index, at or after 'pos', whose code point is neither a space nor a tab,
    // or 'css.length' if there is no such index.
    //
    // The bounds-check guards BOTH comparisons on every iteration.  Writing this test as
    // `(inBounds && isSpace) || isTab` lets a previously read tab keep the loop running after
    // the end of the array has been passed, because `&&` binds more tightly than `||`.
    private static int skipSpacesAndTabs(final int[] css, int pos)
    {
        while ((pos < css.length) && ((css[pos] == ' ') || (css[pos] == '\t'))) pos++;
        return pos;
    }
}