Source code

001package Torello.CSS;
002
003import Torello.Java.Additional.ByRef;
004
005import java.util.Vector;
006import java.util.function.Consumer;
007import java.util.stream.IntStream;
008
009/**
010 * CSS-Tokenizer Class for {@code String}-Literals.
011 */
012@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
013public class Str extends CSSToken
014    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
015{
016    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
017    protected static final long serialVersionUID = 1;
018
019
020    // ********************************************************************************************
021    // ********************************************************************************************
022    // Public & Final Fields
023    // ********************************************************************************************
024    // ********************************************************************************************
025
026
027    /**
028     * The quotation mark type used to quote this {@code String}-Literal.  The value placed in
029     * this Java {@code char} primitive may only be a Single-Quotation Mark, or a Double-Quotation.
030     * No other types of quotations are included in this class parser.
031     */
032    public final char quote;
033
034    /**
035     * This is the actual {@code String}-Literal that this {@code CSSToken} represents.  This Java
036     * {@code String} will never actually contain the opening and closing quotation marks that were
037     * used to create this {@code String}.
038     * 
039     * <BR /><BR /><B CLASS=JDDescLabel>Unescaped String:</B>
040     * 
041     * <BR />if this {@code String} utilized any Escape-Sequences representing Unicode Characters,
042     * the Unescaped-Characters are used within this {@code String} to replace the original,
043     * escaped, sequences.
044     * 
045     * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Provided Example:</B>
046     * 
047     * <BR />There is a quoted, {@code String} below, provided by AI.  Note that ChatGPT 
048     * initially gave me a slightly different answer written as <CODE>"\u2713 Checkmark"</CODE>
049     * (which you may or may not notice has a {@code 'u'} character between the Reverse-Solidus
050     * Backslash character and the Hexadecimal Characters {@code '2713'}).
051     * 
052     * <BR /><BR />After further research, ChatGPT apologized for its mistake, saying:
053     * 
054     * <BR /><BR /><I>"You're correct, and I apologize for the oversight. In CSS, Unicode escape
055     * sequences within string literals do not start with the u character. Instead, they consist of
056     * a backslash followed by up to six hexadecimal digits, representing the Unicode code point."
057     * </I>
058     * 
059     * <BR /><BR />This crap sort of amazes me.  I really can't believe it.  Anything that Stack
060     * Overflow is busy condemning, with a very high likelihood is bound to be pretty good.
061     * 
062     * <DIV CLASS=CSS>{@code
063     * .selector::before
064     * {
065     *     content: "\2713 Checkmark";
066     *     font-family: Arial, sans-serif;
067     * }
068     * }</DIV>
069     * 
070     * <BR />The above CSS-{@code String} (which is inside the {@code 'content'} Property-Value)
071     * would be stored in the field {@code 'unescaped'} as: <CODE>&#x2713; Checkmark</CODE>.
072     * <BR />
073     */
074    public final String unescaped;
075
076
077    // ********************************************************************************************
078    // ********************************************************************************************
079    // Private Constructor, API "is" and "if" Methods
080    // ********************************************************************************************
081    // ********************************************************************************************
082
083
084    private Str(
085            final int[]             css,
086            final int               sPos,
087            final int               ePos, 
088            final IntStream.Builder b
089        )
090    {
091        super(css, sPos, ePos);
092
093        this.quote = (char) css[sPos];
094
095        // This Code-Points array contains the String-Contents.  Note that this array WILL NOT
096        // contain the starting and ending quotation-marks.  Also, if there were any escaped 
097        // characters / code-points in the String, they will have been unescaped, since there is no
098        // longer any need to have the remain escaped.
099    
100        int[] codePoints = b.build().toArray();
101
102        // This does nothing more than convert an int[] Code-Points array to a java.lang.String.
103        // This is one of Java's String Constructors.
104
105        this.unescaped = new String(codePoints, 0, codePoints.length);
106    }
107
108    @Override 
109    public final boolean isStr() { return true; }
110
111    @Override
112    public final Str ifStr() { return this; }
113
114
115    // ********************************************************************************************
116    // ********************************************************************************************
117    // Tokenizer's "is" Method(s)
118    // ********************************************************************************************
119    // ********************************************************************************************
120
121
122    static boolean is(final int codePoint)
123    { return (codePoint == '\'') || (codePoint == '"'); }
124
125
126    // ********************************************************************************************
127    // ********************************************************************************************
128    // User's Constructor: a static "build" method
129    // ********************************************************************************************
130    // ********************************************************************************************
131
132
133    /**
134     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=stringLiteral>
135     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
136     * @param stringLiteral <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
137     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
138     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
139     */
140    @SuppressWarnings("unchecked")
141    public static Str build(final String stringLiteral)
142    { return (Str) CSSToken.build(stringLiteral, INPUT_CHECKER, Str::consume); }
143
144    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
145    {
146        if (! is(css[0])) throw new TokenizeException
147            ("Input String does not start with a valid CSS String-Literal");
148    };
149
150
151    // ********************************************************************************************
152    // ********************************************************************************************
153    // CONSUME
154    // ********************************************************************************************
155    // ********************************************************************************************
156
157
158    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
159    // COPIED FROM:
160    // https://drafts.csswg.org/css-syntax-3/#consume-a-string-token
161    //
162    // COPIED ON:
163    // March 26th, 2024
164    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
165    // 
166    // 4.3.5. Consume a string token
167    //
168    // This section describes how to consume a string token from a stream of code points. It
169    // returns either a <string-token> or <bad-string-token>.
170    // 
171    // This algorithm may be called with an ending code point, which denotes the code point that
172    // ends the string. If an ending code point is not specified, the current input code point is
173    // used.
174    // 
175    // Initially create a <string-token> with its value set to the empty string.
176    // [I interpret this as follows:]
177    //      final int               quote   = tr.css[tr.pos];
178    //      final IntStream.Builder b       = IntStream.builder();
179    //
180    // Repeatedly consume the next input code point from the stream:
181    // [This is fancy W3C Pseudo-Code Talk for `use a for-loop`]
182    // [Below is a For-Loop Body, written in W3C Pseudo-Code]
183    // 
184    // ** ending code point [NOTE: This says: The Matching-and-Closing Quotation-Mark]
185    //      ==> Return the <string-token>.
186    //
187    // ** EOF
188    //      ==> This is a parse error. Return the <string-token>.
189    // 
190    // ** newline
191    //      ==> This is a parse error. Reconsume the current input code point, create a
192    //          <bad-string-token>, and return it.
193    // 
194    // ** U+005C REVERSE SOLIDUS (\)
195    //      ==> 1) If the next input code point is EOF, do nothing.
196    //          2) Otherwise, if the next input code point is a newline, consume it.
197    //          3) Otherwise, (the stream starts with a valid escape) consume an escaped code point
198    //             and append the returned code point to the <string-token>’s value.
199    // 
200    // ** anything else
201    //      ==> Append the current input code point to the <string-token>’s value.
202    //          [Which I do using an IntStream.Builder as: b.accept(c);]
203    //
204    // A Bug?  I don't know if I'm misunderstanding, but I think there is a minor bug in
205    // the above Pseudo-Code.  For the character after the Reverse-Solidus Option, Option #2 says
206    // Consume the next-newline, but it doesn't seem to skip all tabs and spaces after the newline
207    // has been consumed.  This method skips all ' ' and '\t' characters after consuming the newline.
208    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
209
210    /**
211     * This is a tokenizer method which <B>"consumes"</B> the next {@code String}-Literal from the
212     * input Code-Point Array.
213     * 
214     * <EMBED CLASS=defs DATA-TOK=String-Literal DATA-URL=consume-string-token DATA-OP=Consume>
215     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
216     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOKEN>
217     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOK_SVG>
218     */
219    public static void consume(                                 // When invoked from 'CSSTokenizer'
220            final int[]                     css,                // C, int[] css
221            final ByRef<Integer>            POS,                // P, array-pos loop-variable
222            final Consumer<CSSToken>        returnParsedToken,  // T, Vector<CSSToken>.add
223            final Consumer<TokenizeError>   errorEncountered    // E, Vector<TokenizeError>.add
224        )
225    {
226        final int               quote   = css[POS.f];
227        final IntStream.Builder b       = IntStream.builder();
228
229        int pos=POS.f+1;
230        int c;
231
232        WHILE_LOOP:
233        while (pos < css.length) switch (c = css[pos])
234        {
235            case '\'':
236            case '"':
237
238                // This switch-statement is asking whether or not the quotation that was just
239                // encountered is the same quotation that was used to start this string-literal.
240
241                if (c == quote)
242                {
243                    returnParsedToken.accept(new Str(css, POS.f, ++pos, b));
244                    POS.f = pos;
245                    return;
246                }
247
248                // Again, the IntStream is there to help build the "Un-Escaped" Version of the
249                // String.  Note that this IntStream.Builder would be completely unnecessary if it
250                // weren't for the Escaped-Characters.  If there are not any Escaped-Chars, one
251                // could simply build the String using the beginning and ending indices from the
252                // int[] css array.
253                //
254                // ALSO: 
255                //
256                // Since Java doesn't allow the variable `quote` inside of a `case`branch, BOTH
257                // '\'' and '"' have to be in the case.  For that reason alone, we also have to do
258                // this `else` part, otherwise, this `else`branch would be handled automatically by
259                // this Switch-Statement's `default` clause.
260                //
261                // FINALLY:
262                //
263                // This 'c' is just a Double-Quote that has harmlessly been placed within a String
264                // that uses Single-Quotes...  **OR** a harmless Single-Quote that was placed 
265                // within a String-Literal that was specified using Double-Quotes.
266
267                b.accept(c);
268
269                pos++;
270                continue WHILE_LOOP;
271
272            case '\n':
273            case '\r':
274            case '\f':
275
276                // Note that, here, a `\r\n` is irrelevant, because this character will be
277                // "Re-Consumed" by the loop that called this method.  If this is a 2-character
278                // New-Line, it will be handled in by whatever surrounding method called this.
279
280                returnParsedToken.accept(new BadStr(css, POS.f, pos, b));
281
282                errorEncountered.accept(
283                    new TokenizeError(
284                        css, POS.f, pos, Str.class,
285                        "A String-Literal's Open-Quotation was found but a new-line character " +
286                        "has been encountered before the Matching, Closing Quotation-Mark was " +
287                        "identified."
288                    ));
289
290                POS.f = pos;
291                return;
292
293            // From the Pseudo-Code at the top of this method, this is the case:
294            // U+005C REVERSE SOLIDUS (\)
295
296            case '\\':
297
298                // EOF-Reached, then quit immediately.  There was a "REVERSE SOLIDUS" at the very
299                // last character of the CSS-File or String.
300
301                if ((++pos) == css.length) b.accept(0xFFFD); // U+FFFD REPLACEMENT CHARACTER (�)
302
303                // Here, if there is a `\r\n` it needs to be "consumed" immediately.  It cannot be
304                // ignored here.  Note that the original Pseudo-Code that was copied from:
305                //
306                // https://drafts.csswg.org/css-syntax-3/#consume-an-escaped-code-point
307                //
308                // There, it states that there should be a CSS "Pre-Processor Phase" that
309                // eliminates all '\r' and '\r\n' characters and replaces them with '\n'
310                //
311                // In Java-HTML, it is somewhat imperative that an HTML File can be perfectly
312                // re-constructed from a Vector<HTMLNode>.  Therefore the top-level design decision
313                // has been made such that any Vector<CSSToken> can be used to perfectly
314                // reconstruct a CSS File
315                // 
316                // As a result, running the CSS Pre-Processor suggested by the Web-Site is being
317                // ignored.  This means there are a few small places that need to handle
318                // "New-Lines" slightly more carefully.  This is one of them.
319
320                else if (((c = css[pos]) == '\n') || (c == '\f'))
321                {
322                    while ( (++pos < css.length)
323                        &&  ((c = css[pos]) == ' ') || (c == '\t'));
324                }
325
326                else if (c == '\r')
327                {
328                    // There was a properly-escaped new-line, but unfortunately, the next line was
329                    // empty.  This will be turned into a 'BadStr', at the very end of this method.
330
331                    if (++pos == css.length) break WHILE_LOOP;
332        
333                    // This is how `\r\n` is handled:  SKIP the subsequent `\n` after the `\r`
334                    if (css[pos] == '\n') pos++;
335
336                    // After the Reverse-Solidus (BackSlash), and after thew single or double
337                    // character new-line, then skip any spaces or tabs on the next line of text.
338
339                    while ( (pos < css.length)
340                        &&  ((c = css[pos]) == ' ') || (c == '\t'))
341                        pos++;
342                }
343
344                else pos = CSSUtil.consumeEscapedUnicode(css, pos, b);
345
346                continue WHILE_LOOP;
347
348            default:
349                b.accept(c);
350                pos++;
351        }
352
353        // Thesee statements are only reachable if the above loop was terminated by due to the 
354        // value of 'pos' reaching the end of the tr.css code-point array.  If these statemetns are
355        // reached, this is guaranteed to be an error.
356
357        returnParsedToken.accept(new BadStr(css, POS.f, css.length, b));
358
359        errorEncountered.accept(
360            new TokenizeError(
361                css, POS.f, pos, Str.class,
362                "A String Literal's Open-Quotation was found, but unfortunately EOF was reached " +
363                "before identifying the Matching, Closing-Quotation mark"
364            ));
365
366        POS.f = pos;
367    }    
368}