001package Torello.CSS;
002
003import Torello.Java.Additional.ByRef;
004import Torello.Java.Additional.EffectivelyFinal;
005import Torello.Java.UnreachableError;
006
007import java.util.Vector;
008import java.util.stream.IntStream;
009import java.util.function.Consumer;
010
011/** Any {@code URL} */
012@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
013public class URLToken extends CSSToken
014    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
015{
016    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
017    protected static final long serialVersionUID = 1;
018
019    /** The unescaped text that constitutes this {@code URL}. */
020    public final String unescapedURL;
021
022
023    // ********************************************************************************************
024    // ********************************************************************************************
025    // Private Constructor, API "is" and "if" Methods
026    // ********************************************************************************************
027    // ********************************************************************************************
028
029
030    private URLToken(
031            final int[]             css,
032            final int               sPos,
033            final int               ePos,
034            final IntStream.Builder urlStrBuilder
035        )
036    {
037        super(css, sPos, ePos);
038
039        int[] urlArr        = urlStrBuilder.build().toArray();
040        this.unescapedURL   = new String(urlArr, 0, urlArr.length);
041    }
042
043    @Override 
044    public final boolean isURL() { return true; }
045
046    @Override
047    public final URLToken ifURL() { return this; }
048
049
050    // ********************************************************************************************
051    // ********************************************************************************************
052    // User's Constructor: a static "build" method
053    // ********************************************************************************************
054    // ********************************************************************************************
055
056
057    /**
058     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=urlStr>
059     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
060     * @param urlStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
061     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
062     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
063     */
064    @SuppressWarnings("unchecked")
065    public static URLToken build(final String urlStr)
066    {
067        if (urlStr.length() == 0) throw new TokenizeException();
068
069        final int[] css = urlStr.codePoints().toArray();
070
071        if (css.length < 1) throw new TokenizeException(URLToken.class);
072
073        if (Whitespace.is(css[0])) throw new TokenizeException
074            ("A URL cannot begin with Whitespace.");
075
076        final EffectivelyFinal<CSSToken> saveIt = new EffectivelyFinal<>(null);
077
078        final Consumer<CSSToken> acceptor = (CSSToken t) ->
079        {
080            if (t instanceof Whitespace) throw new TokenizeException
081                ("The URL provided contained unescaped Whitespace");
082
083            else if (t instanceof Comment) throw new TokenizeException
084                ("The URL provided contained a CSS Comment");
085
086            else if (t instanceof BadURL) throw new TokenizeException
087                ("The URL provided was parsed into an instanceof BadURL: [" + t.str + "]");
088
089            else if (t instanceof URLToken) saveIt.f = t;
090
091            // These are the only types that may be returned by Class CSSToken
092            else throw new UnreachableError();
093        };
094
095        URLToken.consume(
096            css,
097            new ByRef<>(0),
098            acceptor,
099            (TokenizeError te) -> te.throwException(),
100            true
101        );
102
103        // Need to guarantee that the entire String was consumed in the process of tokenizing the
104        // input String.  'TokenzeException' has a nicely worded Esception-Message to explain what
105        // has occured here.
106
107        if (urlStr.length() != saveIt.f.str.length())
108            throw new TokenizeException(urlStr, saveIt.f.str);
109
110        return (URLToken) saveIt.f;
111    }
112
113
114    // ********************************************************************************************
115    // ********************************************************************************************
116    // CONSUME
117    // ********************************************************************************************
118    // ********************************************************************************************
119
120
121    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
122    // Copied from:
123    // https://drafts.csswg.org/css-syntax-3/#consume-url-token
124    // April 2024
125    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
126    //
127    // 4.3.6. Consume a url token
128    // 
129    // This section describes how to consume a url token from a stream of code points. It returns
130    // either a <url-token> or a <bad-url-token>.
131    // 
132    // NOTE: This algorithm assumes that the initial "url(" has already been consumed. This
133    // algorithm also assumes that it’s being called to consume an "unquoted" value, like url(foo).
134    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like
135    // token automatically handles this distinction; this algorithm shouldn’t be called directly
136    // otherwise.
137    // 
138    // 1. Initially create a <url-token> with its value set to the empty string.
139    // 
140    // 2. Consume as much whitespace as possible.
141    // 
142    // 3. Repeatedly consume the next input code point from the stream:
143    // 
144    // **   U+0029 RIGHT PARENTHESIS ())
145    //      Return the <url-token>.
146    // 
147    // **   EOF
148    //      This is a parse error. Return the <url-token>.
149    // 
150    // **   whitespace
151    //      Consume as much whitespace as possible. If the next input code point is
152    //      U+0029 RIGHT PARENTHESIS ()) or EOF, consume it and return the <url-token> (if EOF was
153    //      encountered, this is a parse error); otherwise, consume the remnants of a bad url,
154    //      create a <bad-url-token>, and return it.
155    // 
156    // **   U+0022 QUOTATION MARK (")
157    // **   U+0027 APOSTROPHE (')
158    // **   U+0028 LEFT PARENTHESIS (()
159    // **   non-printable code point
160    //      This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and
161    //      return it.
162    // 
163    // **   U+005C REVERSE SOLIDUS (\)
164    //      If the stream starts with a valid escape, consume an escaped code point and append the
165    //      returned code point to the <url-token>’s value.
166    // 
167    //      Otherwise, this is a parse error. Consume the remnants of a bad url, create a
168    //      <bad-url-token>, and return it.
169    // 
170    // **   anything else
171    //      Append the current input code point to the <url-token>’s value.
172
173    /**
174     * This is a tokenizer method which <B>"consumes"</B> the next {@code URLToken} from the input
175     * Code-Point Array.
176     * 
177     * <EMBED CLASS=defs DATA-TOK=URLToken DATA-URL=consume-url-token DATA-OP=Consume>
178     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
179     * <EMBED CLASS=external-html DATA-FILE-ID=URL_TOKEN>
180     * <EMBED CLASS=external-html DATA-FILE-ID=URL_TOK_SVG>
181     */
182    protected static void consume(                              // When invoked from 'CSSTokenizer'
183            final int[]                     css,                // C, int[] css
184            final ByRef<Integer>            POS,                // P, array-pos loop-variable
185            final Consumer<CSSToken>        returnParsedToken,  // T, Vector<CSSToken>.add
186            final Consumer<TokenizeError>   errorEncountered,   // E, Vector<TokenizeError>.add
187            final boolean                   fromBuildMethod // Minor-Hack to solve a problems
188                                                            // SOLVING-PROBLEMS, THAT'S WHAT WE DO
189        )
190    {
191        final IntStream.Builder urlStrBuilder   = IntStream.builder();
192        final int               sPos            = POS.f;
193
194        int c;
195
196        while (POS.f < css.length) switch (c = css[POS.f])
197        {
198            // **   U+0029 RIGHT PARENTHESIS ())
199            //      Return the <url-token>.
200
201            case ')':
202
203                returnParsedToken.accept(new URLToken(css, sPos, POS.f, urlStrBuilder));
204                return;
205
206            // **   whitespace
207            //      Consume as much whitespace as possible. If the next input code point is
208            //      U+0029 RIGHT PARENTHESIS ()) or EOF, consume it and return the <url-token> (if EOF was
209            //      encountered, this is a parse error); otherwise, consume the remnants of a bad url,
210            //      create a <bad-url-token>, and return it.
211
212            case '\u000B':
213            case ' ':
214            case '\t':
215            case '\f':
216            case '\n':
217            case '\r':
218
219                final int ePos = POS.f;
220
221                Vector<CSSToken>    v           = new Vector<>();
222                Consumer<CSSToken>  acceptor    = v::add;
223
224                while (POS.f < css.length)
225
226                    if (Whitespace.is(css[POS.f]))
227                        Whitespace.consume(css, POS, acceptor);
228
229                    else if (Comment.is(css, POS.f))
230                        Comment.consume(css, POS, acceptor, errorEncountered);
231
232                    else break;
233
234                // ==> EOF, consume it and return the <url-token> (if EOF was encountered, this is
235                //     a parse error)
236
237                if (POS.f >= css.length)
238                {
239                    if (! fromBuildMethod) errorEncountered.accept(
240                        new TokenizeError(
241                            css, sPos, POS.f, URLToken.class,
242                            "CSS-Input EOF was encountered before reaching the URL's closing ')'"
243                        ));
244
245                    returnParsedToken.accept(new URLToken(css, sPos, ePos, urlStrBuilder));
246                    if (v.size() > 0) for (CSSToken t : v) returnParsedToken.accept(t);
247                }
248
249                else if (css[POS.f] == ')')
250                {
251                    returnParsedToken.accept(new URLToken(css, sPos, ePos, urlStrBuilder));
252                    if (v.size() > 0) for (CSSToken t : v) returnParsedToken.accept(t);
253                }
254
255                else
256                {
257                    errorEncountered.accept(
258                        new TokenizeError(
259                            css, sPos, POS.f, URLToken.class,
260                            "Whitespace and/or comments before the end of a URL"
261                        ));
262
263                    BadURL.consume(css, POS, returnParsedToken, sPos);
264                }
265
266                return;
267 
268            // **   U+0022 QUOTATION MARK (")
269            // **   U+0027 APOSTROPHE (')
270            // **   U+0028 LEFT PARENTHESIS (()
271            // **   non-printable code point
272            //      This is a parse error. Consume the remnants of a bad url, create a
273            //      <bad-url-token>, and return it.
274
275            case '"':
276            case '\'':
277            case '(':
278
279                errorEncountered.accept(
280                    new TokenizeError(
281                        css, sPos, POS.f, URLToken.class,
282                        "Unescaped Character within URL Found: ['" + c + "'']"
283                    ));
284
285                // NOTE: The "non-printable code-point" will be handled by the default-case
286                BadURL.consume(css, POS, returnParsedToken, sPos);
287                return;
288
289 
290            // **   U+005C REVERSE SOLIDUS (\)
291            //      If the stream starts with a valid escape, consume an escaped code point and
292            //      append the returned code point to the <url-token>’s value.
293            // 
294            //      Otherwise, this is a parse error. Consume the remnants of a bad url, create a
295            //      <bad-url-token>, and return it.
296
297            case '\\':
298
299                if (CSSUtil.isValidEscape(css, POS.f))
300                {
301                    POS.f = CSSUtil.consumeEscapedUnicode(css, POS.f+1, urlStrBuilder);
302                    break;
303                }
304
305                else
306                {
307                    errorEncountered.accept(
308                        new TokenizeError(
309                            css, sPos, POS.f, URLToken.class,
310                            "A Reverse-Solidu (Backslash) Character was encountered, but " +
311                            "unfortunately it was not a valid CSS Character-Escape Sequence"
312                        ));
313
314                    BadURL.consume(css, POS, returnParsedToken, sPos);
315                    return;
316                }
317
318            // **   non-printable code point
319            //      This is a parse error. Consume the remnants of a bad url, create a
320            //      <bad-url-token>, and return it.
321            // 
322            // **   anything else
323            //      Append the current input code point to the <url-token>’s value.
324
325            default:
326
327                if (CSSUtil.nonPrintableCodePoint(c))
328                {
329                    errorEncountered.accept(
330                        new TokenizeError(
331                            css, sPos, POS.f, URLToken.class,
332                            "A non-printable Code-Point was Encountered (CodePonit #" + c + ")"
333                        ));
334
335                    BadURL.consume(css, POS, returnParsedToken, sPos);
336                    return;
337                }
338
339                POS.f++;
340                urlStrBuilder.accept(c);
341        }
342
343
344        // If this line is reached, it means that the loop "broke" because the end of the CSS was
345        // reached.  If there had been a proper ending to the URL, it would already have been 
346        // returned inside the Loop's Main Switch-Statement
347        // 
348        // MINOR-SPAGHETTI: If this "consume" method is called from "build", then there will not
349        //                  be a closing ')'.  If this line is reached, and it was called from the
350        //                  build-method, this is success, rather than failure
351        //
352        // The value of boolean "fromBuildMethod" is retrieved as a parameter from this method's
353        // input-parameters.  URLToken.consume(...) is called from one two places:
354        //      1) Identifer.consumeIdentLikeSequence
355        //      2) Method "build" (at the top of this class)
356        //
357        // When called from
358        //      2) Identifier:      fromBuildMethod ==> false
359        //      3) Build (above):   fromBuildMethod ==> true
360
361        if (! fromBuildMethod) errorEncountered.accept(
362            new TokenizeError(
363                css, sPos, POS.f, URLToken.class,
364                "EOF Encountered prior to reaching the end of a URL"
365            ));
366
367        returnParsedToken.accept(new URLToken(css, sPos, POS.f, urlStrBuilder));
368    }
369}