1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | package Torello.CSS; import Torello.Java.Additional.ByRef; import java.util.Vector; import java.util.function.Consumer; import java.util.stream.IntStream; /** * CSS-Tokenizer Class for {@code String}-Literals. */ @Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK") public class Str extends CSSToken implements CharSequence, java.io.Serializable, Comparable<CharSequence> { /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ protected static final long serialVersionUID = 1; // ******************************************************************************************** // ******************************************************************************************** // Public & Final Fields // ******************************************************************************************** // ******************************************************************************************** /** * The quotation mark type used to quote this {@code String}-Literal. The value placed in * this Java {@code char} primitive may only be a Single-Quotation Mark, or a Double-Quotation. * No other types of quotations are included in this class parser. */ public final char quote; /** * This is the actual {@code String}-Literal that this {@code CSSToken} represents. This Java * {@code String} will never actually contain the opening and closing quotation marks that were * used to create this {@code String}. * * <BR /><BR /><B CLASS=JDDescLabel>Unescaped String:</B> * * <BR />if this {@code String} utilized any Escape-Sequences representing Unicode Characters, * the Unescaped-Characters are used within this {@code String} to replace the original, * escaped, sequences. * * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Provided Exmaple:</B> * * <BR />There is a quoted, {@code String} below, provided by AI. Note that ChatGPT * initially gave me a slightly different answer written as <CODE>"\u2713 Checkmark"</CODE> * (which you may or may not notice has a {@code 'u'} character between the Reverse-Solidus * Backslash character and the Hexadecimal Characters {@code '2713'}). * * <BR /><BR />After further research, ChatGPT apologized for it's mistake saying: * * <BR /><BR /><I>"You're correct, and I apologize for the oversight. In CSS, Unicode escape * sequences within string literals do not start with the u character. Instead, they consist of * a backslash followed by up to six hexadecimal digits, representing the Unicode code point." * </I> * * <BR /><BR />This crap sort of amazes me. I really can't believe it. Anything that Stack * Overflow is busy condemning, with a very high likelihood is bound to be pretty good. * * <DIV CLASS=CSS>{@code * .selector::before * { * content: "\2713 Checkmark"; * font-family: Arial, sans-serif; * } * }</DIV> * * <BR />The above CSS-{@code String} (which is inside the {@code 'content'} Property-Value) * would be stored in the field {@code 'unescaped'} as: <CODE>✓ Checkmark</CODE>. * <BR /> */ public final String unescaped; // ******************************************************************************************** // ******************************************************************************************** // Private Constructor, API "is" and "if" Methods // ******************************************************************************************** // ******************************************************************************************** private Str( final int[] css, final int sPos, final int ePos, final IntStream.Builder b ) { super(css, sPos, ePos); this.quote = (char) css[sPos]; // This Code-Points array contains the String-Contents. Note that this array WILL NOT // contain the starting and ending quotation-marks. Also, if there were any escaped // characters / code-points in the String, they will have been unescaped, since there is no // longer any need to have the remain escaped. int[] codePoints = b.build().toArray(); // This does nothing more than convert an int[] Code-Points array to a java.lang.String. // This is one of Java's String Constructors. this.unescaped = new String(codePoints, 0, codePoints.length); } @Override public final boolean isStr() { return true; } @Override public final Str ifStr() { return this; } // ******************************************************************************************** // ******************************************************************************************** // Tokenizer's "is" Method(s) // ******************************************************************************************** // ******************************************************************************************** static boolean is(final int codePoint) { return (codePoint == '\'') || (codePoint == '"'); } // ******************************************************************************************** // ******************************************************************************************** // User's Constructor: a static "build" method // ******************************************************************************************** // ******************************************************************************************** /** * <EMBED CLASS=defs DATA-TOK=Str DATA-P=stringLiteral> * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC> * @param stringLiteral <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM> * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET> * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX> */ @SuppressWarnings("unchecked") public static Str build(final String stringLiteral) { return (Str) CSSToken.build(stringLiteral, INPUT_CHECKER, Str::consume); } private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) -> { if (! is(css[0])) throw new TokenizeException ("Input String does not start with a valid CSS String-Literal"); }; // ******************************************************************************************** // ******************************************************************************************** // CONSUME // ******************************************************************************************** // ******************************************************************************************** // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** // COPIED FROM: // https://drafts.csswg.org/css-syntax-3/#consume-a-string-token // // COPIED ON: // March 26th, 2024 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** // // 4.3.5. Consume a string token // // This section describes how to consume a string token from a stream of code points. It // returns either a <string-token> or <bad-string-token>. // // This algorithm may be called with an ending code point, which denotes the code point that // ends the string. If an ending code point is not specified, the current input code point is // used. // // Initially create a <string-token> with its value set to the empty string. // [I interpret this as follows:] // final int quote = tr.css[tr.pos]; // final IntStream.Builder b = IntStream.builder(); // // Repeatedly consume the next input code point from the stream: // [This is fancy W3C Psuedo-Code Talk for a `use a for-loop`] // [Below is a For-Loop Body, written in W3C Psuedo-Code] // // ** ending code point [NOTE: This says: The Matching-and-Closing Quotation-Mark] // ==> Return the <string-token>. // // ** EOF // ==> This is a parse error. Return the <string-token>. // // ** newline // ==> This is a parse error. Reconsume the current input code point, create a // <bad-string-token>, and return it. // // ** U+005C REVERSE SOLIDUS (\) // ==> 1) If the next input code point is EOF, do nothing. // 2) Otherwise, if the next input code point is a newline, consume it. // 3) Otherwise, (the stream starts with a valid escape) consume an escaped code point // and append the returned code point to the <string-token>’s value. // // ** anything else // ==> Append the current input code point to the <string-token>’s value. // [Which I do using an IntStream.Builder as: b.accept(c);] // // A Bug? I don't know if I'm misunderstanding, but I think there is a minor bug in // the above Pseudo-Code. For the character after the Reverse-Solidus Option, Option #2 says // Consume the next-newline, but it doesn't seem skip all tabs and spaces after the newline has // been consumed. This method skips all ' ' and '\t' characters after consuming the newline. // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** /** * This is a tokenizer method which <B>"consumes"</B> the next {@code String}-Literal from the * input Code-Point Array. * * <EMBED CLASS=defs DATA-TOK=String-Literal DATA-URL=consume-string-token DATA-OP=Consume> * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOKEN> * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOK_SVG> */ public static void consume( // When invoked from 'CSSTokenizer' final int[] css, // C, int[] css final ByRef<Integer> POS, // P, array-pos loop-variable final Consumer<CSSToken> returnParsedToken, // T, Vector<CSSToken>.add final Consumer<TokenizeError> errorEncountered // E, Vector<TokenizeError>.add ) { final int quote = css[POS.f]; final IntStream.Builder b = IntStream.builder(); int pos=POS.f+1; int c; WHILE_LOOP: while (pos < css.length) switch (c = css[pos]) { case '\'': case '"': // This switch-statement is asking whether or not the quotation that was just // encountered is the same quotation that was used to start this string-literal. if (c == quote) { returnParsedToken.accept(new Str(css, POS.f, ++pos, b)); POS.f = pos; return; } // Again, the IntStream is there to help build the "Un-Escaped" Version of the // String. Note that this IntStream.Builder would be completely unnecessary if it // weren't for the Escaped-Characters. If there are not any Escaped-Chars, one // could simply build the String using the beginning and ending indices from the // int[] css array. // // ALSO: // // Since Java doesn't allow the variable `quote` inside of a `case`branch, BOTH // '\'' and '"' have to be in the case. For that reason alone, we also have to do // this `else` part, otherwise, this `else`branch would be handled automatically by // this Switch-Statement's `default` clause. // // FINALLY: // // This 'c' is just a Double-Quote that has harmlessly been placed within a String // that uses Single-Quotes... **OR** a harmless Single-Quote that was placed // within a String-Literal that was specified using Double-Quotes. b.accept(c); pos++; continue WHILE_LOOP; case '\n': case '\r': case '\f': // Note that, here, a `\r\n` is irrelevant, because this character will be // "Re-Consumed" by the loop that called this method. If this is a 2-character // New-Line, it will be handled in by whatever surrounding method called this. returnParsedToken.accept(new BadStr(css, POS.f, pos, b)); errorEncountered.accept( new TokenizeError( css, POS.f, pos, Str.class, "A String-Literal's Open-Quotation was found but a new-line character " + "has been encountered before the Matching, Closing Quotation-Mark was " + "identified." )); POS.f = pos; return; // From the Pseudo-Code at the top of this method, this is the case: // U+005C REVERSE SOLIDUS (\) case '\\': // EOF-Reached, then quit immediately. There was a "REVERSE SOLIDUS" at the very // last character of the CSS-File or String. if ((++pos) == css.length) b.accept(0xFFFD); // U+FFFD REPLACEMENT CHARACTER (�) // Here, if there is a `\r\n` it needs to be "consumed" immediately. It cannot be // ignored here. Note that the original Pseudo-Code that was copied from: // // https://drafts.csswg.org/css-syntax-3/#consume-an-escaped-code-point // // There, it states that there should be a CSS "Pre-Processor Phase" that // eliminates all '\r' and '\r\n' characters and replaces them with '\n' // // In Java-HTML, it is somewhat imperative that an HTML File can be perfectly // re-constructed from a Vector<HTMLNode>. Therefore the top-level design decision // has been made such that any Vector<CSSToken> can be used to perfectly // reconstruct a CSS File // // As a result, running the CSS Pre-Processor suggested by the Web-Site is being // ignored. This means there are a few small places that need to handle // "New-Lines" slightly more carefully. This is one of them. else if (((c = css[pos]) == '\n') || (c == '\f')) { while ( (++pos < css.length) && ((c = css[pos]) == ' ') || (c == '\t')); } else if (c == '\r') { // There was a properly-escaped new-line, but unfortunately, the next line was // empty. This will be turned into a 'BadStr', at the very end of this method. if (++pos == css.length) break WHILE_LOOP; // This is how `\r\n` is handled: SKIP the subsequent `\n` after the `\r` if (css[pos] == '\n') pos++; // After the Reverse-Solidus (BackSlash), and after thew single or double // character new-line, then skip any spaces or tabs on the next line of text. while ( (pos < css.length) && ((c = css[pos]) == ' ') || (c == '\t')) pos++; } else pos = CSSUtil.consumeEscapedUnicode(css, pos, b); continue WHILE_LOOP; default: b.accept(c); pos++; } // Thesee statements are only reachable if the above loop was terminated by due to the // value of 'pos' reaching the end of the tr.css code-point array. If these statemetns are // reached, this is guaranteed to be an error. returnParsedToken.accept(new BadStr(css, POS.f, css.length, b)); errorEncountered.accept( new TokenizeError( css, POS.f, pos, Str.class, "A String Literal's Open-Quotation was found, but unfortunately EOF was reached " + "before identifying the Matching, Closing-Quotation mark" )); POS.f = pos; } } |