package Torello.CSS;

import Torello.Java.Additional.ByRef;

import java.util.Vector;
import java.util.function.Consumer;
import java.util.stream.IntStream;

/**
 * CSS-Tokenizer Class for {@code String}-Literals.
 */
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
public class Str extends CSSToken
    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Public & Final Fields
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * The quotation mark type used to quote this {@code String}-Literal.  The value placed in
     * this Java {@code char} primitive may only be a Single-Quotation Mark, or a Double-Quotation.
     * No other types of quotations are included in this class parser.
     */
    public final char quote;

    /**
     * This is the actual {@code String}-Literal that this {@code CSSToken} represents.  This Java
     * {@code String} will never actually contain the opening and closing quotation marks that were
     * used to create this {@code String}.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Unescaped String:</B>
     *
     * <BR />If this {@code String} utilized any Escape-Sequences representing Unicode Characters,
     * the Unescaped-Characters are used within this {@code String} to replace the original,
     * escaped, sequences.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Provided Example:</B>
     *
     * <BR />There is a quoted, {@code String} below, provided by AI.  Note that ChatGPT
     * initially gave me a slightly different answer written as <CODE>"\u2713 Checkmark"</CODE>
     * (which you may or may not notice has a {@code 'u'} character between the Reverse-Solidus
     * Backslash character and the Hexadecimal Characters {@code '2713'}).
     *
     * <BR /><BR />After further research, ChatGPT apologized for its mistake saying:
     *
     * <BR /><BR /><I>"You're correct, and I apologize for the oversight. In CSS, Unicode escape
     * sequences within string literals do not start with the u character. Instead, they consist of
     * a backslash followed by up to six hexadecimal digits, representing the Unicode code point."
     * </I>
     *
     * <BR /><BR />This crap sort of amazes me.  I really can't believe it.  Anything that Stack
     * Overflow is busy condemning, with a very high likelihood is bound to be pretty good.
     *
     * <DIV CLASS=CSS>{@code
     * .selector::before
     * {
     *     content: "\2713 Checkmark";
     *     font-family: Arial, sans-serif;
     * }
     * }</DIV>
     *
     * <BR />The above CSS-{@code String} (which is inside the {@code 'content'} Property-Value)
     * would be stored in the field {@code 'unescaped'} as: <CODE>✓ Checkmark</CODE>.
     * <BR />
     */
    public final String unescaped;


    // ********************************************************************************************
    // ********************************************************************************************
    // Private Constructor, API "is" and "if" Methods
    // ********************************************************************************************
    // ********************************************************************************************


    // Builds a String-Literal token spanning css[sPos] (the opening quotation-mark) up to ePos.
    // The builder 'b' supplies the already-unescaped contents of the literal.
    private Str(
            final int[]             css,
            final int               sPos,
            final int               ePos,
            final IntStream.Builder b
        )
    {
        super(css, sPos, ePos);

        // The token starts on its opening quotation-mark, so that code-point is the quote type.
        this.quote = (char) css[sPos];

        // This Code-Points array contains the String-Contents.  Note that this array WILL NOT
        // contain the starting and ending quotation-marks.  Also, if there were any escaped
        // characters / code-points in the String, they will have been unescaped, since there is no
        // longer any need to have them remain escaped.

        final int[] codePoints = b.build().toArray();

        // This does nothing more than convert an int[] Code-Points array to a java.lang.String.
        // This is one of Java's String Constructors.

        this.unescaped = new String(codePoints, 0, codePoints.length);
    }

    @Override
    public final boolean isStr() { return true; }

    @Override
    public final Str ifStr() { return this; }


    // ********************************************************************************************
    // ********************************************************************************************
    // Tokenizer's "is" Method(s)
    // ********************************************************************************************
    // ********************************************************************************************


    // Returns TRUE if, and only if, 'codePoint' is a Single-Quote or a Double-Quote; meaning it
    // may open a CSS String-Literal.
    static boolean is(final int codePoint)
    { return (codePoint == '\'') || (codePoint == '"'); }


    // ********************************************************************************************
    // ********************************************************************************************
    // User's Constructor: a static "build" method
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=stringLiteral>
     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
     * @param stringLiteral <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
     */
    @SuppressWarnings("unchecked")
    public static Str build(final String stringLiteral)
    { return (Str) CSSToken.build(stringLiteral, INPUT_CHECKER, Str::consume); }

    // Rejects any input whose first code-point is not a quotation-mark.
    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
    {
        if (! is(css[0])) throw new TokenizeException
            ("Input String does not start with a valid CSS String-Literal");
    };


    // ********************************************************************************************
    // ********************************************************************************************
    // CONSUME
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // COPIED FROM:
    // https://drafts.csswg.org/css-syntax-3/#consume-a-string-token
    //
    // COPIED ON:
    // March 26th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    //
    // 4.3.5. Consume a string token
    //
    // This section describes how to consume a string token from a stream of code points.  It
    // returns either a <string-token> or <bad-string-token>.
    //
    // This algorithm may be called with an ending code point, which denotes the code point that
    // ends the string.  If an ending code point is not specified, the current input code point is
    // used.
    //
    // Initially create a <string-token> with its value set to the empty string.
    //      [I interpret this as follows:]
    //      final int               quote   = tr.css[tr.pos];
    //      final IntStream.Builder b       = IntStream.builder();
    //
    // Repeatedly consume the next input code point from the stream:
    //      [This is fancy W3C Pseudo-Code Talk for `use a for-loop`]
    //      [Below is a For-Loop Body, written in W3C Pseudo-Code]
    //
    //  ** ending code point [NOTE: This says: The Matching-and-Closing Quotation-Mark]
    //      ==> Return the <string-token>.
    //
    //  ** EOF
    //      ==> This is a parse error.  Return the <string-token>.
    //
    //  ** newline
    //      ==> This is a parse error.  Reconsume the current input code point, create a
    //          <bad-string-token>, and return it.
    //
    //  ** U+005C REVERSE SOLIDUS (\)
    //      ==> 1) If the next input code point is EOF, do nothing.
    //          2) Otherwise, if the next input code point is a newline, consume it.
    //          3) Otherwise, (the stream starts with a valid escape) consume an escaped code point
    //             and append the returned code point to the <string-token>’s value.
    //
    //  ** anything else
    //      ==> Append the current input code point to the <string-token>’s value.
    //          [Which I do using an IntStream.Builder as: b.accept(c);]
    //
    // A Bug?  I don't know if I'm misunderstanding, but I think there is a minor bug in
    // the above Pseudo-Code.  For the character after the Reverse-Solidus Option, Option #2 says
    // Consume the next-newline, but it doesn't seem to skip all tabs and spaces after the newline
    // has been consumed.  This method skips all ' ' and '\t' characters after consuming the
    // newline.
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code String}-Literal from the
     * input Code-Point Array.
     *
     * <EMBED CLASS=defs DATA-TOK=String-Literal DATA-URL=consume-string-token DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOKEN>
     * <EMBED CLASS=external-html DATA-FILE-ID=STRING_TOK_SVG>
     *
     * @param css The CSS input, as an array of code-points.  {@code css[POS.f]} is read as the
     * opening quotation-mark of the {@code String}-Literal.
     *
     * @param POS On entry, {@code POS.f} is the index of the opening quotation-mark.  On exit,
     * {@code POS.f} has been advanced past the closing quotation-mark, or set to the index of
     * the unescaped new-line that terminated the literal, or set to the end-of-input position.
     *
     * @param returnParsedToken Receives the {@code Str} that was parsed or, when the literal is
     * unterminated, a {@code BadStr}.
     *
     * @param errorEncountered Receives a {@code TokenizeError} whenever a {@code BadStr} is
     * produced.
     */
    public static void consume(                         // When invoked from 'CSSTokenizer'
            final int[]                     css,                // C, int[] css
            final ByRef<Integer>            POS,                // P, array-pos loop-variable
            final Consumer<CSSToken>        returnParsedToken,  // T, Vector<CSSToken>.add
            final Consumer<TokenizeError>   errorEncountered    // E, Vector<TokenizeError>.add
        )
    {
        final int               quote   = css[POS.f];
        final IntStream.Builder b       = IntStream.builder();

        int pos = POS.f + 1;
        int c;

        WHILE_LOOP:
        while (pos < css.length) switch (c = css[pos])
        {
            case '\'':
            case '"':

                // This if-statement is asking whether or not the quotation that was just
                // encountered is the same quotation that was used to start this string-literal.

                if (c == quote)
                {
                    returnParsedToken.accept(new Str(css, POS.f, ++pos, b));
                    POS.f = pos;
                    return;
                }

                // Again, the IntStream is there to help build the "Un-Escaped" Version of the
                // String.  Note that this IntStream.Builder would be completely unnecessary if it
                // weren't for the Escaped-Characters.  If there are not any Escaped-Chars, one
                // could simply build the String using the beginning and ending indices from the
                // int[] css array.
                //
                // ALSO:
                //
                // Since Java doesn't allow the variable `quote` inside of a `case` branch, BOTH
                // '\'' and '"' have to be in the case.  For that reason alone, we also have to do
                // this `else` part, otherwise, this `else` branch would be handled automatically
                // by this Switch-Statement's `default` clause.
                //
                // FINALLY:
                //
                // This 'c' is just a Double-Quote that has harmlessly been placed within a String
                // that uses Single-Quotes...  **OR** a harmless Single-Quote that was placed
                // within a String-Literal that was specified using Double-Quotes.

                b.accept(c);

                pos++;
                continue WHILE_LOOP;

            case '\n':
            case '\r':
            case '\f':

                // Note that, here, a `\r\n` is irrelevant, because this character will be
                // "Re-Consumed" by the loop that called this method.  If this is a 2-character
                // New-Line, it will be handled by whatever surrounding method called this.

                returnParsedToken.accept(new BadStr(css, POS.f, pos, b));

                errorEncountered.accept(
                    new TokenizeError(
                        css, POS.f, pos, Str.class,
                        "A String-Literal's Open-Quotation was found but a new-line character " +
                        "has been encountered before the Matching, Closing Quotation-Mark was " +
                        "identified."
                    ));

                POS.f = pos;
                return;

            // From the Pseudo-Code at the top of this method, this is the case:
            // U+005C REVERSE SOLIDUS (\)

            case '\\':

                // EOF-Reached: there was a "REVERSE SOLIDUS" at the very last character of the
                // CSS-File or String.  A U+FFFD REPLACEMENT CHARACTER (�) is appended, and since
                // 'pos' now equals css.length, the loop terminates and the EOF handling at the
                // bottom of this method runs.
                //
                // NOTE(review): the quoted Pseudo-Code says "do nothing" for this case; appending
                // U+FFFD is kept here exactly as the original code did.

                if ((++pos) == css.length)
                {
                    b.accept(0xFFFD);
                    continue WHILE_LOOP;
                }

                // Here, if there is a `\r\n` it needs to be "consumed" immediately.  It cannot be
                // ignored here.  Note that the original Pseudo-Code that was copied from:
                //
                // https://drafts.csswg.org/css-syntax-3/#consume-an-escaped-code-point
                //
                // There, it states that there should be a CSS "Pre-Processor Phase" that
                // eliminates all '\r' and '\r\n' characters and replaces them with '\n'
                //
                // In Java-HTML, it is somewhat imperative that an HTML File can be perfectly
                // re-constructed from a Vector<HTMLNode>.  Therefore the top-level design decision
                // has been made such that any Vector<CSSToken> can be used to perfectly
                // reconstruct a CSS File
                //
                // As a result, running the CSS Pre-Processor suggested by the Web-Site is being
                // ignored.  This means there are a few small places that need to handle
                // "New-Lines" slightly more carefully.  This is one of them.

                c = css[pos];

                if ((c == '\n') || (c == '\f'))
                {
                    // Step over the single-character new-line, and then over any spaces or tabs
                    // which begin the next line of text.

                    pos = skipSpacesAndTabs(css, pos + 1);
                }

                else if (c == '\r')
                {
                    // There was a properly-escaped new-line, but unfortunately, the next line was
                    // empty.  This will be turned into a 'BadStr', at the very end of this method.

                    if (++pos == css.length) break WHILE_LOOP;

                    // This is how `\r\n` is handled: SKIP the subsequent `\n` after the `\r`
                    if (css[pos] == '\n') pos++;

                    // After the Reverse-Solidus (BackSlash), and after the single or double
                    // character new-line, then skip any spaces or tabs on the next line of text.

                    pos = skipSpacesAndTabs(css, pos);
                }

                else pos = CSSUtil.consumeEscapedUnicode(css, pos, b);

                continue WHILE_LOOP;

            default:
                b.accept(c);
                pos++;
        }

        // These statements are only reachable if the above loop was terminated due to the
        // value of 'pos' reaching the end of the css code-point array.  If these statements are
        // reached, this is guaranteed to be an error.

        returnParsedToken.accept(new BadStr(css, POS.f, css.length, b));

        errorEncountered.accept(
            new TokenizeError(
                css, POS.f, pos, Str.class,
                "A String Literal's Open-Quotation was found, but unfortunately EOF was reached " +
                "before identifying the Matching, Closing-Quotation mark"
            ));

        POS.f = pos;
    }

    // Returns the first index at or after 'pos' whose code-point is neither a space nor a tab, or
    // css.length if no such index exists.
    //
    // The bounds-check is performed before every array read.  The loops which this helper replaces
    // were written as:  (pos < css.length) && (c == ' ') || (c == '\t')
    //
    // Since `&&` binds more tightly than `||`, a stale '\t' left in 'c' kept those loops running
    // after 'pos' had already moved beyond the end of the array.
    private static int skipSpacesAndTabs(final int[] css, int pos)
    {
        while (pos < css.length)
        {
            final int c = css[pos];

            if ((c != ' ') && (c != '\t')) break;

            pos++;
        }

        return pos;
    }
}