001package Torello.CSS; 002 003import Torello.Java.Additional.ByRef; 004 005import java.util.Vector; 006import java.util.stream.IntStream; 007import java.util.function.Consumer; 008 009@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK") 010public class Identifier extends CSSToken 011 implements CharSequence, java.io.Serializable, Comparable<CharSequence> 012{ 013 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 014 protected static final long serialVersionUID = 1; 015 016 017 // ******************************************************************************************** 018 // ******************************************************************************************** 019 // Public & Final Fields 020 // ******************************************************************************************** 021 // ******************************************************************************************** 022 023 024 /** 025 * This contains the <B STYLE='color: red;'><I>unescaped</I></B> text that that constitutes 026 * this identifier. Identifiers are permitted to use Escaped-Unicode Sequences. If any 027 * characters were escaped, this {@code String} will have the unescaped variant of the 028 * {@code String} stored here. 029 */ 030 public final String identifier; 031 032 033 // ******************************************************************************************** 034 // ******************************************************************************************** 035 // Private Constructor, API "is" and "if" Methods 036 // ******************************************************************************************** 037 // ******************************************************************************************** 038 039 040 Identifier( 041 final int[] css, 042 final int sPos, 043 final int ePos, 044 final String identifier 045 ) 046 { 047 super(css, sPos, ePos); 048 this.identifier = identifier; 049 } 050 051 @Override 052 public final boolean isIdentifier() { return true; } 053 054 @Override 055 public final Identifier ifIdentifier() { return this; } 056 057 058 // ******************************************************************************************** 059 // ******************************************************************************************** 060 // User's Constructor: a static "build" method 061 // ******************************************************************************************** 062 // ******************************************************************************************** 063 064 065 /** 066 * <EMBED CLASS=defs DATA-TOK=Identifier DATA-P=identStr> 067 * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC> 068 * @param identStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM> 069 * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET> 070 * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX> 071 */ 072 @SuppressWarnings("unchecked") 073 public static Identifier build(final String identStr) 074 { return (Identifier) CSSToken.build(identStr, INPUT_CHECKER, Identifier::_PRIVATE_CONSUME); } 075 076 private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) -> 077 { 078 if (css.length < 1) throw new TokenizeException(Identifier.class); 079 080 if (! startsIdentSequence(css, 0)) throw new TokenizeException 081 ("String-text beginning does not constitute a valid CSS Identifier-Token"); 082 }; 083 084 085 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 086 // Note: This is an "Extra Consume" Method. Class Identifier has no straight-forward consumer 087 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 088 // 089 // Static-Builder Methods really aren't 1/100th as nice and direct as an actual constructor. 090 // As I explained in the class CSSToken, there is not really any way around this - unless I 091 // were to completely clobber the CSS-Working-Group's Pseudo-Code and extra provided-diagrams. 092 // 093 // These "Static Build" methods aren't that bad, but since "Class Identifier" is the second 094 // biggest of the Data-Classes / Parse-Classes, this "build" method is slightly more 095 // complicated 096 // 097 // This method is a PRIVATE-METHOD which is only invoked by the "build"-Method that is directly 098 // above! The code below is DOING ABSOLUTELY-NOTHING but adding and removing "wrappers". 099 // Wrappers inside of code usually really are a bit ugly (at least to me)... 100 // 101 // This stuff has been thoroughly tested, and it does work. I ran this through a bunch of 102 // different edge-cases. 103 104 private static final void _PRIVATE_CONSUME( 105 final int[] css, 106 final ByRef<Integer> POS, 107 final Consumer<CSSToken> returnParsedToken 108 ) 109 { 110 // ByRef / Wrappers... wrappers, wrappers, wrappers that is all this is doing... 111 // Every single one of the other classes that have a "consume" method utilize these input 112 // and output parameters so that they can interact with BOTH the parser AND the 113 // User-Constructor via the SAME EXACT "consume" method. 114 // 115 // Again, Class "Identifier" is the only of the classes that does not actually have a 116 // regular "consume" method, although, that is what this one essentially is. Since there 117 // must be Exception-Checks before all "consume" methods, (and since this one doesn't have 118 // one), this method, as mentioned before, is PRIVATE, and only invoked directly above by 119 // "build!" 120 121 final ByRef<String> ident = new ByRef<>(null); 122 final int ePos = Identifier.consumeIdentSequence(css, POS.f, ident); 123 returnParsedToken.accept(new Identifier(css, POS.f, ePos, ident.f)); 124 } 125 126 127 // ******************************************************************************************** 128 // ******************************************************************************************** 129 // Tokenizer's "is" Method(s) 130 // ******************************************************************************************** 131 // ******************************************************************************************** 132 133 134 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 135 // Copied from: 136 // https://drafts.csswg.org/css-syntax-3/#ident-code-point 137 // March 27th, 2024 138 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 139 // ** ident code point 140 // ==> An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-). 141 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 142 143 static boolean isIdentCodePoint(int codePoint) 144 { 145 if (isIdentStartCodePoint(codePoint)) return true; 146 if ((codePoint >= '0') && (codePoint <= '9')) return true; 147 if (codePoint == '-') return true; 148 149 return false; 150 } 151 152 153 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 154 // Copied from: 155 // https://drafts.csswg.org/css-syntax-3/#ident-start-code-point 156 // March 27th, 2024 157 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 158 // ** ident-start code point 159 // ==> A letter, a non-ASCII ident code point, or U+005F LOW LINE (_). 160 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 161 162 static boolean isIdentStartCodePoint(int codePoint) 163 { 164 if ((codePoint >= 'a') && (codePoint <= 'z')) return true; 165 if ((codePoint >= 'A') && (codePoint <= 'Z')) return true; 166 if (codePoint == '_') return true; 167 if (isNonASCIIIdentCodePoint(codePoint)) return true; 168 169 return false; 170 } 171 172 173 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 174 // Copied from: 175 // https://drafts.csswg.org/css-syntax-3/#non-ascii-ident-code-point 176 // March 27th, 2024 177 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 178 // ** non-ASCII ident code point 179 // ==> A code point whose value is any of: 180 // * U+00B7 181 // * between U+00C0 and U+00D6 182 // * between U+00D8 and U+00F6 183 // * between U+00F8 and U+037D 184 // * between U+037F and U+1FFF 185 // * U+200C, U+200D, U+203F, U+2040 186 // * between U+2070 and U+218F 187 // * between U+2C00 and U+2FEF 188 // * between U+3001 and U+D7FF 189 // * between U+F900 and U+FDCF 190 // * between U+FDF0 and U+FFFD 191 // * greater than or equal to U+10000 192 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 193 194 static boolean isNonASCIIIdentCodePoint(int c) 195 { 196 if (c == 0x00B7) return true; 197 198 // between U+00C0 and U+00D6 199 // between U+00D8 and U+00F6 200 // between U+00F8 and U+037D 201 // between U+037F and U+1FFF 202 203 if ((c >= 0x00C0) && (c <= 0x1FFF)) 204 { 205 return (c != 0x00D7) 206 && (c != 0x00F7) 207 && (c != 0x037E); 208 } 209 210 // U+200C, U+200D, U+203F, U+2040 211 if (c == 0x200C) return true; 212 if (c == 0x200D) return true; 213 if (c == 0x203F) return true; 214 if (c == 0x2040) return true; 215 216 // between U+2070 and U+218F 217 if ((c >= 0x2070) && (c <= 0x218F)) return true; 218 219 // between U+2C00 and U+2FEF 220 if ((c >= 0x2C00) && (c <= 0x2FEF)) return true; 221 222 // between U+3001 and U+D7FF 223 if ((c >= 0x2001) && (c <= 0xD7FF)) return true; 224 225 // between U+F900 and U+FDCF 226 if ((c >= 0xF900) && (c <= 0xFDCF)) return true; 227 228 // between U+FDF0 and U+FFFD 229 if ((c >= 0xFDF0) && (c <= 0xFFFD)) return true; 230 231 // greater than or equal to U+10000 232 if (c >= 0x10000) return true; 233 234 return false; 235 } 236 237 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 238 // Copied from: 239 // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-an-ident-sequence 240 // March 27th, 2024 241 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 242 // 243 // 4.3.9. Check if three code points would start an ident sequence 244 // 245 // This section describes how to check if three code points would start an ident sequence. The 246 // algorithm described here can be called explicitly with three code points, or can be called 247 // with the input stream itself. In the latter case, the three code points in question are the 248 // current input code point and the next two input code points, in that order. 249 // 250 // NOTE: This algorithm will not consume any additional code points. 251 // 252 // Look at the first code point: 253 // 254 // ** U+002D HYPHEN-MINUS 255 // ==> If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or 256 // the second and third code points are a valid escape, return true. Otherwise, return 257 // false. 258 // 259 // ** ident-start code point 260 // ==> Return true. 261 // 262 // ** U+005C REVERSE SOLIDUS (\) 263 // ==> If the first and second code points are a valid escape, return true. 264 // Otherwise, return false. 265 // 266 // ** anything else 267 // ==> Return false. 268 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 269 270 /** 271 * Checks whether or not the next token to consume is one of three available identifier-token, 272 * classes. 273 * 274 * <EMBED CLASS=defs DATA-TOK=Identifier-Name 275 * DATA-URL=check-if-three-code-points-would-start-an-ident-sequence DATA-OP=Check> 276 * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> 277 * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_IDENT_SEQ_3CP> 278 * 279 * @param css CSS-{@code String} as an array of code-points. 280 * @param sPos The array-index where the tokenizer is to consume its next token 281 * @return {@code TRUE} if and only if the next token in the array is an identifier 282 */ 283 public static boolean startsIdentSequence(final int[] css, final int sPos) 284 { 285 final int c1 = ((sPos + 0) < css.length) ? css[sPos + 0] : 0; 286 final int c2 = ((sPos + 1) < css.length) ? css[sPos + 1] : 0; 287 final int c3 = ((sPos + 2) < css.length) ? css[sPos + 2] : 0; 288 289 // ** U+002D HYPHEN-MINUS 290 // ==> If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or 291 // the second and third code points are a valid escape, return true. Otherwise, return 292 // false. 293 294 if (c1 == '-') 295 { 296 if (isIdentStartCodePoint(c2)) return true; 297 if (c2 == '-') return true; 298 if (CSSUtil.isValidEscape(c2, c3)) return true; 299 300 return false; 301 } 302 303 // ** ident-start code point ==> Return true. 304 if (isIdentStartCodePoint(c1)) return true; 305 306 // ** U+005C REVERSE SOLIDUS (\) 307 // ==> If the first and second code points are a valid escape, return true. 308 // Otherwise, return false. 309 310 if (CSSUtil.isValidEscape(c1, c2)) return true; 311 312 // ** anything else ==> Return false. 313 return false; 314 } 315 316 317 // ******************************************************************************************** 318 // ******************************************************************************************** 319 // CONSUME 320 // ******************************************************************************************** 321 // ******************************************************************************************** 322 323 324 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 325 // Copied from: 326 // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-sequence 327 // March 27th, 2024 328 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 329 // 330 // 4.3.12. Consume an ident sequence 331 // 332 // This section describes how to consume an ident sequence from a stream of code points. It 333 // returns a string containing the largest name that can be formed from adjacent code points in 334 // the stream, starting from the first. 335 // 336 // NOTE: This algorithm does not do the verification of the first few code points that are 337 // necessary to ensure the returned code points would constitute an <ident-token>. If that is 338 // the intended use, ensure that the stream starts with an ident sequence before calling this 339 // algorithm. 340 // 341 // Let result initially be an empty string. 342 // 343 // Repeatedly consume the next input code point from the stream: 344 // 345 // ** ident code point: 346 // ==> Append the code point to result. 347 // 348 // ** the stream starts with a valid escape 349 // ==> Consume an escaped code point. Append the returned code point to result. 350 // 351 // ** anything else 352 // ==> Reconsume the current input code point. Return result. 353 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 354 355 /** 356 * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Sequence 357 * from the input Code-Point Array. 358 * 359 * <EMBED CLASS=defs DATA-TOK=Identifier-Sequence DATA-URL=consume-an-ident-sequence 360 * DATA-OP=Consume> 361 * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> 362 * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_SEQUENCE> 363 * <EMBED CLASS=external-html DATA-FILE-ID=IDENTIFIER_TOK_SVG> 364 */ 365 protected static int consumeIdentSequence( 366 final int[] css, 367 final int sPos, 368 final ByRef<String> identifier 369 ) 370 { 371 IntStream.Builder b = IntStream.builder(); 372 int c = 0; 373 int pos = sPos; 374 375 while (pos < css.length) 376 377 if (isIdentCodePoint(c = css[pos])) 378 { b.accept(c); pos++; } 379 380 else if (CSSUtil.isValidEscape(css, pos)) 381 pos = CSSUtil.consumeEscapedUnicode(css, ++pos, b); 382 383 // break happens before increment 384 else break; 385 386 int[] identifierAsCodePoints = b.build().toArray(); 387 388 // Uses the great Pass-Reference Tuple, which is ByRef 389 identifier.f = new String(identifierAsCodePoints, 0, identifierAsCodePoints.length); 390 391 return pos; 392 } 393 394 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 395 // Copied from: 396 // https://drafts.csswg.org/css-syntax-3/#consume-an-ident-like-token 397 // March 2024 398 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 399 // 400 // 4.3.4. Consume an ident-like token 401 // 402 // This section describes how to consume an ident-like token from a stream of code points. 403 // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>. 404 // 405 // Consume an ident sequence, and let **STRING** be the result. 406 // 407 // ** If string’s value is an ASCII case-insensitive match for "url", and the next input code 408 // point is U+0028 LEFT PARENTHESIS ((), consume it. While the next two input code points 409 // are whitespace, consume the next input code point. If the next one or two input code 410 // points are: 411 // * U+0022 QUOTATION MARK ("), 412 // * U+0027 APOSTROPHE ('), 413 // * or whitespace 414 // followed by: 415 // * U+0022 QUOTATION MARK (") 416 // * or U+0027 APOSTROPHE ('), 417 // 418 // then create a <function-token> with its value set to **STRING** and return it. 419 // Otherwise, consume a url token, and return it. 420 // 421 // ** Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. 422 // Create a <function-token> with its value set to **STRING** and return it. 423 // 424 // ** Otherwise, create an <ident-token> with its value set to **STRING** and return it. 425 426 /** 427 * This is a tokenizer method which <B>"consumes"</B> the next {@code Identifier}-Token (or 428 * Identifier-Token Subclass) from the input Code-Point Array. 429 * 430 * <EMBED CLASS=defs DATA-TOK=Identifier-Like-Token DATA-URL=consume-ident-like-token 431 * DATA-OP=Consume> 432 * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> 433 * <EMBED CLASS=external-html DATA-FILE-ID=IDENT_LIKE_TOKEN> 434 */ 435 protected static void consumeIdentLikeSequence( 436 final int[] css, 437 final ByRef<Integer> POS, 438 final Consumer<CSSToken> returnParsedToken, 439 final Consumer<TokenizeError> errorEncountered 440 ) 441 { 442 // Consume an ident sequence, and let string be the result. 443 ByRef<String> identifier = new ByRef<>(); 444 int identEPos = consumeIdentSequence(css, POS.f, identifier); 445 446 447 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 448 // FIRST-CASE: Handle the "url(" possibility 449 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 450 // 451 // If string’s value is an ASCII case-insensitive match for "url", and the next input code 452 // point is U+0028 LEFT PARENTHESIS ((), consume it 453 454 if ( identifier.f.equalsIgnoreCase("url") 455 && (identEPos < css.length) 456 && (css[identEPos] == '(') 457 ) 458 { 459 // Java-HTML isn't doing this in the EXACT-PRECISE order expresed in the Pseudo-Code 460 // Here, this is added first. It makes no different, because once the left-parenthesis 461 // has been identified, this is going to be a "Func" instance. Furthermore the end-pos 462 // of "Func(" will be exactly the left-parenthesis. 463 464 returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f)); 465 466 POS.f = identEPos + 1; 467 468 // Below is the actual Pseudo-Code Comment from the CSS-WG Document. Since Comments 469 // are still in the "int[] css", a while-loop is actually necessary - AND the "Func" 470 // instance needs to be inserted into the output-Consumer FIRST 471 // 472 // While the next two input code points are whitespace, consume the next input code 473 // point. 474 475 while (true) 476 477 if (Whitespace.is(css[POS.f])) 478 Whitespace.consume(css, POS, returnParsedToken); 479 480 else if (Comment.is(css, POS.f)) 481 Comment.consume(css, POS, returnParsedToken, errorEncountered); 482 483 else break; 484 485 // This part is identical to the CSS-WG Pseudo-Code, but again, it is every-so-slightly 486 // out of order. I don't actually stip out all the comments at the beginning, in the 487 // "PRE-PROCESSOR" phase. This Tokenizer does not employ the pre-processor. There are 488 // only two operations that are performed by the "PRE-PROCESSOR" - one is to stip all 489 // comments upon entry, and two is to replace all '\r\n' and '\r' with just '\n' 490 // 491 // That's all it does! I don't do either of those! Which is explained in the JavaDoc 492 // Pages - the Sum of all CSSToken's generated must be explicitly identical to the 493 // original CSS Input-String. 494 // 495 // Below is the original CSS-WG Comment. 496 // 497 // IT IS SOMEWHAT IMPORTANT TO NOTE/UNDERSTAND that ALL THIS IS SAYING is that if the 498 // "url(enclosed_stuff)" - the "enclosed_stuff" is encapsulated inside of quoation 499 // marks, then "enclosed_stuff" SHOULD BE IGNORED COMPLETELY, and handled as "Str" 500 // token by the MAIN-TOKENIZER LOOP (in class CSSTokenzier). 501 // 502 // The only way that an CSSToken instance of either "URLToken" or "BadURL" would EVER 503 // need to be parsed would be if the URL-Part - "enclosed_stuff" were ONLY encapsulated 504 // within parenthesis, and left-off / were-not-wrapped-with any quotation marks at all. 505 // 506 // If the next one or two input code points are 507 // * U+0022 QUOTATION MARK ("), 508 // * U+0027 APOSTROPHE ('), 509 // * or whitespace 510 // 511 // followed by: 512 // * U+0022 QUOTATION MARK (") 513 // * or U+0027 APOSTROPHE ('), 514 // 515 516 if ((css[POS.f] == '\'') || (css[POS.f] == '"') || (css[POS.f] == ')')) return; 517 518 // FINALLY, The "consume" method inside Class "URLToken" will actually generate a 519 // "BadURL" instance - if the URL is, indeed, bad! There is no need to worry about it 520 // here at all. 521 522 if (POS.f < css.length) 523 { 524 // NOTE: The 'false' at the end is to solve a minor problem where Class URLToken's 525 // "build" method is every-so-slightly different from the "consume" method. 526 // The "consume" method consumes a closing Right-Parenthesis, but the "build" 527 // method DOES NOT PROVIDE SUCH A PARENTHESIS. 528 // 529 // In order to differentiate the two different cases/situations, here 'false' is 530 // passed to method "consume", and when Class URLToken.build() is invoked, that 531 // method passes 'true' to the last parameter. 532 533 URLToken.consume(css, POS, returnParsedToken, errorEncountered, false); 534 return; 535 } 536 537 return; 538 } 539 540 541 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 542 // SECOND-CASE: Handle ALL OTHER "Func(" Possibilities 543 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 544 // 545 // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. 546 // Create a <function-token> with its value set to string and return it. 547 548 else if ((identEPos < css.length) && (css[identEPos] == '(')) 549 { 550 returnParsedToken.accept(new Func(css, POS.f, identEPos + 1, identifier.f)); 551 POS.f = identEPos + 1; 552 } 553 554 555 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 556 // LAST-CASE: This Just an "identifierStr" - NOT AN "identifierStr(" - No Left-Parenthesis 557 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 558 559 else 560 { 561 returnParsedToken.accept(new Identifier(css, POS.f, identEPos, identifier.f)); 562 POS.f = identEPos; 563 } 564 } 565}