001package Torello.CSS; 002 003import Torello.Java.Additional.ByRef; 004 005import java.util.Vector; 006import java.util.function.Consumer; 007 008/** Represents a range of characters in Unicode. */ 009@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK") 010public class UnicodeRange extends CSSToken 011 implements CharSequence, java.io.Serializable, Comparable<CharSequence> 012{ 013 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 014 protected static final long serialVersionUID = 1; 015 016 // Don't worry about me right now. The Pseudo-Code said this should be "configurable". I have 017 // not implemented this idea at the moment. In order to properly implement this, I would need 018 // to build an entire "ParserBuilder" (a build that allows a user to configure-and-then-build) 019 // his own parser, JUST FOR THIS ONE STUPID CONFIGURATION-FIELD. 020 // 021 // Since this is the only configuration in the entire Parer's Suite of CONSUME methods, I think 022 // I'll one day make this a public & static Configuration-Field, and provide a javadoc warnign 023 // (Similar to this one you are reading right now). There is just simply no reason to over 024 // complicate this pacakge with a "Parser Builder" 025 026 static final boolean UNICODE_RANGES = true; 027 028 /** The starting value of the range that has been specified, as a Java Integer. */ 029 public final int sRange; 030 031 /** The ending value of the range that has been specified, as a Java Integer */ 032 public final int eRange; 033 034 035 // ******************************************************************************************** 036 // ******************************************************************************************** 037 // Private Constructor, API "is" and "if" Methods 038 // ******************************************************************************************** 039 // ******************************************************************************************** 040 041 042 private UnicodeRange(final int[] css, final int sPos, final int ePos, int sRange, int eRange) 043 { 044 super(css, sPos, ePos); 045 this.sRange = sRange; 046 this.eRange = eRange; 047 } 048 049 @Override 050 public final boolean isUnicodeRange() { return true; } 051 052 @Override 053 public final UnicodeRange ifUnicodeRange() { return this; } 054 055 056 // ******************************************************************************************** 057 // ******************************************************************************************** 058 // User's Constructor: a static "build" method 059 // ******************************************************************************************** 060 // ******************************************************************************************** 061 062 063 /** 064 * <EMBED CLASS=defs DATA-TOK=Str DATA-P=rangeStr> 065 * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC> 066 * @param rangeStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM> 067 * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET> 068 * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX> 069 */ 070 @SuppressWarnings("unchecked") 071 public static UnicodeRange build(final String rangeStr) 072 { return (UnicodeRange) CSSToken.build(rangeStr, INPUT_CHECKER, UnicodeRange::consume); } 073 074 private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) -> 075 { 076 if (css.length < 3) throw new TokenizeException(UnicodeRange.class); 077 078 if (! UnicodeRange.is(css, 0)) throw new TokenizeException 079 ("String-text beginning does not constitute a valid CSS UnicodeRange-Token"); 080 }; 081 082 083 // ******************************************************************************************** 084 // ******************************************************************************************** 085 // Tokenizer's "is" Method(s) 086 // ******************************************************************************************** 087 // ******************************************************************************************** 088 089 090 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 091 // Copied from: 092 // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-a-unicode-range 093 // April 27th, 2024 094 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 095 // 096 // 4.3.11. Check if three code points would start a unicode-range 097 // 098 // This section describes how to check if three code points would start a unicode-range. The 099 // algorithm described here can be called explicitly with three code points, or can be called 100 // with the input stream itself. In the latter case, the three code points in question are the 101 // current input code point and the next two input code points, in that order. 102 // 103 // NOTE: This algorithm will not consume any additional code points. 104 // 105 // If all of the following are true: 106 // 107 // 1) The first code point is either U+0055 LATIN CAPITAL LETTER U (U) or 108 // U+0075 LATIN SMALL LETTER U (u) 109 // 110 // 2) The second code point is U+002B PLUS SIGN (+). 111 // 112 // 3) The third code point is either U+003F QUESTION MARK (?) or a hex digit 113 // 114 // then return true. 115 // 116 // Otherwise return false. 117 118 /** 119 * Checks whether or not the next token to consume is a Unicode Range. 120 * <EMBED CLASS=defs DATA-TOK=Escape-Sequence 121 * DATA-URL=check-if-three-code-points-would-start-a-unicode-range DATA-OP=Check> 122 * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> 123 * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_UNIRANGE_3CP> 124 * @param css CSS-{@code String} as an array of code-points. 125 * @param sPos The array-index where the tokenizer is to consume its next token 126 * @return {@code TRUE} if and only if the next token in the array is a Unicode-Range 127 */ 128 public static boolean is(final int[] css, final int sPos) 129 { 130 if (! UNICODE_RANGES) return false; 131 132 final int c1 = ((sPos + 0) < css.length) ? css[sPos+0] : 0; 133 final int c2 = ((sPos + 1) < css.length) ? css[sPos+1] : 0; 134 final int c3 = ((sPos + 2) < css.length) ? css[sPos+2] : 0; 135 136 if ((c1 != 'u') && (c1 != 'U')) return false; 137 if (c2 != '+') return false; 138 if (c3 == '?') return true; 139 if ((c3 >= '0') && (c3 <= '9')) return true; 140 if ((c3 >= 'A') && (c3 <= 'F')) return true; 141 if ((c3 >= 'a') && (c3 <= 'f')) return true; 142 143 return false; 144 } 145 146 147 // ******************************************************************************************** 148 // ******************************************************************************************** 149 // CONSUME 150 // ******************************************************************************************** 151 // ******************************************************************************************** 152 153 154 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 155 // Copied From: 156 // https://drafts.csswg.org/css-syntax-3/#consume-unicode-range-token 157 // April 2024 158 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 159 // 160 // 4.3.14. Consume a unicode-range token 161 // 162 // This section describes how to consume a unicode-range token from a stream of code points. It 163 // returns a <unicode-range-token>. 164 // 165 // NOTE: This algorithm does not do the verification of the first few code points that are 166 // necessary to ensure the returned code points would constitute an 167 // <unicode-range-token>. Ensure that the stream would start a unicode-range before 168 // calling this algorithm. 169 // 170 // NOTE: This token is not produced by the tokenizer under normal circumstances. This algorithm 171 // is only called during consume the value of a unicode-range descriptor, which itself is 172 // only called as a special case for parsing the unicode-range descriptor; this single 173 // invocation in the entire language is due to a bad syntax design in early CSS. 174 // 175 // 1) Consume the next two input code points and discard them. 176 // 177 // 2) Consume as many hex digits as possible, but no more than 6. If less than 6 hex digits 178 // were consumed, consume as many U+003F QUESTION MARK (?) code points as possible, but no 179 // more than enough to make the total of hex digits and U+003F QUESTION MARK (?) code 180 // points equal to 6. 181 // 182 // Let first segment be the consumed code points. 183 // 184 // 3) If first segment contains any question mark code points, then: 185 // 186 // 1) Replace the question marks in first segment with U+0030 DIGIT ZERO (0) code points, 187 // and interpret the result as a hexadecimal number. Let this be start of range. 188 // 189 // 2) Replace the question marks in first segment with U+0046 LATIN CAPITAL LETTER F (F) 190 // code points, and interpret the result as a hexadecimal number. Let this be end of 191 // range. 192 // 193 // 3) Return a new <unicode-range-token> starting at start of range and ending at end of 194 // range. 195 // 196 // 4) Otherwise, interpret first segment as a hexadecimal number, and let the result be start 197 // of range. 198 // 199 // 5) If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex digit, 200 // then: 201 // 202 // 1) Consume the next input code point. 203 // 204 // 2) Consume as many hex digits as possible, but no more than 6. Interpret the consumed 205 // code points as a hexadecimal number. Let this be end of range. 206 // 207 // 3) Return a new <unicode-range-token> starting at start of range and ending at end of 208 // range. 209 // 210 // 6) Otherwise, return a new <unicode-range-token> both starting and ending at start of range. 211 212 /** 213 * This is a tokenizer method which <B>"consumes"</B> the next {@code UnicodeRange}-Token 214 * from the input Code-Point Array. 215 * 216 * <EMBED CLASS=defs DATA-TOK=UnicodeRange DATA-URL=consume-unicode-range-token 217 * DATA-OP=Consume> 218 * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG> 219 * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RANGE_TOKEN> 220 * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RG_TOK_SVG> 221 */ 222 protected static void consume( // When invoked from 'CSSTokenizer' 223 final int[] css, // C, int[] css 224 final ByRef<Integer> POS, // P, array-pos loop-variable 225 final Consumer<CSSToken> returnParsedToken 226 ) 227 { 228 // 1) Consume the next two input code points and discard them. 229 int pos = POS.f + 2; 230 231 // 2) Consume as many hex digits as possible, but no more than 6. If less than 6 hex 232 // digits were consumed, consume as many U+003F QUESTION MARK (?) code points as 233 // possible, but no more than enough to make the total of hex digits and 234 // U+003F QUESTION MARK (?) code points equal to 6. 235 // 236 // Let first segment be the consumed code points. 237 238 int count = 0; 239 boolean hadQuestionMarks = false; 240 241 while ( (count < 6) 242 && (pos < css.length) 243 && isHexDigit(css[pos]) 244 ) 245 { count++; pos++; } 246 247 while ( (count < 6) 248 && (pos < css.length) 249 && (css[pos] == '?') 250 ) 251 { count++; pos++; hadQuestionMarks=true;} 252 253 254 // *** 3) If first segment contains any question mark code points, then: 255 // [The next 3 bullet points were UN-INDENTED, for readability] 256 // 257 // 1) Replace the question marks in first segment with U+0030 DIGIT ZERO (0) code points, 258 // and interpret the result as a hexadecimal number. Let this be start of range. 259 // 260 // 2) Replace the question marks in first segment with U+0046 LATIN CAPITAL LETTER F (F) 261 // code points, and interpret the result as a hexadecimal number. Let this be end of 262 // range. 263 // 264 // 3) Return a new <unicode-range-token> starting at start of range and ending at end of 265 // range. 266 267 final String s1 = new String(css, POS.f + 2, pos - (POS.f + 2)); 268 final int sRange, eRange; 269 270 if (hadQuestionMarks) 271 { 272 sRange = Integer.parseInt(s1.replace('?', '0'), 16); 273 eRange = Integer.parseInt(s1.replace('?', 'F'), 16); 274 275 returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, eRange)); 276 POS.f = pos; 277 return; 278 } 279 280 // 4) Otherwise, interpret first segment as a hexadecimal number, and let the result be 281 // start of range. 282 283 else sRange = Integer.parseInt(s1, 16); 284 285 // *** 5) If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex 286 // digit, then: 287 // [The next 3 bullet points were UN-INDENTED, for readability] 288 // 289 // 1) Consume the next input code point. 290 // 291 // 2) Consume as many hex digits as possible, but no more than 6. Interpret the consumed 292 // code points as a hexadecimal number. Let this be end of range. 293 // 294 // 3) Return a new <unicode-range-token> starting at start of range and ending at end of 295 // range. 296 297 if (((pos+1) < css.length) && (css[pos] == '-') && isHexDigit(css[pos+1])) 298 { 299 pos++; 300 count = 0; 301 int c; 302 final StringBuilder sb = new StringBuilder(); 303 304 while ( (count < 6) 305 && (pos < css.length) 306 && isHexDigit(c = css[pos]) 307 ) 308 { 309 sb.append((char) c); 310 count++; 311 pos++; 312 } 313 314 eRange = Integer.parseInt(sb.toString(), 16); 315 returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, eRange)); 316 POS.f = pos; 317 return; 318 } 319 320 // 6) Otherwise, return a new <unicode-range-token> both starting and ending at start of 321 // range. 322 323 returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, sRange)); 324 POS.f = pos; 325 } 326 327 private static boolean isHexDigit(int codePoint) 328 { 329 return 330 ((codePoint >= '0') && (codePoint <= '9')) 331 || ((codePoint >= 'a') && (codePoint <= 'f')) 332 || ((codePoint >= 'A') && (codePoint <= 'F')); 333 } 334}