001package Torello.CSS;
002
003import Torello.Java.Additional.ByRef;
004
005import java.util.Vector;
006import java.util.function.Consumer;
007
008/** Represents a range of characters in Unicode. */
009@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
010public class UnicodeRange extends CSSToken
011    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
012{
013    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
014    protected static final long serialVersionUID = 1;
015
016    // Don't worry about me right now.  The Pseudo-Code said this should be "configurable".  I have
017    // not implemented this idea at the moment.  In order to properly implement this, I would need
018    // to build an entire "ParserBuilder" (a build that allows a user to configure-and-then-build)
019    // his own parser, JUST FOR THIS ONE STUPID CONFIGURATION-FIELD.
020    // 
021    // Since this is the only configuration in the entire Parer's Suite of CONSUME methods, I think
022    // I'll one day make this a public & static Configuration-Field, and provide a javadoc warnign
023    // (Similar to this one you are reading right now).  There is just simply no reason to over
024    // complicate this pacakge with a "Parser Builder"
025
026    static final boolean UNICODE_RANGES = true;
027
028    /** The starting value of the range that has been specified, as a Java Integer. */
029    public final int sRange;
030
031    /** The ending value of the range that has been specified, as a Java Integer */
032    public final int eRange;
033
034
035    // ********************************************************************************************
036    // ********************************************************************************************
037    // Private Constructor, API "is" and "if" Methods
038    // ********************************************************************************************
039    // ********************************************************************************************
040
041
042    private UnicodeRange(final int[] css, final int sPos, final int ePos, int sRange, int eRange)
043    {
044        super(css, sPos, ePos);
045        this.sRange = sRange;
046        this.eRange = eRange;
047    }
048
049    @Override 
050    public final boolean isUnicodeRange() { return true; }
051
052    @Override
053    public final UnicodeRange ifUnicodeRange() { return this; }
054
055
056    // ********************************************************************************************
057    // ********************************************************************************************
058    // User's Constructor: a static "build" method
059    // ********************************************************************************************
060    // ********************************************************************************************
061
062
063    /**
064     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=rangeStr>
065     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
066     * @param rangeStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
067     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
068     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
069     */
070    @SuppressWarnings("unchecked")
071    public static UnicodeRange build(final String rangeStr)
072    { return (UnicodeRange) CSSToken.build(rangeStr, INPUT_CHECKER, UnicodeRange::consume); }
073
074    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
075    {
076        if (css.length < 3) throw new TokenizeException(UnicodeRange.class);
077
078        if (! UnicodeRange.is(css, 0)) throw new TokenizeException
079            ("String-text beginning does not constitute a valid CSS UnicodeRange-Token");
080    };
081
082
083    // ********************************************************************************************
084    // ********************************************************************************************
085    // Tokenizer's "is" Method(s)
086    // ********************************************************************************************
087    // ********************************************************************************************
088
089
090    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
091    // Copied from:
092    // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-a-unicode-range
093    // April 27th, 2024
094    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
095    // 
096    // 4.3.11. Check if three code points would start a unicode-range
097    // 
098    // This section describes how to check if three code points would start a unicode-range. The
099    // algorithm described here can be called explicitly with three code points, or can be called
100    // with the input stream itself. In the latter case, the three code points in question are the
101    // current input code point and the next two input code points, in that order.
102    // 
103    // NOTE: This algorithm will not consume any additional code points.
104    // 
105    // If all of the following are true:
106    // 
107    // 1)   The first code point is either U+0055 LATIN CAPITAL LETTER U (U) or
108    //      U+0075 LATIN SMALL LETTER U (u)
109    // 
110    // 2)   The second code point is U+002B PLUS SIGN (+).
111    // 
112    // 3)   The third code point is either U+003F QUESTION MARK (?) or a hex digit
113    // 
114    // then return true.
115    // 
116    // Otherwise return false.
117
118    /**
119     * Checks whether or not the next token to consume is a Unicode Range.
120     * <EMBED CLASS=defs DATA-TOK=Escape-Sequence
121     *      DATA-URL=check-if-three-code-points-would-start-a-unicode-range DATA-OP=Check>
122     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
123     * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_UNIRANGE_3CP>
124     * @param css CSS-{@code String} as an array of code-points.
125     * @param sPos The array-index where the tokenizer is to consume its next token
126     * @return {@code TRUE} if and only if the next token in the array is a Unicode-Range
127     */
128    public static boolean is(final int[] css, final int sPos)
129    {
130        if (! UNICODE_RANGES) return false;
131
132        final int c1 = ((sPos + 0) < css.length) ? css[sPos+0] : 0;
133        final int c2 = ((sPos + 1) < css.length) ? css[sPos+1] : 0;
134        final int c3 = ((sPos + 2) < css.length) ? css[sPos+2] : 0;
135
136        if ((c1 != 'u') && (c1 != 'U')) return false;
137        if (c2 != '+')                  return false;
138        if (c3 == '?')                  return true;
139        if ((c3 >= '0') && (c3 <= '9')) return true;
140        if ((c3 >= 'A') && (c3 <= 'F')) return true;
141        if ((c3 >= 'a') && (c3 <= 'f')) return true;
142
143        return false;        
144    }
145
146
147    // ********************************************************************************************
148    // ********************************************************************************************
149    // CONSUME
150    // ********************************************************************************************
151    // ********************************************************************************************
152
153
154    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
155    // Copied From:
156    // https://drafts.csswg.org/css-syntax-3/#consume-unicode-range-token
157    // April 2024
158    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
159    // 
160    // 4.3.14. Consume a unicode-range token
161    // 
162    // This section describes how to consume a unicode-range token from a stream of code points. It
163    // returns a <unicode-range-token>.
164    // 
165    // NOTE: This algorithm does not do the verification of the first few code points that are
166    //       necessary to ensure the returned code points would constitute an
167    //       <unicode-range-token>. Ensure that the stream would start a unicode-range before
168    //       calling this algorithm.
169    // 
170    // NOTE: This token is not produced by the tokenizer under normal circumstances. This algorithm
171    //       is only called during consume the value of a unicode-range descriptor, which itself is
172    //       only called as a special case for parsing the unicode-range descriptor; this single
173    //       invocation in the entire language is due to a bad syntax design in early CSS.
174    // 
175    // 1)   Consume the next two input code points and discard them.
176    // 
177    // 2)   Consume as many hex digits as possible, but no more than 6. If less than 6 hex digits
178    //      were consumed, consume as many U+003F QUESTION MARK (?) code points as possible, but no
179    //      more than enough to make the total of hex digits and U+003F QUESTION MARK (?) code
180    //      points equal to 6.
181    // 
182    //      Let first segment be the consumed code points.
183    // 
184    // 3)   If first segment contains any question mark code points, then:
185    // 
186    //      1)  Replace the question marks in first segment with U+0030 DIGIT ZERO (0) code points,
187    //          and interpret the result as a hexadecimal number. Let this be start of range.
188    // 
189    //      2)  Replace the question marks in first segment with U+0046 LATIN CAPITAL LETTER F (F)
190    //          code points, and interpret the result as a hexadecimal number. Let this be end of
191    //          range.
192    // 
193    //      3)  Return a new <unicode-range-token> starting at start of range and ending at end of
194    //          range.
195    // 
196    // 4)   Otherwise, interpret first segment as a hexadecimal number, and let the result be start
197    //      of range.
198    // 
199    // 5)   If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex digit,
200    //      then:
201    // 
202    //      1)  Consume the next input code point.
203    // 
204    //      2)  Consume as many hex digits as possible, but no more than 6. Interpret the consumed
205    //          code points as a hexadecimal number. Let this be end of range.
206    // 
207    //      3) Return a new <unicode-range-token> starting at start of range and ending at end of
208    //         range.
209    // 
210    // 6) Otherwise, return a new <unicode-range-token> both starting and ending at start of range.
211
212    /**
213     * This is a tokenizer method which <B>"consumes"</B> the next {@code UnicodeRange}-Token
214     * from the input Code-Point Array.
215     * 
216     * <EMBED CLASS=defs DATA-TOK=UnicodeRange DATA-URL=consume-unicode-range-token
217     *      DATA-OP=Consume>
218     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
219     * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RANGE_TOKEN>
220     * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RG_TOK_SVG>
221     */
222    protected static void consume(                          // When invoked from 'CSSTokenizer'
223            final int[]                 css,                // C, int[] css
224            final ByRef<Integer>        POS,                // P, array-pos loop-variable
225            final Consumer<CSSToken>    returnParsedToken
226        )
227    {
228        // 1)   Consume the next two input code points and discard them.
229        int pos = POS.f + 2;
230
231        // 2)   Consume as many hex digits as possible, but no more than 6. If less than 6 hex
232        //      digits were consumed, consume as many U+003F QUESTION MARK (?) code points as
233        //      possible, but no more than enough to make the total of hex digits and
234        //      U+003F QUESTION MARK (?) code points equal to 6.
235        // 
236        //      Let first segment be the consumed code points.
237
238        int     count               = 0;
239        boolean hadQuestionMarks    = false;
240
241        while (     (count < 6)
242                &&  (pos < css.length)
243                &&  isHexDigit(css[pos])
244            )
245            { count++; pos++; }
246
247        while (     (count < 6)
248                &&  (pos < css.length)
249                &&  (css[pos] == '?')
250            )
251            { count++; pos++; hadQuestionMarks=true;}
252
253
254        // *** 3) If first segment contains any question mark code points, then:
255        // [The next 3 bullet points were UN-INDENTED, for readability]
256        // 
257        // 1) Replace the question marks in first segment with U+0030 DIGIT ZERO (0) code points,
258        //    and interpret the result as a hexadecimal number. Let this be start of range.
259        // 
260        // 2) Replace the question marks in first segment with U+0046 LATIN CAPITAL LETTER F (F)
261        //    code points, and interpret the result as a hexadecimal number. Let this be end of
262        //    range.
263        // 
264        // 3) Return a new <unicode-range-token> starting at start of range and ending at end of
265        //    range.
266
267        final String s1 = new String(css, POS.f + 2, pos - (POS.f + 2));
268        final int sRange, eRange;
269
270        if (hadQuestionMarks)
271        {
272            sRange = Integer.parseInt(s1.replace('?', '0'), 16);
273            eRange = Integer.parseInt(s1.replace('?', 'F'), 16);
274
275            returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, eRange));
276            POS.f = pos;
277            return;
278        }
279
280        // 4) Otherwise, interpret first segment as a hexadecimal number, and let the result be
281        //    start of range.
282
283        else sRange = Integer.parseInt(s1, 16);
284
285        // *** 5) If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex
286        //        digit, then:
287        // [The next 3 bullet points were UN-INDENTED, for readability]
288        // 
289        // 1) Consume the next input code point.
290        // 
291        // 2) Consume as many hex digits as possible, but no more than 6. Interpret the consumed
292        //    code points as a hexadecimal number. Let this be end of range.
293        // 
294        // 3) Return a new <unicode-range-token> starting at start of range and ending at end of
295        //    range.
296
297        if (((pos+1) < css.length) && (css[pos] == '-') && isHexDigit(css[pos+1]))
298        {
299            pos++;
300            count = 0;
301            int c;
302            final StringBuilder sb = new StringBuilder();
303
304            while (     (count < 6)
305                    &&  (pos < css.length)
306                    &&  isHexDigit(c = css[pos])
307                )
308            {
309                sb.append((char) c);
310                count++;
311                pos++;
312            }
313
314            eRange = Integer.parseInt(sb.toString(), 16);
315            returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, eRange));
316            POS.f = pos;
317            return;
318        }
319 
320        // 6) Otherwise, return a new <unicode-range-token> both starting and ending at start of
321        //    range.
322
323        returnParsedToken.accept(new UnicodeRange(css, POS.f, pos, sRange, sRange));
324        POS.f = pos;    
325    }
326
327    private static boolean isHexDigit(int codePoint)
328    {
329        return
330            ((codePoint >= '0') && (codePoint <= '9'))
331        ||  ((codePoint >= 'a') && (codePoint <= 'f'))
332        ||  ((codePoint >= 'A') && (codePoint <= 'F'));
333    }
334}