UnicodeRange.java.html

package Torello.CSS;

import Torello.Java.Additional.ByRef;

import java.util.Vector;
import java.util.function.Consumer;

/** Represents a range of characters in Unicode. */
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="CSS_TOK")
public class UnicodeRange extends CSSToken
    implements CharSequence, java.io.Serializable, Comparable<CharSequence>
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1;

    // Don't worry about me right now.  The Pseudo-Code said this should be "configurable".  I have
    // not implemented this idea at the moment.  In order to properly implement this, I would need
    // to build an entire "ParserBuilder" (a builder that allows a user to configure-and-then-build
    // his own parser), JUST FOR THIS ONE CONFIGURATION-FIELD.
    // 
    // Since this is the only configuration in the entire Parser's Suite of CONSUME methods, I
    // think I'll one day make this a public & static Configuration-Field, and provide a javadoc
    // warning (Similar to this one you are reading right now).  There is simply no reason to over
    // complicate this package with a "Parser Builder".

    static final boolean UNICODE_RANGES = true;

    /** The starting value of the range that has been specified, as a Java Integer. */
    public final int sRange;

    /** The ending value of the range that has been specified, as a Java Integer */
    public final int eRange;


    // ********************************************************************************************
    // ********************************************************************************************
    // Private Constructor, API "is" and "if" Methods
    // ********************************************************************************************
    // ********************************************************************************************


    // Instances are only created by 'consume' (below).  The first three arguments are handed,
    // unchanged, to the CSSToken super-constructor; the last two are stored in the public fields.
    private UnicodeRange(final int[] css, final int sPos, final int ePos, int sRange, int eRange)
    {
        super(css, sPos, ePos);
        this.sRange = sRange;
        this.eRange = eRange;
    }

    /** Always returns {@code TRUE} for instances of this class. */
    @Override 
    public final boolean isUnicodeRange() { return true; }

    /** Always returns {@code 'this'} for instances of this class. */
    @Override
    public final UnicodeRange ifUnicodeRange() { return this; }


    // ********************************************************************************************
    // ********************************************************************************************
    // User's Constructor: a static "build" method
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS=defs DATA-TOK=Str DATA-P=rangeStr>
     * <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_DESC>
     * @param rangeStr <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_PARAM>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_RET>
     * @throws TokenizeException <EMBED CLASS='external-html' DATA-FILE-ID=BUILD_TOK_EX>
     */
    @SuppressWarnings("unchecked")
    public static UnicodeRange build(final String rangeStr)
    { return (UnicodeRange) CSSToken.build(rangeStr, INPUT_CHECKER, UnicodeRange::consume); }

    // Validation passed to 'CSSToken.build': rejects input shorter than the three code points
    // that 'is' inspects, and input for which 'is' (at index 0) reports false.
    private static final CSSToken.InputChecker INPUT_CHECKER = (int[] css) ->
    {
        if (css.length < 3) throw new TokenizeException(UnicodeRange.class);

        if (! UnicodeRange.is(css, 0)) throw new TokenizeException
            ("String-text beginning does not constitute a valid CSS UnicodeRange-Token");
    };


    // ********************************************************************************************
    // ********************************************************************************************
    // Tokenizer's "is" Method(s)
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied from:
    // https://drafts.csswg.org/css-syntax-3/#check-if-three-code-points-would-start-a-unicode-range
    // April 27th, 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.11. Check if three code points would start a unicode-range
    // 
    // This section describes how to check if three code points would start a unicode-range. The
    // algorithm described here can be called explicitly with three code points, or can be called
    // with the input stream itself. In the latter case, the three code points in question are the
    // current input code point and the next two input code points, in that order.
    // 
    // NOTE: This algorithm will not consume any additional code points.
    // 
    // If all of the following are true:
    // 
    // 1)   The first code point is either U+0055 LATIN CAPITAL LETTER U (U) or
    //      U+0075 LATIN SMALL LETTER U (u)
    // 
    // 2)   The second code point is U+002B PLUS SIGN (+).
    // 
    // 3)   The third code point is either U+003F QUESTION MARK (?) or a hex digit
    // 
    // then return true.
    // 
    // Otherwise return false.

    /**
     * Checks whether or not the next token to consume is a Unicode Range.
     * <EMBED CLASS=defs DATA-TOK=Escape-Sequence
     *      DATA-URL=check-if-three-code-points-would-start-a-unicode-range DATA-OP=Check>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=CHECK_UNIRANGE_3CP>
     * @param css CSS-{@code String} as an array of code-points.
     * @param sPos The array-index where the tokenizer is to consume its next token
     * @return {@code TRUE} if and only if the next token in the array is a Unicode-Range
     */
    public static boolean is(final int[] css, final int sPos)
    {
        if (! UNICODE_RANGES) return false;

        // Positions at or beyond the end of the array read as 0, which fails every test below.
        final int c1 = ((sPos + 0) < css.length) ? css[sPos+0] : 0;
        final int c2 = ((sPos + 1) < css.length) ? css[sPos+1] : 0;
        final int c3 = ((sPos + 2) < css.length) ? css[sPos+2] : 0;

        return  ((c1 == 'u') || (c1 == 'U'))        // 1) 'U' or 'u'
            &&  (c2 == '+')                         // 2) '+'
            &&  ((c3 == '?') || isHexDigit(c3));    // 3) '?' or a hex digit
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // CONSUME
    // ********************************************************************************************
    // ********************************************************************************************


    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // Copied From:
    // https://drafts.csswg.org/css-syntax-3/#consume-unicode-range-token
    // April 2024
    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
    // 
    // 4.3.14. Consume a unicode-range token
    // 
    // This section describes how to consume a unicode-range token from a stream of code points. It
    // returns a <unicode-range-token>.
    // 
    // NOTE: This algorithm does not do the verification of the first few code points that are
    //       necessary to ensure the returned code points would constitute an
    //       <unicode-range-token>. Ensure that the stream would start a unicode-range before
    //       calling this algorithm.
    // 
    // NOTE: This token is not produced by the tokenizer under normal circumstances. This algorithm
    //       is only called during consume the value of a unicode-range descriptor, which itself is
    //       only called as a special case for parsing the unicode-range descriptor; this single
    //       invocation in the entire language is due to a bad syntax design in early CSS.
    // 
    // 1)   Consume the next two input code points and discard them.
    // 
    // 2)   Consume as many hex digits as possible, but no more than 6. If less than 6 hex digits
    //      were consumed, consume as many U+003F QUESTION MARK (?) code points as possible, but no
    //      more than enough to make the total of hex digits and U+003F QUESTION MARK (?) code
    //      points equal to 6.
    // 
    //      Let first segment be the consumed code points.
    // 
    // 3)   If first segment contains any question mark code points, then:
    // 
    //      1)  Replace the question marks in first segment with U+0030 DIGIT ZERO (0) code points,
    //          and interpret the result as a hexadecimal number. Let this be start of range.
    // 
    //      2)  Replace the question marks in first segment with U+0046 LATIN CAPITAL LETTER F (F)
    //          code points, and interpret the result as a hexadecimal number. Let this be end of
    //          range.
    // 
    //      3)  Return a new <unicode-range-token> starting at start of range and ending at end of
    //          range.
    // 
    // 4)   Otherwise, interpret first segment as a hexadecimal number, and let the result be start
    //      of range.
    // 
    // 5)   If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex digit,
    //      then:
    // 
    //      1)  Consume the next input code point.
    // 
    //      2)  Consume as many hex digits as possible, but no more than 6. Interpret the consumed
    //          code points as a hexadecimal number. Let this be end of range.
    // 
    //      3) Return a new <unicode-range-token> starting at start of range and ending at end of
    //         range.
    // 
    // 6) Otherwise, return a new <unicode-range-token> both starting and ending at start of range.

    /**
     * This is a tokenizer method which <B>"consumes"</B> the next {@code UnicodeRange}-Token
     * from the input Code-Point Array.
     * 
     * <EMBED CLASS=defs DATA-TOK=UnicodeRange DATA-URL=consume-unicode-range-token
     *      DATA-OP=Consume>
     * <EMBED CLASS=external-html DATA-FILE-ID=COPIED_CSS_WG>
     * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RANGE_TOKEN>
     * <EMBED CLASS=external-html DATA-FILE-ID=UNICODE_RG_TOK_SVG>
     * 
     * @param css CSS-{@code String} as an array of code-points.
     * 
     * @param POS On entry, {@code POS.f} is the array-index of the first code-point of the token
     * (the {@code 'U'} or {@code 'u'}).  On return, {@code POS.f} has been set to the index of
     * the first code-point that follows the consumed token.
     * 
     * @param returnParsedToken Receives the newly created {@code UnicodeRange} instance.  It is
     * invoked exactly once, before {@code POS.f} is updated.
     * 
     * @throws TokenizeException If neither a hex-digit nor a question-mark follows the first two
     * code-points, meaning the input at {@code POS.f} does not start a Unicode-Range.  Use
     * {@link #is(int[], int)} to check before invoking this method.
     */
    protected static void consume(                          // When invoked from 'CSSTokenizer'
            final int[]                 css,                // C, int[] css
            final ByRef<Integer>        POS,                // P, array-pos loop-variable
            final Consumer<CSSToken>    returnParsedToken
        )
    {
        final int tokenStart = POS.f;

        // 1)   Consume the next two input code points and discard them.
        int pos = tokenStart + 2;

        // 2)   First segment: up to 6 hex digits, then '?' code points until the combined total
        //      reaches 6.  The numeric values are accumulated during the scan.  At most six
        //      nibbles (24 bits) are shifted in, so an 'int' cannot overflow.

        int count   = 0;
        int start   = 0;

        while ((count < 6) && (pos < css.length) && isHexDigit(css[pos]))
        {
            start = (start << 4) | Character.digit(css[pos], 16);
            count++;
            pos++;
        }

        // 3)   Each '?' contributes a '0' nibble to the start of the range, and an 'F' nibble to
        //      the end of the range.

        int     end                 = start;
        boolean hadQuestionMarks    = false;

        while ((count < 6) && (pos < css.length) && (css[pos] == '?'))
        {
            start   = start << 4;
            end     = (end << 4) | 0xF;
            hadQuestionMarks = true;
            count++;
            pos++;
        }

        // An empty first segment means the caller skipped the 'is' check.  Fail with a message
        // that names the actual problem, rather than parsing an empty hexadecimal number.

        if (count == 0) throw new TokenizeException(
            "Input at the current position does not begin a CSS UnicodeRange-Token: neither a " +
            "hex-digit nor a '?' follows the first two code-points"
        );

        // 4) & 5)  Without question marks, an optional "-" followed by 1 to 6 hex digits
        //          provides an explicit end of range.  The hyphen is only consumed when a
        //          hex digit actually follows it.

        if (    (! hadQuestionMarks)
            &&  ((pos + 1) < css.length)
            &&  (css[pos] == '-')
            &&  isHexDigit(css[pos + 1])
        )
        {
            pos++;
            end = 0;

            for (count = 0; (count < 6) && (pos < css.length) && isHexDigit(css[pos]); count++)
            {
                end = (end << 4) | Character.digit(css[pos], 16);
                pos++;
            }
        }

        // 6)   When neither of the branches above changed it, 'end' is still equal to 'start',
        //      and the token both starts and ends at start of range.

        returnParsedToken.accept(new UnicodeRange(css, tokenStart, pos, start, end));
        POS.f = pos;
    }

    // Returns true if, and only if, 'codePoint' is an ASCII hexadecimal digit: 0-9, a-f or A-F.
    private static boolean isHexDigit(int codePoint)
    {
        return
            ((codePoint >= '0') && (codePoint <= '9'))
        ||  ((codePoint >= 'a') && (codePoint <= 'f'))
        ||  ((codePoint >= 'A') && (codePoint <= 'F'));
    }
}