Source code

001package Torello.Java;
002
003import Torello.Java.ReadOnly.ReadOnlySet;
004import Torello.Java.ReadOnly.ReadOnlyHashSet;
005import Torello.Java.ReadOnly.ReadOnlyList;
006import Torello.Java.ReadOnly.ReadOnlyArrayList;
007
008import Torello.Java.Additional.Counter;
009
010import java.util.regex.Pattern;
011import java.util.regex.Matcher;
012
013import java.util.stream.Stream;
014
015import java.util.function.Supplier;
016
017@Torello.JavaDoc.StaticFunctional
018public class StrSource
019{
020    private StrSource() { }
021
022
023    // ********************************************************************************************
024    // ********************************************************************************************
025    // FIELDS
026    // ********************************************************************************************
027    // ********************************************************************************************
028
029
030    private static final char[] REGEX_ESCAPE_CHARS_ARR =
031    { '\\', '/', '(', ')', '[', ']', '{', '}', '$', '^', '+', '*', '?', '-', '.' };
032
033    /**
034     * These are 'control' characters (Reg Ex Code), so they must be escaped if the are to be
035     * treated as their ASCII-equivalent values.
036     */
037    public static final ReadOnlySet<Character> REGEX_ESCAPE_CHARS =
038        new ReadOnlyHashSet<>(REGEX_ESCAPE_CHARS_ARR, null);
039
040    private static final char[] JS_ESCAPE_CHARS_ARR =
041    { '\\', '/', '\n', '\"' };
042
043    /**
044     * When converting a {@code String} for a Java-Script {@code String}, these are the 
045     * characters that must be escaped.
046     */
047    public static final ReadOnlySet<Character> JS_ESCAPE_CHARS = 
048        new ReadOnlyHashSet<>(JS_ESCAPE_CHARS_ARR, null);
049
050    /**
051     * The list of reserved Java Key-Words.  This list was written by ChatGPT on February 1st,
052     * 2024.
053     */
054    public static final ReadOnlyList<String> reservedKeywords = new ReadOnlyArrayList<>(
055        "abstract", "assert", "boolean", "break", "byte", "case", "catch", "char", "class",
056        "const", "continue", "default", "do", "double", "else", "enum", "extends", "false",
057        "final", "finally", "float", "for", "goto", "if", "implements", "import", "instanceof",
058        "int", "interface", "long", "native", "new", "null", "package", "permirs", "private",
059        "protected", "public", "return", "short", "static", "strictfp", "super", "switch",
060        "synchronized", "this", "throw", "throws", "transient", "true", "try", "void", "volatile",
061        "while"
062    );
063
064    /** This will match the definition for a java {@code 'Generic'} class or interface */
065    public static final Pattern GENERIC_PARAMS = Pattern.compile("^.+?<([\\s\\w\\<>,\\?]+)>$");
066
067    /** This shall match a Java Package {@code String} */
068    public static final Pattern PACKAGE_NAME = Pattern.compile("([A-Za-z_]\\w*\\.)+");
069
070
071    // ********************************************************************************************
072    // ********************************************************************************************
073    // Searching for a tag in an HTML string (the early way - without regular expressions)
074    // ********************************************************************************************
075    // ********************************************************************************************
076
077
078    /**
079     * If parameter {@code String s} contains any tag within-which there is a valid
080     * {@code "HREF"}, this will return the contents of the {@code HREF} Attribute/InnerTag.
081     * 
082     * @param s This is usually some variant of an HTML element/tag {@code String}.  This method
083     * was the first one written for HTML in this scrape package, and is just kept here for legacy
084     * reasons. The {@code class HTML.TagNode} has a number of options for extracting the
085     * {@code 'HREF'} attribute from an HTML element.
086     * 
087     * @return The attribute-value of an {@code HREF=...} attribute inside (usually an {@code <A>}
088     * 'Anchor') HTML tag. This will return 'null' if there is no {@code HREF="..."}
089     * attribute-value pair is found or identified.
090     * 
091     * @throws IllegalArgumentException If there is no end-quote found for the {@code HREF="..."}
092     * sub-string.
093     */
094    public static String grep_HREF_tag(String s)
095    {
096        s = s.toLowerCase();
097        String quote = "\"";
098
099        int hrefPos = s.indexOf("href=\"");
100
101        if (hrefPos == -1)
102        {
103            hrefPos = s.indexOf("href='");
104            if (hrefPos == -1) return null;
105            quote = "'";
106        }
107
108        // System.out.print("\t[hrefPos=" + hrefPos + "]");
109        
110        // the " + 6" is because the string HREF=" is 6 characters long
111        String ret = s.substring(hrefPos + 6);
112        int endQuotePos = ret.indexOf(quote);
113        
114        if (endQuotePos == -1) throw new IllegalArgumentException
115            ("HREF has no End-Quote!\n\nFor String:\n" + s);
116
117        // System.out.print("endQuotePos = " + endQuotePos + " " + ret.substring(0, endQuotePos));
118
119        return ret.substring(0,endQuotePos);
120    }
121
122    /**
123     * If parameter {@code String s} contains an HTML {@code "IMG"} tag, this will return the
124     * contents of the {@code "SRC=..."} attribute tag-field.
125     * 
126     * @param s This is usually some variant of an HTML element/tag {@code String}.  This method
127     * was the first one written for HTML in this scrape package, and is just kept here for legacy
128     * reasons. The {@code class HTML.TagNode} has a number of options for extracting the
129     * {@code 'SRC'} attribute from an HTML element.
130     * 
131     * @return The attribute-value of a {@code SRC=...} attribute inside (usually an {@code <IMG>}
132     * 'Image') HTML tag. 'null' is returned if:
133     * 
134     * <BR /><BR /><OL CLASS=JDOL>
135     * <LI>There is no HTML {@code 'IMG'} token found in the {@code String}</LI>
136     * <LI>There is no {@code SRC='...'} attribute-value pair found.</LI>
137     * </OL>
138     */
139    public static String grep_IMG_SRC_tag(String s)
140    {
141        String stlc = s.toLowerCase();
142        // System.out.println("1: " + stlc);
143        
144        int imgPos = stlc.indexOf("<img ");
145
146        if (imgPos == -1) return null;
147
148        stlc = stlc.substring(imgPos + 5);
149        // System.out.println("2: " + stlc + "[imgPos=" + imgPos + "]");
150
151        // first check for double-quotes
152        String  quote   = "\"";
153        int     srcPos  = stlc.indexOf("src=\"");
154
155        if (srcPos == -1)
156        {
157            // if no double-quotes, try single quotes
158            srcPos = stlc.indexOf("src='");
159
160            if (srcPos == -1) return null;
161
162            quote = "'";
163        }
164
165        stlc = stlc.substring(srcPos + 5);
166
167        // System.out.println("3: " + stlc + "[srcPos=" + srcPos + "]");
168        
169        int endSrcPos = stlc.indexOf(quote);
170
171        if (endSrcPos == -1) return null;
172        
173        int urlStart    = imgPos + srcPos + 10;
174        int urlEnd      = urlStart + endSrcPos;
175        
176        // System.out.println
177        //     ("4: [endSrcPos=" + endSrcPos + ", urlStart=" + urlStart + ", urlEnd=" + urlEnd);
178
179        return s.substring(urlStart, urlEnd);
180    }
181
182
183    // ********************************************************************************************
184    // ********************************************************************************************
185    // Java-Script & Reg-Ex String encoding (JSON.stringify())
186    // ********************************************************************************************
187    // ********************************************************************************************
188
189
190    /**
191     * <EMBED CLASS='external-html' DATA-FILE-ID=STRSRC_ESC_4JS>
192     * 
193     * @param str This may be any String in java.  It is intended to be inserted into a Java-Script
194     * file between an open and close quotation marks.  
195     * 
196     * @return The String that is returned will have certain characters escaped, so that it may be
197     * wrapped in quotation marks and easily inserted into any java-script ".js" text-file.
198     * 
199     * <BR /><BR /><B>Escaped-Text:</B>
200     * 
201     * <BR /><BR /><UL CLASS=JDUL>
202     * <LI> {@code char '\'} will be escaped to: {@code "\\"}</LI>
203     * 
204     * <LI> {@code char '/'} will be escaped to: {@code "\/"}, this is required in Java-Script, but
205     *      not Java!
206     *      </LI>
207     * 
208     * <LI> {@code char '"'} will be escaped to: {@code "\""}</LI>
209     * <LI> {@code char '\n'} will be escaped to: {@code "\\n"}</LI>
210     * </UL>
211     *
212     * <BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</B></SPAN> There is no easy, nor clear,
213     * way to express what is being replaced and/or escaped in a simple list.  You may run this
214     * method on any {@code String} and view for yourself what changes.  <B><I>The primary 
215     * goal</B></I> of the method is to allow <I>*any* Java String of *any* length</I> to be 
216     * converted, wrapped inside of an open and closed quotation-marks, and printed into a 
217     * Java-Script {@code ".js" file}.  Escaping "escape characters" which does come up some-what
218     * often in HTML text/string processing is near-impossible to explain clearly!  Review the
219     * stack-overflow "incantation" for possible help.
220     */
221    public static String escStrForJavaScript(String str)
222    { return StrReplace.r(str, JS_ESCAPE_CHARS_ARR, '\\'); }
223
224    /**
225     * This method should only be used for a <B><I>precise {@code String} match</I></B> using a
226     * regular-expression.  This method shall 'escape' all characters that the JVM Regular
227     * Expression Matcher in {@code package java.util.regex.*} would expect be escaped.  If the
228     * input parameter {@code 'str'} contains any regular-expression code, then this method would
229     * <B>FAIL</B> as it would escape regular-expression code into unusable text.
230     * 
231     * @param str This should be any {@code String} for which the user would like to find an
232     * <B>exact match, as-is</B>.
233     * 
234     * @return A regular-expression ready {@code String}
235     */
236    public static String escStrForRegEx(String str)
237    { return StrReplace.r(str, REGEX_ESCAPE_CHARS_ARR, '\\'); }
238
239
240    // ********************************************************************************************
241    // ********************************************************************************************
242    // Java Code String-Functions
243    // ********************************************************************************************
244    // ********************************************************************************************
245
246
247    /**
248     * Parses a {@code String} such as {@code T extends TreeMap<Integer, List<String>>}.  It is
249     * strictly used, to <B><I>only parse</I></B> the generic-definition lists that are at the top
250     * of generic <B>classes</B> and <B>interfaces</B>.
251     *
252     * <EMBED CLASS='external-html' DATA-FILE-ID=STRSRC_PARSE_GENT DATA-NODE="An Example of Sorts">
253     *
254     * @param genericTypeParamOrDefinition This should be {@code String} retrieved from inside the
255     * less-than ({@code '<'}) and greater-than ({@code '>'}) symbols.  For example, for 
256     * {@code SortedList<A extends Comparable, B>} the {@code String} passed to this method should
257     * be {@code "A extends Comparable, B"}
258     * 
259     * @return This should break down this {@code CSV} (comma separated value) list into 
260     * individual {@code String's}.
261     * 
262     * @throws NoMatchException if the input {@code String} parameter does not match the
263     * generics regular-expression {@link #GENERIC_PARAMS}.
264     * 
265     * @throws StringFormatException If the input {@code String} could not be parsed.
266     */
267    public static String[] parseGenericType(String genericTypeParamOrDefinition)
268    {
269        Matcher m               = GENERIC_PARAMS.matcher(genericTypeParamOrDefinition);
270        String  innerGenericStr = m.find() ? m.group(1) : null;
271
272        if (innerGenericStr == null) throw new NoMatchException(
273            "The provided value to parameter 'genericTypeParamOrDefinition' [" + 
274            genericTypeParamOrDefinition + "] did not match the Java Generics " +
275            "Regular-Expression:\n" + GENERIC_PARAMS.toString()
276        );
277
278        Stream.Builder<String>  b               = Stream.builder();
279        String[]                sArr            = innerGenericStr.split(",");
280
281        for (int i=0; i < sArr.length; i++)
282
283            // We have shifted elements, and now all of the remaining elements would be null
284            // return immediately
285
286            if (sArr[i] == null) return b.build().toArray(String[]::new);
287
288            // Simple generic-type definition: has no "sub-generics" or "inner-generics"
289            // Add this to the list, and move on
290
291            else if ((! sArr[i].contains("<")) && (! sArr[i].contains(">")))
292                b.accept(sArr[i].trim());
293
294            // This is a generic-type definition that has at least one "sub-generic"
295            // If there are an equal number of '<' and '>' then there were no commas
296            // in between the sub-generics.  Add this to this list, and move on.
297
298            else if (   StringParse.countCharacters(sArr[i], '<') ==
299                        StringParse.countCharacters(sArr[i], '>')
300            )
301                b.accept(sArr[i].trim());
302
303            // There was a generic with a sub-generic that had a comma...
304            else
305            {
306                // If we have reached the end of the String, the number of greater than and
307                // less than symbols was not balanced.
308
309                if (i == (sArr.length - 1)) throw new StringFormatException(
310                    "The provided value to parameter 'genericTypeParamOrDefinition' [" + 
311                    genericTypeParamOrDefinition + "], was not properly formatted, and could " +
312                    "not be parsed."
313                );
314
315                // Join the next String Array Element with the current one.
316                sArr[i] = sArr[i].trim() + ", " + sArr[i + 1].trim();
317
318                // Shift the rest of the array left.
319                for (int j=i+1; j < (sArr.length-1); j++) sArr[j] = sArr[j+1];
320                sArr[sArr.length - 1] = null;
321
322                // decrement the counter to retest this array-index location
323                i--;
324            }
325
326        // Return the list
327        return b.build().toArray(String[]::new);
328    }
329
330    /**
331     * This will print a caret-symbol on a line of text underneath the input {@code String}
332     * parameter {@code 'str'}.  Preceeding the caret-symbol will be exactly {@code strPos - 1}
333     * space characters.  This look of the output-{@code String} is similar to some of the error
334     * messages generated by a Java Compiler.
335     * 
336     * <BR /><BR />The caret-symbol {@code '^'} will bee pointing to the character at index
337     * {@code strPos}.
338     * 
339     * <DIV CLASS=EXAMPLE>{@code
340     * // Notice the (accidental, on-purpose) use of the '@'' character instead of an 'a'
341     * // To make this easy, lets compute the exact location of this erroneous character.
342     * String   s   = "This string has an inv@lid character.";
343     * int      pos = s.indexOf("@");
344     * 
345     * // This will print out a line of text containing the string, with a caret pointing
346     * // at the '@' symbol.
347     * System.out.println(StringParse.caretBeneath(s, pos));
348     *
349     * // PRINTS:
350     * // This string has an inv@lid character.
351     * //                       ^
352     * }</DIV>
353     * 
354     * @param str This may be any input-{@code String} that is less than 100 characters.
355     * 
356     * @param strPos This must be a number between 0 and the length
357     * 
358     * @return The same input-{@code String} with a second line appended underneath (using a
359     * newline) having a <B>caret</B> ({@code '^'}) directly underneath the character at
360     * {@code strPos}.
361     * 
362     * @throws IllegalArgumentException If the input {@code String} is longer than 
363     * {@code 100 characters}.
364     * 
365     * @throws StringFormatException If the input {@code String} contains any new-line {@code '\n'}
366     * or tab {@code '\t'} characters.
367     * 
368     * @throws StringIndexOutOfBoundsException If the value pased to {@code strPos} is negative or
369     * greater than the length of the input-{@code String}.
370     * 
371     * @see StringParse#nChars(char, int)
372     */
373    public static String caretBeneath(String str, int strPos)
374    {
375        if (str.length() > 100) throw new IllegalArgumentException(
376            "The length of the input-string must be less than 100.  str has length: " +
377            str.length()
378        );
379
380        if (StrCmpr.containsOR(str, "\n", "\t")) throw new StringFormatException
381            ("The input-string may not contain new-line or tab characters.");
382
383        if (strPos >= str.length()) throw new StringIndexOutOfBoundsException(
384            "The value you have passed to 'strPos' [" + strPos + "] is greater than the length " +
385            "the input-string [" + str.length() + "]"
386        );
387
388        if (strPos < 0) throw new StringIndexOutOfBoundsException
389            ("You have passed a negative value to strPos [" + strPos + "]");
390
391        return str + "\n" + StringParse.nChars(' ', strPos) + '^';
392    }
393
394    private static StringFormatException REM_GENERIC_ERROR_MSG(String s, int charPos)
395    { 
396        return new StringFormatException(
397            /*
398            "The opening '<' and closing '>' symbols in the type-string have not been " +
399            "properly placed.\n" +
400            */
401            "Generic Type-String Error, Beginning at Noted Location:\n" +
402            caretBeneath(s, charPos)
403        );
404    }
405
406    /**
407     * This will remove the generic type-parameters expression from a Java Type Declaration or
408     * Reference.  In simple terms, this removes the {@code '<K, V>'} from a {@code String} such
409     * as {@code Map.Entry<K, V>}.
410     * 
411     * <BR /><TABLE CLASS=JDBriefTable>
412     * <TR> <TH>Returned {@code String}</TH>
413     *      <TH>Input {@code String}</TH>
414     *      </TR>
415     * <TR> <TD>{@code "Vector"}</TD>
416     *      <TD>{@code "Vector<E>"}</TD>
417     *      </TR>
418     * <TR> <TD>{@code "AbstractHNLI"}</TD>
419     *      <TD>{@code "AbstractHNLI<E extends HTMLNode, F>"}</TD>
420     *      </TR>
421     * <TR> <TD>{@code "Torello.HTML.TagNode"}</TD>
422     *      <TD>{@code "Torello.HTML.TagNode"}</TD>
423     *      </TR>
424     * <TR> <TD>{@code "ClassA.InnerClassB.InnerClassC"}</TD>
425     *      <TD>{@code "ClassA<X>.InnerClassB<Y>.InnerClassC"}</TD>
426     *      </TR>
427     * <TR> <TD>{@code "String[]"}</TD>
428     *      <TD>{@code "String[]"}</TD>
429     *      </TR>
430     * <TR> <TD>{@code "java.lang.String[]"}</TD>
431     *      <TD>{@code "java.lang.String[]"}</TD>
432     *      </TR>
433     * <TR> <TD>{@code "Vector"}</TD>
434     *      <TD>{@code "Vector<String[]>"}</TD>
435     *      </TR>
436     * <TR> <TD>{@code "java.util.Vector"}</TD>
437     *      <TD>{@code "java.util.Vector<String[]>"}</TD>
438     *      </TR>
439     * <TR> <TH COLSPAN=2>Point of Interest:</TH>
440     *      </TR>
441     * <TR> <TD>"I watched the World Series"</TD>
442     *      <TD>"I watched the World Series"</TD>
443     *      </TR>
444     * <TR> <TD>{@code "Vector"}</TD>
445     *      <TD>{@code "Vector<Quoth the Raven>"}</TD>
446     *      </TR>
447     * <TR> <TH COLSPAN=2>Throws an Exception</TH></TR>
448     * <TR> <TD COLSPAN=2>{@code "HNLI<E> <"}</TD></TR>
449     * <TR> <TD COLSPAN=2>{@code "> <Quoth the Raven>"}</TD></TR>
450     * </TABLE>
451     * 
452     * @param typeAsStr The "Reference Type" or "Declaration Type".
453     * 
454     * @return The same {@code String}, having everything between the <B>outer-most, matching</B>
455     * {@code '<'} and {@code '>'} symbols.
456     * 
457     * <BR /><BR /><B>NOTE:</B> The returned {@code String} will not contain any leading or
458     * trailing white-space.  It is trimmed before being returned.
459     * 
460     * @throws StringFormatException An exhaustive check on everything that could be wrong with
461     * a type-{@code String} is an impossibility (if you include checking for valid types).  This
462     * exception is only thrown if the {@code '<'} and {@code '>'} symbols inside the
463     * input-{@code String} do not match-up.
464     * 
465     * <BR /><BR />In order to avoid throwing this exception, there must be an equal number of
466     * opening and closing symbols.
467     * 
468     * <BR /><BR />There is also a check to ensure that the charcters in this {@code String}
469     * are valid.
470     */
471    public static String removeGeneric(String typeAsStr)
472    {
473        int leftPos = typeAsStr.indexOf('<');
474
475        if (leftPos == -1)
476        {
477            int pos = typeAsStr.indexOf('>');
478
479            if (pos == -1) return typeAsStr.trim();
480
481            throw REM_GENERIC_ERROR_MSG(typeAsStr, pos);
482        }
483
484        char[]  cArr    = typeAsStr.toCharArray();
485        int     count   = 1;            // The number of OPENING-CLOSING tags (same as Inclusive)
486        int     END     = cArr.length;  // This is the location JUST-AFTER the last USEABLE-char
487        int     delta   = 0;            // How many characters have been deleted already.
488                                        // NOTE: This is zero, because the loop hasn't started.
489                                        //       If there is a "Shift" this will be PRECISELY-EQUAL
490                                        //       to the size of the last generic parameter-expression.
491                                        // ALSO: The only purpose of this is for error-reporting.
492
493        // check for a closing '>' before the first opening '<'
494        for (int j=0; j < leftPos; j++)
495            if (cArr[j] == '>') throw REM_GENERIC_ERROR_MSG(typeAsStr, j);
496
497        // Check for in-valid characters
498        // This is a lot of lines of code, but these methods are extremely short, and the input
499        // string (for all VALID) input will be very short.  This is peace of mind.  It checks...
500        for (int pos=0; pos < cArr.length; pos++)
501        {
502            char c = cArr[pos];
503            if (! Character.isJavaIdentifierPart(c))
504                if (! Character.isIdentifierIgnorable(c))
505                    if (! Character.isWhitespace(c))
506                        if (
507                                (c != '[') && (c != ']') && (c != '?') && (c != '<') &&
508                                (c != '>') && (c != ',') && (c != '.')
509                        )
510                            throw REM_GENERIC_ERROR_MSG(typeAsStr, pos);
511        }
512
513        do
514        {
515            // Keeps a count on the number of "Opening Braces" and "Closing Braces" 
516            // This is the same thing as the whole "Inclusive" deal, but with braces instead.
517            //
518            // count: At loop start, count is '1'  If it ever reaches 0, the loop exits.
519            // leftPos: The location of the '<' that has been found.
520            int i = leftPos + 1;
521    
522            while ((count > 0) && (i < END))
523            {
524                if      (cArr[i] == '<')    count++;
525                else if (cArr[i] == '>')    count--;
526
527                if (count > 0) i++;
528            }
529
530            // The '<' and the '>' didn't match up.  Better to throw exception, than ignore it.
531            if ((count != 0) && (i == END))
532                throw REM_GENERIC_ERROR_MSG(typeAsStr, leftPos);
533
534            int rightPos = i; // 'i' is currently pointing to the '>'
535
536            // Erase the most recently found <...> expression
537            int     sourcePos       = rightPos + 1; // Pointing at first VALID / NEED-TO-COPY char
538            int     destPos         = leftPos;      // Pointing at '<'
539            boolean possiblyAnother = false;
540
541            while (sourcePos < END)
542            {
543                // The next character to copy... check it first to see if it is valid!
544                char c = cArr[sourcePos]; 
545
546                // continue to shift all the characters left to erase the expression.
547                cArr[destPos] = c;
548
549                if (! possiblyAnother) // Haven't found an opening '<'
550                {
551                    // If there is a '>' - ***AND NO '<' HAS BEEN FOUND***, this is an error.    
552                    if (c == '>')
553                        throw REM_GENERIC_ERROR_MSG(typeAsStr, delta + sourcePos);
554
555                    // If there is another '<', then it is possible another expression awaits us
556                    if (c == '<')
557                    {
558                        // Reset the outer-loop variables for the next iteration.  There is going
559                        // to be another iteration - guaranteed.
560                        //
561                        // NOTE: Delta is supposed to hold how many characters are being deleted.
562                        //       This is used for proper error-reporting (only)
563
564                        // This is how many chars are in the current <...> expression
565                        delta   = rightPos - leftPos + 1;
566
567                        leftPos = destPos;  // Now pointing at the next open '<' char (just found!)
568                        count   = 1;        // There was a new-unclosed '>', prepares for next loop
569
570                        // You know it
571                        possiblyAnother = true;
572                    }
573                }
574
575                sourcePos++; destPos++;
576            }
577
578            // Completed without errors, and without another expression being found.
579            // NOTE: This used to be a one-line return call.
580            // ADDED: This now does a String.trim().   These little loops skip leading and 
581            //        trailing white-space BEFORE returning the String
582            //
583            // WORKS-NO-TRIM: return new String(cArr, 0, destPos);
584            //                replace loop-body with the above line to get rid of trim()
585            if (! possiblyAnother)
586            {
587                int sPos    = 0;
588                int len     = destPos;  // REMEMBER:    new String(char[], int OFFSET, int COUNT)
589                                        // NOT:         new String(char[], int SPOS, int EPOS)
590
591                // Skip LEADING-WHITESPACE
592                while ((sPos < cArr.length) && (destPos > 0) && Character.isWhitespace(cArr[sPos]))
593                { sPos++; destPos--; } // Advance start, *AND* shorten "count"
594
595                // Skip TRAILING WHITE-SPACE
596                while ((destPos > 1) && Character.isWhitespace(cArr[sPos + destPos-1]))
597                    destPos--; // Shorten length *ONLY*
598
599                return new String(cArr, sPos, destPos);
600            }
601            
602            END = destPos;  // Pointing at the first invalid / unused / ALREADY-MOVED char
603        }
604        while (true);
605    }
606
607    /**
608     * This will remove any generic-parameter information from a Java type-{@code String} <B>and
609     * then</B> remove all package-information or outer-class {@code String's}.  What is left 
610     * is a single <B>Java Identifier {@code String}</B> that, <I>as long as the proper scope has
611     * been provided</I>, identifies a Java Type (Class, Interface, Enum, Record, Annotation).
612     * 
613     * <BR /><TABLE CLASS=JDBriefTable>
614     * <TR><TH>Output</TH><TH>Input</TH></TR>
615     * <TR><TD>{@code "Integer"}</TD><TD>{@code "java.lang.Integer"}</TD></TR>
616     * <TR><TD>{@code "Vector"}</TD><TD>{@code "java.util.Vector<E>"}</TD></TR>
617     * <TR><TD>{@code "Entry"}</TD><TD>{@code "java.util.Map.Entry<String, Integer>"}</TD></TR>
618     * <TR><TD>{@code "Entry"}</TD><TD>{@code "Map.Entry<String, Intger>"}</TD></TR>
619     * <TR><TD>{@code "Entry"}</TD><TD>{@code "Entry<String, Integer>"}</TD></TR>
620     * <TR><TD>{@code "Entry"}</TD><TD>{@code "Entry"}</TD></TR>
621     * <TR><TD>{@code "String[]"}</TD><TD>{@code "String[]"}</TD></TR>
622     * <TR><TD>{@code "String[]"}</TD><TD>{@code "java.lang.String[]"}</TD></TR>
623     * <TR><TD>{@code "Vector"}</TD><TD>{@code "Vector<String[]>"}</TD></TR>
624     * <TR><TD>{@code "Vector[]"}</TD><TD>{@code "Vector<String>[]"}</TD></TR>
625     * 
626     * <TR><TH COLSPAN=2>Point of Interest:</TH></TR>
627     * <TR><TD>{@code "The World Series"}</TD><TD>{@code "The World Series"}</TD></TR>
628     * <TR><TD>{@code "Quoth the Raven"}</TD><TD>{@code "Quoth the Raven<java.lang.Integer>"}</TD></TR>
629     * 
630     * <TR><TH COLSPAN=2>Finally:</TH></TR>
631     * <TR><TD>{@code "String..."}</TD><TD>{@code "String..."}</TD></TR>
632     * <TR><TD>{@code "String..."}</TD><TD>{@code "java.lang.String..."}</TD></TR>
633     * <TR><TD>{@code "Vector..."}</TD><TD>{@code "Vector<E>..."}</TD></TR>
634     * <TR><TD>{@code "Vector..."}</TD><TD>{@code "java.util.Vector<E>..."}</TD></TR>
635     * </TABLE>
636     * 
637     * @param typeStr This is a type as a {@code String}.  These are usually retrieved from Java
638     * Parser, in the Java Doc Upgrader package.  This method does not provide an exhaustive
639     * check for all variants of format and naming erros of a Java Type.  Some validity checks
640     * are performed regarding the use of non-Java type characters.  
641     * 
642     * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> All the exceptions thrown by the method
643     * {@link #removeGeneric(String)} will also be thrown here, if {@code 'typeStr'} is not
644     * not properly formatted.
645     * 
646     * @return a Simplified version of the type that leaves out the scope, but provides a
647     * simple Java Identifier, instead.  Throws exceptions if not properly formatted.  If any
648     * array-bracket characters are passed, they is preserved, unless the arrays in this type
649     * are part of the generic-type parameters; please see the examples above.
650     * 
651     * @throws StringFormatException Please see the explanation provided in
652     * {@link #removeGeneric(String)} under 'Throws'.
653     * 
654     * @see #removeGeneric(String)
655     */
656    public static String typeToJavaIdentifier(String typeStr)
657    {
658        String  ret         = removeGeneric(typeStr);
659        boolean isVarArgs   = false;
660
661        if (ret.endsWith("..."))
662        {
663            ret = ret.substring(0, ret.length() - 3);
664            isVarArgs = true;
665        }
666
667        int pos = ret.lastIndexOf('.');
668
669        if (isVarArgs)
670        {
671            if (pos == -1)  return ret + "...";
672            else            return ret.substring(pos+1) + "...";
673        }
674
675        else
676        {
677            if (pos == -1)  return ret;
678            else            return ret.substring(pos+1);
679        }
680    }
681
682    // This was designed while staring at the field retrieved from a JavaDoc HTML Page that
683    // looked like this (from AbstractHNLI)
684    //        protected java.util.function.Predicate<E extends HTMLNode> p;
685    // This puts a group (group 1) around the ( extends HTMLNode) part, so it can be removed.
686    // JavaParser complained about it.
687
688    private static final Pattern exClause =
689        Pattern.compile("([A-Za-z][A-Za-z0-9]*)(\\s+extends\\s+[\\w\\.]+)");
690
691    /**
692     * Removes the {@code 'extends'} part of a Java Generic
693     * 
694     * <BR /><BR /><B STYLE='color:red;'>TO DO:</B> This will fail for a class such as:
695     * <BR />{@code public class MyClass<T extends Vector<String>}, where the extends clause
696     * also has a generic in it.  Java HTML does not define such classes, but they are possible,
697     * and this needs to be fixed, as soon as they let me!
698     * 
699     * @param decl Any Type Declaration that includes has the word {{@code 'extends'}},
700     * followed by type-parameter information.
701     * 
702     * @return The same {@code String} without the clause.
703     */
704    public static String removeExtendsClause(String decl)
705    {
706        Matcher m = exClause.matcher(decl);
707
708        while (m.find())
709        {
710            decl = m.replaceFirst(m.group(1));
711            m.reset(decl);
712        }
713
714        return decl;
715    }
716
717    /**
718     * <EMBED CLASS='external-html' DATA-FILE-ID=STRSRC_JTYPE_STR>
719     * 
720     * @param s Any Java {@code String}.
721     * 
722     * @return {@code TRUE} if and only if the Java Compiler could interpret {@code 's'} as a valid
723     * reference to a Java Type.  In computer-programming, the world <B>{@code Type}</B> can have a
724     * lot of meanings, but here, the word should be interpreted as a Java Class, Interface,
725     * Enumeration (an {@code 'enum'}), Annotation or Record.
726     * 
727     * <BR /><BR /><B>NOTE:</B> {@code 's'} may include the period {@code '.'} since inner classes,
728     * enum's and interfaces are also valid Java Type's.  Two consecutive period-characters, or a
729     * period at the beginning or ending of {@code 's'} will result in this method returning
730     * {@code FALSE}.
731     */
732    public static boolean isJavaTypeStr(String s)
733    {
734        if (s.length() == 0) return false;
735
736        // Java restricts the first character of a java-identifier to a smaller subset than the
737        // other characters in an identifier.  Use method 'isJavaIdentifierStart'
738
739        if (! Character.isJavaIdentifierStart(s.charAt(0))) return false;
740
741        int     len = s.length();
742        char    c   = 0;
743
744        for (int i=1; i < len; i++)
745
746            if (! Character.isJavaIdentifierPart(c = s.charAt(i)))
747            {
748                if (c == '.')
749                {
750                    // A second (subsequent) period-character (in a row) ==> FALSE
751                    if (s.charAt(i-1) == '.') return false;
752
753                    // The LAST character in the String is a period-character ==> FALSE
754                    if (i == (len-1)) return false;
755
756                    // The character immediately following a period isn't a valid Java Identifier
757                    // Start ==> FALSE
758
759                    if (! Character.isJavaIdentifierStart(s.charAt(++i))) return false;
760                }
761                else
762                    // Character is NEITHER a period, NOR a Java Identifier Part ==> FALSE
763                    return false;
764            }
765
766        // All metrics / tests have succeeded (which would have resulted in immediate exiting of
767        // this method, and a FALSE return value) ... therefore return TRUE.
768        return true;
769    }
770
771    /**
772     * Checks whether an input {@code String} would be allowed as a Java Identifier - for instance,
773     * whether the input would make a valid Field-Name, Variable-Name, Class-Name or Method-Name.
774     * 
775     * <BR /><BR /><B CLASS=JDDescLabel>ChatGPT Note:</B>
776     * 
777     * <BR /><B>ChatGPT, 3.5</B> wrote this whole thing, including the in-line comments.  I had to
778     * write the Java-Doc Comments, but I guess I could have asked it to do that too.
779     * 
780     * @param identifier Any Java {@code String}
781     * 
782     * @return {@code TRUE} if-and-only-if parameter {@code 'identifier'} is a valid Java
783     * Identifier.
784     */
785    public static boolean isValidJavaIdentifier(String identifier)
786    {
787        // Check if the string is not null or empty
788        if (identifier == null || identifier.isEmpty()) return false;
789
790        // Check if the first character is a letter, underscore, or dollar sign
791        if (! Character.isJavaIdentifierStart(identifier.charAt(0))) return false;
792
793        // Check the remaining characters
794        for (int i = 1; i < identifier.length(); i++)
795            if (!Character.isJavaIdentifierPart(identifier.charAt(i)))
796                return false;
797
798        // Check if the identifier is a reserved keyword
799        if (reservedKeywords.contains(identifier)) return false;
800
801        // The string is a valid Java identifier
802        return true;
803    }
804
805
806    // ********************************************************************************************
807    // ********************************************************************************************
808    // Replace Special-Character
809    // ********************************************************************************************
810    // ********************************************************************************************
811
812
813    /**
814     * There are actually people out there who are willing to put character {@code '160'} into
815     * a file or document, instead of a simple {@code '&nbsp;'} element.  How rude.
816     * Any instances of this character shall be replaced with the standard space character
817     * {@code ASCII #32}.
818     * 
819     * @param s Any {@code String} will pass.  Generally {@code String's} that were converted from
820     * HTML pages will contain {@code char #160} as it is occasionally translated from the HTML
821     * escape sequence {@code &nbsp;}
822     * 
823     * @return A String where any instance of white-space character {@code #160} have been
824     * replaced with character {@code #32}
825     */
826    public static String replaceNBSP(String s)
827    { return s.replace(("" + ((char) 160)), " "); }
828
829    /**
830     * Even lower than {@code #160}, apparently is the {@code "Zero Width Space"} (character 
831     * {@code #8203}.  This is actually inserted by the <B>JavaDoc Tool</B> (by
832     * {@code Sun / Oracle}) into JavaDoc generated HTML Pages.  Here, it shall be replaced by
833     * character {@code #32} - the <I>space-character</I>.
834     * 
835     * <BR /><BR /><B>A.K.A.:</B> <CODE>&quot;\u200B&quot;</CODE>.
836     * 
837     * <BR /><BR /><B><I STYLE='color: red;'>Can you see the character, above?</I></B>  No?
838     * That's zero width space for you!  If you ever sitting and wondering why a {@code String}
839     * seems to be something else than what it looks like - you might have a zero-width 
840     * space in your {@code String}.  If so, it will take a while to find the bug.
841     * 
842     * @param s Any {@code String} will pass.  Generally {@code String's} that were converted from
843     * JavaDoc HTML pages will contain {@code char #8203}.
844     * 
845     * @return A String where any instance of white-space character {@code #8203} have been
846     * replaced with character {@code #32}
847     */
848    public static String replaceZWSP(String s)
849    { return s.replace(("" + ((char) 8203)), " "); }
850
851
852    // ********************************************************************************************
853    // ********************************************************************************************
854    // CSS Source
855    // ********************************************************************************************
856    // ********************************************************************************************
857
858
859    /**
860     * Checks if a Java-{@code String} constitutes a valid CSS Property-Name.  Note that this
861     * method, in no way consults any "complete list" of all known CSS-Properties.  Instead, it 
862     * simply analyzes whether the name is conguent with the CSS-Property Validator Reg-ex.
863     * 
864     * @param cssPropertyName Any Java-{@code String}
865     * 
866     * @return {@code TRUE} if and ony if {@code 'attributeName'} is a valid HTML Atribute-Name,
867     * according to the agreed upon CSS-Property Regular-Expression Validator.
868     */
869    public static boolean isCSSPropertyName(String cssPropertyName)
870    {
871        if (cssPropertyName.length() == 0) return false;
872
873        if (! isCSSPropertyNameStart(cssPropertyName.charAt(0))) return false;
874
875        for (int i=1; i < cssPropertyName.length(); i++)
876        {
877            final char c = cssPropertyName.charAt(i);
878            if ((c >= 'A') && (c <= 'Z')) continue;
879            if ((c >= 'a') && (c <= 'z')) continue;
880            if ((c >= '0') && (c <= '9')) continue;
881            if ((c == '-') || (c == '_')) continue;
882            return false;
883        }
884
885        return true;
886    }
887
888    /**
889     * Checks whether parameter {@code 'c'} is one of the agreed-upon standard characters that are
890     * allowed to begin CSS Property-Names.
891     * 
892     * @param c Any Java {@code char}-primitive
893     * 
894     * @return {@code TRUE} if and ony if {@code 'c'} is a character that would be allowed to begin
895     * a CSS Property-Name
896     */
897    public static boolean isCSSPropertyNameStart(char c)
898    {
899        if ((c >= 'A') && (c <= 'Z')) return true;
900        if ((c >= 'a') && (c <= 'z')) return true;
901        if ((c == '-') || (c == '_')) return true;
902        return false;
903    }
904
905    /**
906     * Checks whether parameter {@code 'c'} is one of the agreed-upon standard characters that are
907     * permitted within CSS Property-Names, after the first character of the name.
908     * 
909     * @param c Any Java {@code char}-primitive
910     * 
911     * @return {@code TRUE} if and ony if {@code 'c'} is a character that would be allowed within a
912     * valid CSS Property-Name.
913     */
914    public static boolean isCSSPropertyNamePart(char c)
915    {
916        if ((c >= 'A') && (c <= 'Z')) return true;
917        if ((c >= 'a') && (c <= 'z')) return true;
918        if ((c >= '0') && (c <= '9')) return true;
919        if ((c == '-') || (c == '_')) return true;
920        return false;
921    }
922
923
924    // ********************************************************************************************
925    // ********************************************************************************************
926    // More HTML Source
927    // ********************************************************************************************
928    // ********************************************************************************************
929
930
931    /**
932     * Checks if a Java-{@code String} constitutes a valid HTML Attibute-Name.  Note that this
933     * method, in no way consults any "complete list" of all know HTML-Attributes.  Instead, it 
934     * simply analyzes whether the name is conguent with the Attribute-Name Validator Reg-ex.
935     * 
936     * @param attributeName Any Java-{@code String}
937     * 
938     * @return {@code TRUE} if and ony if {@code 'attributeName'} is a valid HTML Atribute-Name,
939     * according to the agreed upon Attribute-Name Regular-Expression Validator.
940     */
941    public static boolean isAttributeName(String attributeName)
942    {
943        if (attributeName.length() == 0) return false;
944
945        if (! isAttributeNameStart(attributeName.charAt(0))) return false;
946
947        for (int i=1; i < attributeName.length(); i++)
948        {
949            final char c = attributeName.charAt(i);
950            if ((c >= 'A') && (c <= 'Z')) continue;
951            if ((c >= 'a') && (c <= 'z')) continue;
952            if ((c >= '0') && (c <= '9')) continue;
953            if ((c == '-') || (c == '_')) continue;
954            return false;
955        }
956
957        return true;
958    }
959
960    /**
961     * Checks whether parameter {@code 'c'} is one of the agreed-upon standard characters that are
962     * allowed to begin HTML Attribute-Names.
963     * 
964     * @param c Any Java {@code char}-primitive
965     * 
966     * @return {@code TRUE} if and ony if {@code 'c'} is a character that would be allowed to begin
967     * an HTML Attribute-Name
968     */
969    public static boolean isAttributeNameStart(char c)
970    {
971        if ((c >= 'A') && (c <= 'Z')) return true;
972        if ((c >= 'a') && (c <= 'z')) return true;
973        return false;
974    }
975
976    /**
977     * Checks whether parameter {@code 'c'} is one of the agreed-upon standard characters that are
978     * permitted within HTML Attribute-Names, after the first character of the name.
979     * 
980     * @param c Any Java {@code char}-primitive
981     * 
982     * @return {@code TRUE} if and ony if {@code 'c'} is a character that would be allowed within a
983     * valid HTML Attribute-Name.
984     */
985    public static boolean isAttributeNamePart(char c)
986    {
987        if ((c >= 'A') && (c <= 'Z')) return true;
988        if ((c >= 'a') && (c <= 'z')) return true;
989        if ((c >= '0') && (c <= '9')) return true;
990        if ((c == '-') || (c == '_')) return true;
991        return false;
992    }
993
994}