001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006 007import Torello.Java.*; 008 009 010/** 011 * Easy utilities for escaping and un-escaping HTML characters such as {@code }, and even 012 * code-point based Emoji's. 013 * 014 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE> 015 */ 016@Torello.JavaDoc.StaticFunctional 017public final class Escape 018{ 019 private Escape() { } 020 021 022 // ******************************************************************************************** 023 // ******************************************************************************************** 024 // Internal Fields, used by this class only 025 // ******************************************************************************************** 026 // ******************************************************************************************** 027 028 029 /** 030 * Regular Expression for characters represented in HTML as 031 * <CODE>&#x[Hexadecimal-Code];</CODE> 032 */ 033 private static final Pattern HEX_CODE = Pattern.compile("&#x([A-F,a-f,\\d]{1,8});"); 034 035 /** 036 * Regular Expression for characters represented in HTML as <CODE>&#[Decimal-Code];</CODE> 037 */ 038 private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});"); 039 040 /** 041 * Regular Expression (approximate, not exact) for hard-coded escape sequences such as 042 * <CODE>"&amp;"</CODE> 043 * 044 * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence 045 * up in the hash table. This means, of course, that not everything which matches this Regular 046 * Expression Pattern is actually an escaped HTML ASCII/UniCode character. 047 * 048 * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B> 049 * 050 * <BR /><CODE>&NotACode;</CODE> will match this Regular-Expression, but it is not an 051 * actual HTML Escape-sequence. For that, one needs to consult the internal 052 * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves. 053 * 054 * @see #htmlEscChars 055 * @see #htmlEscSeq 056 */ 057 private static final Pattern TEXT_CODE = Pattern.compile("&[A-Z,a-z,0-9]{1,8};"); 058 059 @SuppressWarnings("rawtypes") 060 private static final Vector data = LFEC.readObjectFromFile_JAR 061 (Escape.class, "data-files/Escape.htdat", true, Vector.class); 062 063 /** 064 * This {@code Hashtable} contains all of the HTML escape characters which are represented by 065 * a short Text-{@code String}. The file listed above contains that list. 066 * 067 * @see HTML_ESC_CHARS 068 */ 069 @SuppressWarnings("unchecked") 070 private static final Hashtable<String, Character> htmlEscChars = 071 (Hashtable<String, Character>) data.elementAt(0); 072 073 /** 074 * This {@code Hashtable} is the reverse of the previous table. It allows a user to look up 075 * the escape sequence, given a particular ASCII {@code char}. 076 * 077 * @see HTML_ESC_CHARS 078 * @see #htmlEscChars 079 */ 080 @SuppressWarnings("unchecked") 081 private static final Hashtable<Character, String> htmlEscSeq = 082 (Hashtable<Character, String>) data.elementAt(1); 083 084 085 // ******************************************************************************************** 086 // ******************************************************************************************** 087 // Some debug, and "View Data" methods 088 // ******************************************************************************************** 089 // ******************************************************************************************** 090 091 092 /** 093 * Print's the HTML Escape Character lookup table to {@code System.out}. 094 * This is useful for debugging. 095 * 096 * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B> 097 * 098 * <BR />The JAR Data-File List included within the page attached (below) is a complete list of 099 * all <B><CODE>text-String</B> HTML Escape Sequences </CODE> that are known to this class. 100 * This list, does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE> 101 * sequences. 102 * 103 * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html"> 104 * All HTML Escape Sequences</A></CODE></B> 105 */ 106 public static void printHTMLEsc() 107 { 108 Enumeration<String> e = htmlEscChars.keys(); 109 110 while (e.hasMoreElements()) 111 { 112 String tag = e.nextElement(); 113 System.out.println("&" + tag + "; ==> " + htmlEscChars.get(tag)); 114 } 115 } 116 117 118 // ******************************************************************************************** 119 // ******************************************************************************************** 120 // Main Part of the class 121 // ******************************************************************************************** 122 // ******************************************************************************************** 123 124 125 /** 126 * Converts a single {@code String} from an HTML-escape sequence into the appropriate 127 * character. 128 * 129 * <BR /><BR /> 130 * <CODE>&[escape-sequence];</CODE> ==> actual ASCII or UniCode character. 131 * 132 * @param escHTML An HTML escape sequence. 133 * 134 * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence. 135 * 136 * <BR /><BR />This method will return {@code '0'} if the input it does not represent a valid 137 * HTML Escape sequence. 138 */ 139 public static char escHTMLToChar(String escHTML) 140 { 141 if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0; 142 143 String s = escHTML.substring(1, escHTML.length() - 1); 144 145 // Temporary Variable. 146 int i = 0; 147 148 // Since the EMOJI Escape Sequences use Code Point, they cannot, generally be 149 // converted into a single Character. Skip them. 150 151 if (HEX_CODE.matcher(s).find()) 152 { 153 if ((i = Integer.parseInt(s.substring(2), 16)) < Character.MAX_VALUE) 154 return (char) i; 155 else 156 return 0; 157 } 158 159 // Again, deal with Emoji's here... Parse the integer, and make sure it is a 160 // character in the standard UNICODE range. 161 162 if (DEC_CODE.matcher(s).find()) 163 { 164 if ((i = Integer.parseInt(s.substring(1))) < Character.MAX_VALUE) 165 return (char) i; 166 else 167 return 0; 168 } 169 170 // Now check if the provided Escape String is listed in the htmlEscChars Hashtable. 171 Character c = htmlEscChars.get(s); 172 173 // If the character was found in the table that lists all escape sequence characters, 174 // then return it. Otherwise just return ASCII zero. 175 176 return (c != null) ? c.charValue() : 0; 177 } 178 179 /** 180 * Will generate a {@code String} whereby any & all <B STYLE='color: red;'><I>Hexadecimal 181 * Escape Sequences</I></B> have been removed and subsequently replaced with their actual 182 * ASCII/UniCode un-escaped characters! 183 * 184 * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B> 185 * 186 * <BR /><TABLE CLASS=JDBriefTable> 187 * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR> 188 * <TR><TD><CODE>&#xAA;</CODE></TD><TD><CODE>'ª'</CODE> within a browser</TD></TR> 189 * <TR><TD><CODE>&#x67;</CODE></TD><TD><CODE>'g'</CODE> within a browser</TD></TR> 190 * <TR><TD><CODE>&#x84;</CODE></TD><TD><CODE>'„'</CODE> within a browser</TD></TR> 191 * </TABLE> 192 * 193 * <BR />This method might be thought of as similar to the older C/C++ {@code 'Ord()'} 194 * function, except it is for HTML. 195 * 196 * @param str any {@code String} that contains an HTML Escape Sequence 197 * &#x[HEXADECIMAL VALUE]; 198 * 199 * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced 200 * with their equivalent ASCII or UniCode Characters. 201 * 202 * @see #replaceAll_DEC(String str) 203 * @see StrReplace#r(String, String[], char[]) 204 */ 205 public static String replaceAll_HEX(String str) 206 { 207 // This is the RegEx Matcher from the top. It matches string's that look like: &#x\d+; 208 Matcher m = HEX_CODE.matcher(str); 209 210 // Save the escape-string regex search matches in a TreeMap. We need to use a 211 // TreeMap because it is much easier to check if a particular escape sequence has already 212 // been found. It is easier to find duplicates with TreeMap's. 213 214 TreeMap<String, Character> escMap = new TreeMap<>(); 215 216 while (m.find()) 217 { 218 // Use Base-16 Integer-Parse 219 int i = Integer.valueOf(m.group(1), 16); 220 221 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 222 // not single characters. 223 224 if (i > Character.MAX_VALUE) continue; 225 226 // Retrieve the Text Information about the HTML Escape Sequence 227 String text = m.group(); 228 229 // Check if it is a valid HTML 5 Escape Sequence. 230 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 231 } 232 233 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 234 // the VALUE's of the TreeMap<String, Character> which was just built. 235 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 236 // duplicates when the replacement occurs. 237 238 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 239 char[] replaceChars = new char[escMap.size()]; 240 241 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 242 // array. The class StrReplace will replace all the escape squences with the actual 243 // characters. 244 245 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 246 247 return StrReplace.r(str, matchStrs, replaceChars); 248 } 249 250 /** 251 * This method functions the same as {@code replaceAll_HEX(String)} - except it replaces only 252 * HTML Escape sequences that are represented using decimal (base-10) values. 253 * {@code 'replaceAll_HEX(...)'} works on hexadecimal (base-16) values. 254 * 255 * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B> 256 * 257 * <BR /><TABLE CLASS=JDBriefTable> 258 * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR> 259 * <TR><TD><CODE>&#48;</CODE></TD><TD><CODE>'0'</CODE> in your browser</TD></TR> 260 * <TR><TD><CODE>&#64;</CODE></TD><TD><CODE>'@'</CODE> in your browser</TD></TR> 261 * <TR><TD><CODE>&#123;</CODE></TD><TD><CODE>'{'</CODE> in your browser</TD></TR> 262 * <TR><TD><CODE>&#125;</CODE></TD><TD><CODE>'}'</CODE> in your browser</TD></TR> 263 * </TABLE> 264 * 265 * <BR /><B CLASS=JDDescLabel>Base-10 & Base-16 Escape-Sequence Difference:</B> 266 * 267 * <BR /><UL CLASS=JDUL> 268 * <LI> <CODE>&#x[hex base-16 value];</CODE> There is an {@code 'x'} as the third character 269 * in the {@code String} 270 * </LI> 271 * <LI> <CODE>&#[decimal base-10 value];</CODE> There is no {@code 'x'} in the 272 * escape-sequence {@code String!} 273 * </LI> 274 * </UL> 275 * 276 * <BR />This short example delineates the difference between an HTML escape-sequence that 277 * employs {@code Base-10} numbers, and one using {@code Base-16} (Hexadecimal) numbers. 278 * 279 * @param str any {@code String} that contains the HTML Escape Sequence 280 * <CODE>&#[DECIMAL VALUE];</CODE>. 281 * 282 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 283 * ASCII UniCode Characters. 284 * 285 * <BR /><BR />If this parameter does not contain such a sequence, then this method will return 286 * the same input-{@code String} reference as its return value. 287 * 288 * @see #replaceAll_HEX(String str) 289 * @see StrReplace#r(String, String[], char[]) 290 */ 291 public static String replaceAll_DEC(String str) 292 { 293 // This is the RegEx Matcher from the top. It matches string's that look like: &#\d+; 294 Matcher m = DEC_CODE.matcher(str); 295 296 // Save the escape-string regex search matches in a TreeMap. We need to use a 297 // TreeMap because it is much easier to check if a particular escape sequence has already 298 // been found. It is easier to find duplicates with TreeMap's. 299 300 TreeMap<String, Character> escMap = new TreeMap<>(); 301 302 while (m.find()) 303 { 304 // Use Base-10 Integer-Parse 305 int i = Integer.valueOf(m.group(1)); 306 307 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 308 // not single characters. 309 310 if (i > Character.MAX_VALUE) continue; 311 312 // Retrieve the Text Information about the HTML Escape Sequence 313 String text = m.group(); 314 315 // Check if it is a valid HTML 5 Escape Sequence. 316 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 317 } 318 319 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 320 // the VALUE's of the TreeMap<String, Character> which was just built. 321 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 322 // duplicates when the replacement occurs. 323 324 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 325 char[] replaceChars = new char[escMap.size()]; 326 327 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 328 // array. The class StrReplace will replace all the escape sequences with the actual 329 // characters. 330 331 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 332 333 return StrReplace.r(str, matchStrs, replaceChars); 334 } 335 336 /** 337 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT> 338 * 339 * @param str any {@code String} that contains HTML Escape Sequences that need to be converted 340 * to their ASCII-UniCode character representations. 341 * 342 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 343 * ASCII UniCode Characters. 344 * 345 * @see #replaceAll_HEX(String str) 346 * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc) 347 * 348 * @throws IllegalStateException 349 */ 350 public static String replaceAll_TEXT(String str) 351 { 352 // We only need to find which escape sequences are in this string. 353 // use a TreeSet<String> to list them. It will 354 355 Matcher m = TEXT_CODE.matcher(str); 356 TreeMap<String, String> escMap = new TreeMap<>(); 357 358 while (m.find()) 359 { 360 // Retrieve the Text Information about the HTML Escape Sequence 361 String text = m.group(); 362 String sequence = text.substring(1, text.length() - 1); 363 364 // Check if it is a valid HTML 5 Escape Sequence. 365 if ((! escMap.containsKey(text)) && htmlEscChars.containsKey(sequence)) 366 escMap.put(text, sequence); 367 } 368 369 // Convert the TreeSet to a String[] array... and use StrReplace 370 String[] escArr = new String[escMap.size()]; 371 372 return StrReplace.r( 373 str, false, escMap.keySet().toArray(escArr), 374 (int i, String sequence) -> htmlEscChars.get(escMap.get(sequence)) 375 ); 376 } 377 378 /** 379 * Calls all of the HTML Escape Sequence convert/replace {@code String} functions at once. 380 * 381 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 382 * sequences. 383 * 384 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 385 * with their natural character representations. 386 * 387 * @see #replaceAll_DEC(String) 388 * @see #replaceAll_HEX(String) 389 * @see #replaceAll_TEXT(String) 390 */ 391 @Deprecated 392 public static String replaceAll(String s) 393 { return replaceAll_HEX(replaceAll_DEC(replaceAll_TEXT(s))); } 394 395 /** 396 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE> 397 * 398 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 399 * sequences. 400 * 401 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 402 * with their natural character representations. 403 */ 404 public static String replace(String s) 405 { 406 // The primary optimization is to do this the "C" way (As in The C Programming Language) 407 // The String to Escape is converted to a character array, and the characters are shifted 408 // as the Escape Sequences are replaced. This is all done "in place" without creating 409 // new substring's in memory. 410 411 char[] c = s.toCharArray(); 412 413 // These two pointers are kept as the "Source Character" - as in the next character to 414 // "Read" ... and the "Destination Character" - as in the next location to write. 415 416 int sourcePos = 0; 417 int destPos = 0; 418 419 while (sourcePos < c.length) 420 421 // All Escape Sequences begin with the Ampersand Symbol. If the next character 422 // does not begin with the Ampersand, we should skip and move on. Copy the next source 423 // character to the next destination location, and continue the loop. 424 425 if (c[sourcePos] != '&') 426 { c[destPos++]=c[sourcePos++]; continue; } 427 428 // Here, an Ampersand has been found. Now check if the character immediately 429 // following the Ampersand is a Pound Sign. If it is a Pound Sign, that implies 430 // this escape sequence is simply going to be a number. 431 432 else if ((sourcePos < (c.length-1)) && (c[sourcePos + 1] == '#')) 433 { 434 int evaluatingPos = sourcePos + 1; 435 boolean isHex = false; 436 437 // If the Character after the Pound Sign is an 'X', it means that the number 438 // that has been escaped is a Base 16 (Hexadecimal) number. 439 // IMPORTANT: Check to see that the Ampersand wasn't the last char in the String 440 441 if (evaluatingPos + 1 < c.length) 442 if (c[evaluatingPos + 1] == 'x') 443 { isHex = true; evaluatingPos++; } 444 445 // Keep skipping the numbers, until a non-digit character is identified. 446 while ((++evaluatingPos < c.length) && Character.isDigit(c[evaluatingPos])); 447 448 // If the character immediately after the last digit isn't a ';' (Semicolon), 449 // then this entire thing is NOT an escaped HTML character. In this case, make 450 // sure to copy the next source-character to the next destination location in the 451 // char[] array... Then continue the loop to the next 'char' (after Ampersand) 452 453 if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';')) 454 { c[destPos++]=c[sourcePos++]; continue; } 455 456 int escapedChar; 457 458 try 459 { 460 // Make sure to convert 16-bit numbers using the 16-bit radix using the 461 // standard java parse integer way. 462 463 escapedChar = isHex 464 ? Integer.parseInt(s.substring(sourcePos + 3, evaluatingPos), 16) 465 : Integer.parseInt(s.substring(sourcePos + 2, evaluatingPos)); 466 } 467 468 // If for whatever reason java was unable to parse the digits in the escape 469 // sequence, then copy the next source-character to the next destination-location 470 // and move on in the loop. 471 472 catch (NumberFormatException e) 473 { c[destPos++]=c[sourcePos++]; continue; } 474 475 // If the character was an Emoji, then it would be a number greater than 476 // 2^16. Emoji's use Code Points - which are multiple characters used up 477 // together. Their escape sequences are always characters larger than 65,535. 478 // If so, just copy the next source-character to the next destination location, and 479 // move on in the loop. 480 481 if (escapedChar > Character.MAX_VALUE) 482 { c[destPos++]=c[sourcePos++]; continue; } 483 484 // Replace the next "Destination Location" with the (un) escaped char. 485 c[destPos++] = (char) escapedChar; 486 487 // Skip the entire HTML Escape Sequence by skipping to the location after the 488 // position where the "evaluation" (all this processing) was occurring. This 489 // just happens to be the next-character immediately after the semi-colon 490 491 sourcePos = evaluatingPos + 1; // will be pointing at the ';' (semicolon) 492 } 493 494 // An Ampersand was just found, but it was not followed by a '#' (Pound Sign). This 495 // means that it is not a "numbered" (to invent a term) HTML Escape Sequence. Instead 496 // we shall check if there is a valid Escape-String (before the next semi-colon) that 497 // can be identified in the Hashtable 'htmlEscChars' 498 499 else if (sourcePos < (c.length - 1)) 500 { 501 // We need to create a 'temp variable' and it will be called "evaluating position" 502 int evaluatingPos = sourcePos; 503 504 // All text (non "Numbered") HTML Escape String's are comprised of letter or digits 505 while ((++evaluatingPos < c.length) && Character.isLetterOrDigit(c[evaluatingPos])); 506 507 // If the character immediately after the last letter or digit is not a semi-colon, 508 // then there is no way this is an HTML Escape Sequence. Copy the next source to 509 // the next destination location, and continue with the loop. 510 511 if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';')) 512 { c[destPos++]=c[sourcePos++]; continue; } 513 514 // Get the replacement character from the lookup table. 515 Character replacement = htmlEscChars.get(s.substring(sourcePos + 1, evaluatingPos)); 516 517 // The lookup table will return null if there this was not a valid escape sequence. 518 // If this was not a valid sequence, just copy the next character from the source 519 // location, and move on in the loop. 520 521 if (replacement == null) 522 { c[destPos++]=c[sourcePos++]; continue; } 523 524 c[destPos++] = replacement; 525 sourcePos = evaluatingPos + 1; 526 } 527 528 else 529 { c[destPos++]=c[sourcePos++]; continue; } 530 531 return new String(c, 0, destPos); 532 } 533 534 /** 535 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR> 536 * 537 * @param c Any Java Character. Note that the Java <B>Primitive Type</B> {@code 'char'} 538 * is a 16-bit type. This parameter equates to the <B>UNICODE</B> Characters 539 * {@code 0x0000} up to {@code 0xFFFF}. 540 * 541 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 542 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 543 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 544 * digits, then pass {@code FALSE} to this parameter. 545 * 546 * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape 547 * Sequence. For instance if the character <CODE>'ᡃ'</CODE>, which is the Chinese 548 * Character for <I>I, Me, Myself</I> were passed to this method, then the {@code String} 549 * {@code "我"} would be returned. 550 * 551 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 552 * then this method would, instead, return the {@code String "我"}. 553 */ 554 public static String escChar(char c, boolean use16BitEscapeSequence) 555 { 556 return use16BitEscapeSequence 557 ? "&#" + ((int) c) + ";" 558 : "&#x" + Integer.toHexString((int) c).toUpperCase() + ";"; 559 } 560 561 /** 562 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT> 563 * 564 * @param codePoint This will take any integer. It will be interpreted as a {@code UNICODE} 565 * {@code code point}. 566 * 567 * <BR /><BR /><B STYLE="color:red;">NOTE:</B> Java uses <B>16-bit</B> values for it's 568 * primitive {@code 'char'} type. This is also the "first plane" of the <B>UNICODE Space</B> 569 * and actually referred to as the <B>Basic Multi Lingual Plane</B>. Any value passed to this 570 * method that is lower than {@code 65,535} would receive the same escape-{@code String} that 571 * it would from a call to the method {@link #escChar(char, boolean)}. 572 * 573 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 574 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 575 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 576 * digits, then pass {@code FALSE} to this parameter. 577 * 578 * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 579 * {@code java.lang.String}. For instance if the {@code code point} for "the snowman" glyph 580 * (character ☃), which happens to be represented by a {@code code point} that is below 581 * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) - this 582 * method would return the {@code String "☃"}. 583 * 584 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 585 * then this method would, instead, return the {@code String "☃"}. 586 * 587 * @throws IllegalArgumentException Java has a method for determining whether any integer is a 588 * valid {@code code point}. Not all of the integers "fit" into the 17 Unicode "planes". 589 * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,535} 590 * (or {@code 2^16}) characters. 591 */ 592 public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence) 593 { 594 if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException( 595 "The integer you have passed to this method [" + codePoint + "] was deemed an " + 596 "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)]. " + 597 "Therefore this method is unable to provide an HTML Escape Sequence." 598 ); 599 600 return use16BitEscapeSequence 601 ? "&#" + codePoint + ";" 602 : "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";"; 603 } 604 605 /** 606 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML> 607 * 608 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 609 * 610 * @return {@code TRUE} if there is a {@code String} escape sequence for this character, and 611 * {@code FALSE} otherwise. 612 * 613 * @see #htmlEsc(char) 614 */ 615 public static boolean hasHTMLEsc(char c) 616 { return htmlEscSeq.get(Character.valueOf(c)) != null; } 617 618 /** 619 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC> 620 * 621 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 622 * 623 * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code 624 * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>. 625 * If the character provided does not have an associated {@code HTML Escape String}, then 626 * 'null' is returned. 627 * 628 * <BR /><BR /><B>NOTE:</B> The entire escape-{@code String} is not provided, just the 629 * inner-characters. The leading {@code '&'} (Ampersand) and the trailing {@code ';'} 630 * (Semi-Colon) are not appended to the returned {@code String}. 631 * 632 * @see #hasHTMLEsc(char) 633 */ 634 public static String htmlEsc(char c) 635 { return htmlEscSeq.get(Character.valueOf(c)); } 636}