001package Torello.Java.Additional; 002 003import java.util.*; 004import java.net.*; 005import java.util.regex.*; 006import java.io.*; 007 008import static Torello.Java.C.*; 009 010import Torello.Java.StorageWriter; 011import Torello.Java.StrReplace; 012 013/** 014 * A class that plays-with URL's, no more, no less. 015 * 016 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS> 017 */ 018@Torello.JavaDoc.StaticFunctional 019public class URLs 020{ 021 private URLs() { } 022 023 /** 024 * This is a Regular-Expression Pattern {@code (java.util.regex.Pattern)} - saved as a 025 * {@code String}. It is subsequently compiled. 026 * 027 * <BR /><BR />The primary function is to match {@code String's} that are intended to match 028 * HTTP-{@code URL's}. This Regular Expression matches: 029 * 030 * <BR /><BR /><UL CLASS=JDUL> 031 * <LI>{@code http(s)://...<any-text>.../}</LI> 032 * <LI>{@code http(s)://...<any-text, not front-slash>...}</LI> 033 * <LI>{@code http(s)://...<any-text>.../...<any-text, not front-slash>...}</LI> 034 * </UL> 035 * 036 * <BR /><BR /><B CLASS=JDDescLabel>Primarily used in:</B> 037 * 038 * <BR /><UL CLASS=JDUL> 039 * <LI>{@link #toProperURLV3(String)}</LI> 040 * <LI>{@link #toProperURLV4(String)}</LI> 041 * </UL> 042 * 043 * @see #P1 044 */ 045 protected static final String RE1 = 046 "^(http[s]?:\\/\\/.*?\\/$|http[s]?:\\/\\/[^\\/]*$|http[s]?:\\/\\/.*?\\/[^\\/]+)"; 047 048 /** 049 * {@code P1 = Pattern.compile(RE1);} 050 * 051 * @see #RE1 052 */ 053 protected static final Pattern P1 = Pattern.compile(RE1); 054 055 /** 056 * Java Help Messag Explaining {@code class java.net.URL} - and the specific output of its 057 * methods. 058 * 059 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_HELP_MSG> 060 * 061 * @param sw An instance of class StorageWriter. This parameter may be null, and if it is 062 * text-output will be sent to Standard-Output. 063 */ 064 protected static final void javaURLHelpMessage(StorageWriter sw) 065 { 066 if (sw == null) sw = new StorageWriter(); 067 068 String[] urlStrArr = { 069 "https://DALLASCITYHALL.com", "https://dallascityhall.com/", 070 "https://dallascityhall.com/news", 071 "https://dallascityhall.com/news/", "http://DALLASCITYHALL.com/news/ARTICLE-1.html", 072 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue", 073 "https://DallasCityHall.com/news/ARTICLE-1.html#subpart1", 074 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue", 075 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue#LocalRef" 076 }; 077 078 URL[] urlArr = new URL[urlStrArr.length]; 079 080 try 081 { for (int i=0; i < urlStrArr.length; i++) urlArr[i] = new URL(urlStrArr[i]); } 082 083 catch (Exception e) 084 { 085 sw.println( 086 "Broke a URL, and it generated an exception.\n" + 087 "Sorry, fix the URL's in this method.\n" + 088 "Did you change them?" 089 ); 090 091 e.printStackTrace(); 092 return; 093 } 094 095 for (URL u : urlArr) 096 { 097 System.out.println( 098 "u.toString():\t\t" + BCYAN + u.toString() + RESET + '\n' + 099 "u.getProtocol():\t" + u.getProtocol() + '\n' + 100 "u.getHost():\t\t" + u.getHost() + '\n' + 101 "u.getPath():\t\t" + u.getPath() + '\n' + 102 "u.getFile():\t\t" + u.getFile() + '\n' + 103 "u.getQuery():\t\t" + u.getQuery() + '\n' + 104 "u.getRef():\t\t" + u.getRef() + '\n' + 105 "u.getAuthority():\t" + u.getAuthority() + '\n' + 106 "u.getUserInfo():\t" + u.getUserInfo() + '\n' + 107 "urlToString(u):\t\t" + urlToString(u) 108 ); 109 } 110 } 111 112 113 // ******************************************************************************************** 114 // ******************************************************************************************** 115 // Helper function for making URL address readable by web-servers. 116 //********************************************************************************************* 117 // ******************************************************************************************** 118 119 120 /** 121 * When scraping Spanish {@code URL's}, these characters can / should be escaped. 122 * 123 * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B> 124 * 125 * <BR />This array shall be considered parallel to the <B><I>replacement</I></B> 126 * {@code String[]}-Array {@link #VOWELS_URL}. 127 * 128 * @see #toProperURLV1(String) 129 * @see #VOWELS_URL 130 */ 131 protected static final char[] VOWELS = { 132 'á', 'É', 'é', 'Í', 'í', 'Ó', 'ó', 'Ú', 'ú', 'Ü', 'ü', 133 'Ñ', 'ñ', 'Ý', 'ý', '¿', '¡' 134 }; 135 136 /** 137 * When scraping Spanish {@code URL's}, these {@code String's} are the 138 * <B>URL Escape Sequences</B> for the Spanish Vowel Characters listed in {@link #VOWELS}. 139 * 140 * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B> 141 * 142 * <BR />This array shall be considered parallel to {@code String[]}-Array {@link #VOWELS}. 143 * 144 * @see #toProperURLV1(String) 145 * @see #VOWELS 146 */ 147 protected static final String[] VOWELS_URL = 148 { 149 "%C3%A1", "%C3%89", "%C3%A9", "%C3%8D", "%C3%AD", "%C3%93", "%C3%B3", "%C3%9A", 150 "%C3%BA", "%C3%9C", "%C3%BC", "%C3%91", "%C3%B1", "%C3%9D", "%C3%BD", "%C2%BF", 151 "%C2%A1" 152 }; 153 154 /** 155 * This will substitute many of the Spanish-characters that can make a web-query difficult. 156 * These are the substitutions listed: 157 * 158 * <BR /><BR /><TABLE CLASS=JDBriefTable> 159 * <TR><TH>Spanish Language Character</TH><TH>URL Escape Sequence</TH></TR> 160 * <TR><TD>{@code Á}</TD><TD>{@code %C3%81}</TD></TR> 161 * <TR><TD>{@code á}</TD><TD>{@code %C3%A1}</TD></TR> 162 * <TR><TD>{@code É}</TD><TD>{@code %C3%89}</TD></TR> 163 * <TR><TD>{@code é}</TD><TD>{@code %C3%A9}</TD></TR> 164 * <TR><TD>{@code Í}</TD><TD>{@code %C3%8D}</TD></TR> 165 * <TR><TD>{@code í}</TD><TD>{@code %C3%AD}</TD></TR> 166 * <TR><TD>{@code Ó}</TD><TD>{@code %C3%93}</TD></TR> 167 * <TR><TD>{@code ó}</TD><TD>{@code %C3%B3}</TD></TR> 168 * <TR><TD>{@code Ú}</TD><TD>{@code %C3%9A}</TD></TR> 169 * <TR><TD>{@code ú}</TD><TD>{@code %C3%BA}</TD></TR> 170 * <TR><TD>{@code Ü}</TD><TD>{@code %C3%9C}</TD></TR> 171 * <TR><TD>{@code ü}</TD><TD>{@code %C3%BC}</TD></TR> 172 * <TR><TD>{@code Ñ}</TD><TD>{@code %C3%91}</TD></TR> 173 * <TR><TD>{@code ñ}</TD><TD>{@code %C3%B1}</TD></TR> 174 * <TR><TD>{@code Ý}</TD><TD>{@code %C3%9D}</TD></TR> 175 * <TR><TD>{@code ý}</TD><TD>{@code %C3%BD}</TD></TR> 176 * </TABLE> 177 * 178 * <BR /><BR /><B CLASS=JDDescLabel>Historical Note:</B> 179 * 180 * <BR />This method was written the very first time that a {@code URL} needed to be escaped 181 * during the writing of the Java-HTML {@code '.jar'}. 182 * 183 * @param url Any website {@code URL} query. 184 * 185 * @return The same {@code URL} with substitutions made. 186 * 187 * @see #VOWELS 188 * @see #VOWELS_URL 189 * @see StrReplace#r(String, char[], String[]) 190 */ 191 public static String toProperURLV1(String url) 192 { return StrReplace.r(url, VOWELS, VOWELS_URL); } 193 194 /** 195 * This list of java {@code char's} are characters that are better off escaped when passing 196 * them through a {@code URL}. 197 * 198 * @see #toProperURLV2(String) 199 */ 200 protected static final char[] URL_ESC_CHARS = 201 { 202 '%', ' ', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\', 203 ']', '^', '{', '|', '}', '~', '\'', '+', ',' 204 }; 205 206 /** 207 * This method will clobber the leading Domain-Name and Protocol - 208 * {@code http://domain.name.something/} stuff. It is best to use this method on 209 * {@code String's} that will be inserted into a {@code URL} after the {@code '?'} 210 * question-mark, inside the Query-String. 211 * 212 * <BR /><BR />This can be very useful when sending JSON Arguments, for instance, inside a 213 * {@code URL's} Query-String, instead of the GET / POST part of a request. 214 * 215 * <BR /><BR />Note that this method should not be used to escape characters outside of the 216 * range of Standard-ASCII (characters {@code 0 ... 255}). 217 * 218 * <BR /><BR /><B CLASS=JDDescLabel>State of the Experiment:</B> 219 * 220 * <BR />It seems to help to escape these characters: 221 * 222 * <BR /><B STYLE="color:red;">{@code # $ % & @ ` / : ; < = > ? [ \ ] ^ | ~ " ' + ,} 223 * <CODE> { } </CODE></B> 224 * 225 * @param urlStuff Any information that is intended to be sent via an HTTP-{@code URL}, and 226 * needs to be escaped. 227 * 228 * @return An escaped version of this {@code URL-String} 229 * 230 * @see #URL_ESC_CHARS 231 * @see StrReplace#r(String, char[], IntCharFunction) 232 */ 233 public static String toProperURLV2(String urlStuff) 234 { 235 return StrReplace.r( 236 urlStuff, URL_ESC_CHARS, 237 (int i, char c) -> '%' + Integer.toHexString((int) c) 238 ); 239 } 240 241 /** 242 * This leaves out the actual domain name before starting HTTP-URL Escape Sequences. If this 243 * starts with the words "http://domain.something/" then the initial colon, forward-slash and 244 * periods won't be escaped. Everything after the first front-slash will include URL-HTTP 245 * Escape characters. 246 * 247 * <BR /><BR />This does the same thing as {@code toProperURLV2(String)}, but skips the initial 248 * part of the URL text/string - IF PRESENT! 249 * 250 * <BR /><BR />{@code http(s?)://domain.something/} is skipped by the Regular Expression, 251 * everything else from {@code URLV2} is escaped. 252 * 253 * @param url This may be any internet {@code URL}, represented as a {@code String}. It will 254 * be escaped with the {@code %INT} format. 255 * 256 * @return An escaped {@code URL String} 257 * 258 * @see #toProperURLV2(String) 259 * @see #P1 260 */ 261 public static String toProperURLV3(String url) 262 { 263 String beginsWith = null; 264 Matcher m = P1.matcher(url); 265 266 if (m.find()) 267 { 268 beginsWith = m.group(1); 269 url = url.substring(beginsWith.length()); 270 } 271 272 return ((beginsWith != null) ? beginsWith : "") + toProperURLV2(url); 273 } 274 275 /** 276 * This is a (shortened) list of characters that <I>should</I> be escaped before being used 277 * within a {@code URL}. 278 * 279 * <BR /><BR />This version differs from {@link #URL_ESC_CHARS} in that it does not include the 280 * {@code '&'} (ampersand), the {@code '?'} (question-mark) or the {@code '/'} (forward-slash). 281 * 282 * @see #URL_ESC_CHARS 283 * @see #toProperURLV4(String) 284 */ 285 protected static final char[] URL_ESC_CHARS_ABBREV = 286 { 287 '%', ' ', '#', '$', '@', '`', ':', ';', '<', '=', '>', '[', '\\', ']', 288 '^', '{', '|', '}', '~', '\'', '+', ',' 289 }; 290 291 /** 292 * This does the same thing as V3, but it also will avoid escaping any {@code '?'} 293 * (question-mark) or {@code '&'} (ampersand) or {@code '/'} (forward-slash) symbols anywhere 294 * in the entire {@code String}. It also "skips" escaping the initial 295 * {@code HTTP(s)://domain.net.something/} as well - just like {@code toProperURLV3} 296 * 297 * @return This does the same thing as {@code toProperURLV3(String)}, but leaves out 100% 298 * of the instances of Ampersand, Question-Mark, and Forward-Slash symbols. 299 * 300 * @see #toProperURLV3(String) 301 * @see #P1 302 * @see #URL_ESC_CHARS_ABBREV 303 * @see StrReplace#r(String, char[], IntCharFunction) 304 */ 305 public static String toProperURLV4(String url) 306 { 307 String beginsWith = null; 308 Matcher m = P1.matcher(url); 309 310 if (m.find()) 311 { 312 beginsWith = m.group(1); 313 url = url.substring(beginsWith.length()); 314 } 315 316 return ((beginsWith != null) ? beginsWith : "") + 317 StrReplace.r 318 (url, URL_ESC_CHARS_ABBREV, (int i, char c) -> '%' + Integer.toHexString((int) c)); 319 } 320 321 /** 322 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_PRP_URL_V5> 323 * 324 * @param url This is the URL to be encoded, properly 325 * 326 * @return A properly encoded URL String. Important, if calling the {@code java.net.URL} 327 * constructor generates a {@code MalformedURLException}, then this method shall return. 328 * The {@code java.net.URL} constructor will be called if the {@code String} passed begins with 329 * the characters {@code 'http://'} or {@code 'https://'}. 330 */ 331 public static String toProperURLV5(String url) 332 { 333 url = url.trim(); 334 335 URL u = null; 336 String[] sArr = null; 337 String tlc = url.toLowerCase(); 338 339 if (tlc.startsWith("http://") || tlc.startsWith("https://")) 340 { try { u = new URL(url); } catch (Exception e) { return null; } } 341 342 if (u == null) sArr = url.split("/"); 343 else sArr = u.getPath().split("/"); 344 345 String slash = ""; 346 StringBuilder sb = new StringBuilder(); 347 348 for (String s : sArr) 349 { 350 try 351 { sb.append(slash + java.net.URLEncoder.encode(s, "UTF-8")); } 352 353 catch (UnsupportedEncodingException e) 354 { /* This really cannot happen, and I don't know what to put here! */ } 355 356 slash = "/"; 357 } 358 359 if (u == null) 360 return sb.toString(); 361 else 362 return 363 u.getProtocol() + "://" + u.getHost() + sb.toString() + 364 ((u.getQuery() != null) ? ("?" + u.getQuery()) : "") + 365 ((u.getRef() != null) ? ("#" + u.getRef()) : ""); 366 } 367 368 /** 369 * Rather than trying to explain what is escaped and what is left alone, please review the 370 * exact code here. 371 * 372 * <BR /><BR /><B CLASS=JDDescLabel>Another One:</B> 373 * 374 * <BR />Well, I just wrote another one, they told me to. This, newest version of 375 * {@code URL}-Encoding is actually pretty successful. It handles all Extra-Characters and is 376 * capable of dealing with {@code URL's} that contain the {@code '?' '=' '&'} operators of 377 * {@code GET}-Requests. 378 * 379 * <BR /><BR />Realize that though the out-of-the-box JDK, there is a class called 380 * "URI Encoder" - but that class expects that the {@code URL} to have already been separated 381 * out into it's distinct parts. 382 * 383 * <BR /><BR />This method does the the {@code URL}-Separating into disparate parts 384 * before performing the Character-Escaping. 385 * 386 * @param url This is any java {@code URL}. 387 * 388 * @return a new {@code String} version of the input parameter {@code 'url'} 389 */ 390 public static String toProperURLV6(String url) 391 { 392 URL u = null; 393 394 try 395 { u = new URL(url); } 396 397 catch (Exception e) { return null; } 398 399 StringBuilder sb = new StringBuilder(); 400 401 sb.append(u.getProtocol()); 402 sb.append("://"); 403 sb.append(u.getHost()); 404 sb.append(toProperURLV5(u.getPath())); 405 406 if (u.getQuery() != null) 407 { 408 String[] sArr = u.getQuery().split("&"); 409 StringBuilder sb2 = new StringBuilder(); 410 String ampersand = ""; 411 412 for (String s : sArr) 413 { 414 String[] s2Arr = s.split("="); 415 StringBuilder sb3 = new StringBuilder(); 416 String equals = ""; 417 418 for (String s2: s2Arr) 419 { 420 try 421 { sb3.append(equals + java.net.URLEncoder.encode(s2, "UTF-8")); } 422 423 // This should never happen - UTF-8 is (sort-of) the only encoding. 424 catch (UnsupportedEncodingException e) { } 425 426 equals = "="; 427 } 428 429 sb2.append(ampersand + sb3.toString()); 430 ampersand = "&"; 431 } 432 433 sb.append("?" + sb2.toString()); 434 } 435 436 // Not really a clue, because a the "#" operator and the "?" probably shouldn't be used 437 // together. Java's java.net.URL class will parse a URL that has both the ? and the #, but 438 // I have no idea which Web-Sites would allow this, or encourage this... 439 440 if (u.getRef() != null) 441 442 try 443 { sb.append("#" + java.net.URLEncoder.encode(u.getRef(), "UTF-8")); } 444 445 catch (UnsupportedEncodingException e) { } 446 447 return sb.toString(); 448 } 449 450 /** 451 * These strictly use Java's URI Encoding Mechanism. They seem to work the same as "V6" 452 * Internally, these are now used. This as of November, 2019. 453 * 454 * @param url A Complete Java {@code URL}, as a {@code String}. Any specialized 455 * Escape-Characters that need to be escaped, will be. 456 * 457 * @throws URISyntaxException This will throw if building the {@code URI} generates an 458 * exception. Internally, all this method does is build a {@code URI}, and then call the Java 459 * Method {@code 'toASCIIString()'} 460 */ 461 public static String toProperURLV7(String url) throws URISyntaxException, MalformedURLException 462 { return toProperURLV8(new URL(url)); } 463 464 /** 465 * These strictly use Java's URI Encoding Mechanism. They seem to work the same as "V6" 466 * Internally, these are now used. This as of November, 2019. 467 * 468 * @param url A Complete Java {@code URL}. Any specialized Escape-Characters that need to be 469 * escaped, will be. 470 * 471 * @throws URISyntaxException This will throw if building the URI generates an exception. 472 * Internally, all this method does is build a URI, and then call the Java Method 473 * {@code 'toASCIIString()'} 474 */ 475 public static String toProperURLV8(URL url) throws URISyntaxException, MalformedURLException 476 { 477 return new URI( 478 url.getProtocol(), 479 url.getUserInfo(), 480 url.getHost(), 481 url.getPort(), 482 url.getPath(), 483 url.getQuery(), 484 url.getRef() 485 ).toASCIIString(); 486 } 487 488 489 // ******************************************************************************************** 490 // ******************************************************************************************** 491 // The original "URLs" class 492 //********************************************************************************************* 493 // ******************************************************************************************** 494 495 496 /** 497 * If you have a list of {@code URL's}, and want to quickly remove any 498 * duplicate-{@code URL's} found in the list - this will remove them. 499 * 500 * <BR /><BR /><B CLASS=JDDescLabel>Case Sensitivity:</B> 501 * 502 * <BR />This method will perform a few "to-lower-case" operations on the protocol and 503 * Web-Domain parts, but not on the file, directory, or Query-String portion of the 504 * {@code URL}. 505 * 506 * <BR /><BR />This should hilite what is Case-Sensitive, and what is not: 507 * 508 * <BR /><BR /><UL CLASS=JDUL> 509 * <LI> These are considered duplicate URL's: 510 * <BR /> 511 * <BR /><CODE>http://some.company.com/index.html</CODE> 512 * <BR /><CODE>HTTP://SOME.COMPANY.COM/index.html</CODE> 513 * <BR /><BR /> 514 * </LI> 515 * 516 * <LI> These are <I>not</I> considered duplicate URL's: 517 * <BR /> 518 * <BR /><CODE>http://other.company.com/Directory/Ben-Bitdiddle.html</CODE> 519 * <BR /><CODE>http://other.company.com/DIRECTORY/BE.html</CODE> 520 * </LI> 521 * </UL> 522 * 523 * @param urls Any list of {@code URL's}, some of which might have been duplicated. The 524 * difference between this {@code 'removeDuplicates'} and the other {@code 'removeDuplicates'} 525 * available in this class is that this one only removes multiple instances of the same 526 * {@code URL} in this {@code Vector}, while the other one iterates through a list of 527 * {@code URL's} already visited in a previous-session. 528 * 529 * <BR /><BR /><B>NOTE:</B> <I>Null {@code Vector}-values are skipped outright, they are 530 * neither removed nor changed.</i> 531 * 532 * @return The number of {@code Vector} elements that were removed. (i.e. <I>The size by which 533 * the {@code Vector} was shrunk.</I>) 534 */ 535 public static int removeDuplicates(Vector<URL> urls) 536 { 537 TreeSet<String> dups = new TreeSet<>(); 538 int count = 0; 539 int size = urls.size(); 540 URL url = null; 541 542 for (int i=0; i < size; i++) 543 544 if ((url = urls.elementAt(i)) != null) 545 if (! dups.add(urlToString(url))) 546 { 547 count++; 548 size--; 549 i--; 550 urls.removeElementAt(i); 551 } 552 553 return count; 554 } 555 556 /** 557 * This simple method will remove any {@code URL's} from the input {@code Vector} parameter 558 * {@code 'potentiallyNewURLs'} which are also present-members of the input {@code Vector} 559 * parameter {@code 'visitedURLs'}. 560 * 561 * <BR /><BR />This may seem trivial, and it is, but it worries about things like the 562 * {@code String's} Case for you. 563 * 564 * @param visitedURLs This parameter is a list of {@code URL's} that have already 565 * "been visited." 566 * 567 * @param potentiallyNewURLs This parameter is a list of {@code URL's} that are possibly 568 * "un-visited" - meaning whatever scrape, crawl or search being performed needs to know which 569 * {@code URL's} are listed in the previous parameter's contents. This may seem trivial, just 570 * use the java {@code url1.equals(url2)} command, but, alas, java doesn't exactly take into 571 * account upper-case and lower-case domain-names. This worries about case. 572 * 573 * @return The number of {@code URL's} that were removed from the input {@code Vector} 574 * parameter {@code 'potentiallyNewURLs'}. 575 */ 576 public static int removeDuplicates(Vector<URL> visitedURLs, Vector<URL> potentiallyNewURLs) 577 { 578 // The easiest way to check for duplicates is to build a tree-set of all the URL's as a 579 // String. Java's TreeSet<> generic already (automatically) scans for duplicates 580 // (efficiently) and will tell you if you have tried to add a duplicate 581 582 TreeSet<String> dups = new TreeSet<>(); 583 584 // Build a TreeSet of the url's from the "Visited URLs" parameter 585 visitedURLs.forEach(url -> dups.add(urlToString(url))); 586 587 // Add the "Possibly New URLs", one-by-one, and remove them if they are already in the 588 // visited list. 589 590 int count = 0; 591 int size = potentiallyNewURLs.size(); 592 URL url = null; 593 594 for (int i=0; i < size; i++) 595 596 if ((url = potentiallyNewURLs.elementAt(i)) != null) 597 598 if (! dups.add(urlToString(url))) 599 { 600 count++; 601 size--; 602 i--; 603 potentiallyNewURLs.removeElementAt(i); 604 } 605 606 return count; 607 } 608 609 /** 610 * Removes any Fragment-{@code URL} {@code '#'} symbols from a {@code URL}. 611 * 612 * <BR /><BR />If this {@code URL} contains a pound-sign Anchor-Name according to the Standard 613 * JDK's {@code URL.getRef()} method. Specifically, if {@code URL.getRef()} returns a non-null 614 * value, this method rebuilds the URL, without any Anchor-Name / Fragment information. 615 * 616 * <BR /><BR />The intention is to return a {@code URL} where any / all {@code String}-data 617 * that occurs after a {@code '#'} Hash-Tab / Pound-Sign is removed. 618 * 619 * @param url Any standard HTTP {@code URL}. If this {@code 'url'} contains a {@code '#'} 620 * (Pound Sign, Partial Reference) - according to the standard JDK {@code URL.getRef()} method, 621 * then it shall be removed. 622 * 623 * @return The {@code URL} without the partial-reference, or the original {@code URL} if there 624 * was no partial reference. Null is returned if there is an error instantiating the new 625 * {@code URL} without the partial-reference. 626 */ 627 public static URL shortenPoundREF(URL url) 628 { 629 try 630 { 631 if (url.getRef() != null) return new URL( 632 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 633 "://" + 634 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 635 ((url.getFile() != null) ? url.getFile() : "") 636 ); 637 638 else return url; 639 } 640 641 catch (MalformedURLException e) { return null; } 642 } 643 644 /** 645 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS> 646 * 647 * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}. 648 * 649 * @param ifExceptionSetNull If this parameter is passed {@code TRUE}, if there is ever an 650 * exception-throw while building the new {@code URL's} (without the fragment / pound-sign), 651 * then that position in the {@code Vector} will be replaced with a null. 652 * 653 * <BR /><BR />When this parameter is passed {@code FALSE}, if an exception is thrown, then 654 * it will be caught and silently ignored. 655 * 656 * @return The number / count of {@code URL's} in this list that were modified. Whenever a 657 * {@code URL} Named-Anchor is encountered, it will be removed from the {@code URL}, and a 658 * new {@code URL} without the fragment-part will be inserted to replace the old one. 659 * 660 * <BR /><BR />The integer that is returned here is the number of times that a replacement 661 * was made to the input {@code Vector}-parameter {@code 'urls'}. 662 */ 663 public static int shortenPoundREFs(Vector<URL> urls, boolean ifExceptionSetNull) 664 { 665 int pos = 0; 666 int shortenCount = 0; 667 668 for (int i = (urls.size() - 1); i >= 0; i--) 669 { 670 URL url = urls.elementAt(i); 671 672 try 673 { 674 if (url.getRef() != null) 675 { 676 URL newURL = new URL( 677 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 678 "://" + 679 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 680 ((url.getFile() != null) ? url.getFile() : "") 681 ); 682 683 urls.setElementAt(newURL, i); 684 shortenCount++; 685 } 686 } 687 688 catch (MalformedURLException e) 689 { if (ifExceptionSetNull) urls.setElementAt(null, i); } 690 } 691 692 return shortenCount; 693 } 694 695 /** 696 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS> 697 * 698 * <BR /><BR /><B CLASS=JDDescLabel>KE: Keep Exceptions</B> 699 * 700 * <BR />This method is identical to the previous method, defined above, except that it 701 * allows a programmer to keep / retain any {@code MalformedURLException's} that are thrown 702 * while re-building them. 703 * 704 * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}. 705 * 706 * @param ifExceptionSetNull If this is {@code TRUE} then if there is ever an exception building 707 * a new {@code URL} without a "Relative {@code URL '#'}" (Pound-Sign), then that position in 708 * the {@code Vector} will be replaced with 'null.' 709 * 710 * @return The number/count of {@code URL's} in this list that were modified. If a {@code URL} 711 * was modified, it was because it had a partial-page reference in it. If in the process of 712 * generating a new {@code URL} out of an old one, a {@code MalformedURLException} occurs, the 713 * exception will be placed in the {@code Ret2.b} position, which is a 714 * {@code Vector<MalformedURLException>}. 715 * 716 * <BR /><BR /><B>SPECIFICALLY:</B> 717 * 718 * <BR /><BR /><UL CLASS=JDUL> 719 * 720 * <LI> {@code Ret2.a = 'Integer'} number of {@code URL's} shortened for having a {@code '#'} 721 * partial-reference. 722 * </LI> 723 * 724 * <LI> {@code Ret2.b = Vector<MalformedURLException>} where each element of this 725 * {@code Vector} is null if there were no problems converting the {@code URL}, or the 726 * exception reference if there were exceptions thrown. 727 * </LI> 728 * 729 * </UL> 730 */ 731 public static Ret2<Integer, Vector<MalformedURLException>> shortenPoundREFs_KE 732 (Vector<URL> urls, boolean ifExceptionSetNull) 733 { 734 int pos = 0; 735 int shortenCount = 0; 736 Vector<MalformedURLException> v = new Vector<>(); 737 738 for (int i=0; i < urls.size(); i++) v.setElementAt(null, i); 739 740 for (int i = (urls.size() - 1); i >= 0; i--) 741 { 742 URL url = urls.elementAt(i); 743 744 try 745 { 746 if (url.getRef() != null) 747 { 748 URL newURL = new URL( 749 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 750 "://" + 751 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 752 ((url.getFile() != null) ? url.getFile() : "") 753 ); 754 755 urls.setElementAt(newURL, i); 756 shortenCount++; 757 } 758 } 759 760 catch (MalformedURLException e) 761 { 762 if (ifExceptionSetNull) urls.setElementAt(null, i); 763 v.setElementAt(e, i); 764 } 765 } 766 767 return new Ret2<Integer, Vector<MalformedURLException>>(Integer.valueOf(shortenCount), v); 768 } 769 770 /** 771 * On the internet, a {@code URL} is part case-sensitive, and part case-insensitive. The 772 * Domain-Name and Protocol ({@code http://}, and {@code 'some.company.com'}) portions of the 773 * {@code URL} <I>are Case-Insensitive - they may be in any combination of upper or lower 774 * case</I>. 775 * 776 * <BR /><BR />However, the directory, file-name, and (optional) Query-{@code String} portion 777 * of a {@code URL} are (often, but not always) Case-Sensitive. The sensitivity to case in 778 * these three parts of a {@code URL} is dependent upon the individual Web-Server that is 779 * providing the content for the {@code URL}. 780 * 781 * <BR /><BR />To summarize, DNS servers which monitor the Domain-Name part of a {@code URL} 782 * treat upper & lower case English-Letters as the same. Web-Server that utilize the File 783 * Directory part of a {@code URL} will sometimes care about case, and sometimes won't. This 784 * behavior is dependent upon how the Web-Master has configured his system. 785 * 786 * @param url This may be any Internet-Domain {@code URL} 787 * 788 * @return A {@code String} version of this {@code URL}, but the domain and protocol portions 789 * of the {@code URL} will be a "consistent" lower case. The case of the directory, file and 790 * (possibly, but not guaranteed to be present) {@code query-string} portion will not have 791 * their case modified either way. 792 * 793 * <BR /><BR /><B>NOTE:</B> This type of information is pretty important is you are attempting 794 * to scan for duplicate {@code URL's} or check their equality. 795 */ 796 public static String urlToString(URL url) 797 { 798 return 799 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + "://" + 800 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 801 ((url.getPath() != null) ? url.getPath() : "") + 802 ((url.getQuery() != null) ? ('?' + url.getQuery()) : "") + 803 ((url.getRef() != null) ? ('#' + url.getRef()) : ""); 804 } 805 806 /** 807 * As of today, the version of UNIX {@code curl} command does not seem to be downloading 808 * everything properly. It downloaded an image {@code '.png'} file just fine, but seemed to 809 * have botched a zip-file. This does what UNIX {@code 'curl'} command, <I>but does not 810 * actually invoke the UNIX operating system to do it.</I> It just does this... 811 * 812 * @param url This may be any URL, but it is intended to be a downloadable file. It will 813 * download {@code '.html'} files fine, but you may try images, data-files, zip-files, 814 * tar-archives, and movies. 815 * 816 * @param outFileName You must specify a file-name, and if this parameter is null, a 817 * {@code NullPointerException} will be thrown immediately. If you would like your program 818 * to guess the filename - <I>based on the file named in the URL</I>, please use the method 819 * {@code URL.getFile()}, or something to that effect. 820 * 821 * @param userAgent A User-Agent, as a {@code String}. If this parameter is passed null, 822 * it will be silently ignored, and a User-Agent won't be used. 823 * 824 * @throws IOException If there are I/O Errors when using the {@code HttpURLConnection}. 825 */ 826 public static void CURL(URL url, String outFileName, String userAgent) throws IOException 827 { 828 HttpURLConnection con = (HttpURLConnection) url.openConnection(); 829 830 con.setRequestMethod("GET"); 831 832 if (userAgent != null) con.setRequestProperty("User-Agent", userAgent); 833 834 InputStream is = con.getInputStream(); 835 FileOutputStream fos = new FileOutputStream(outFileName); 836 byte[] b = new byte[5000]; 837 int result = 0; 838 839 while ((result = is.read(b)) != -1) fos.write(b, 0, result); 840 841 fos.flush(); fos.close(); is.close(); 842 } 843}