001package Torello.Java; 002 003import java.util.*; 004import java.net.*; 005import java.util.regex.*; 006import java.io.*; 007 008import static Torello.Java.C.*; 009 010import Torello.Java.Additional.Ret2; 011 012/** 013 * A class that plays-with URL's, no more, no less. 014 * 015 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS> 016 */ 017@Torello.JavaDoc.StaticFunctional 018public class URLs 019{ 020 private URLs() { } 021 022 /** 023 * This is a Regular-Expression Pattern {@code (java.util.regex.Pattern)} - saved as a 024 * {@code String}. It is subsequently compiled. 025 * 026 * <BR /><BR />The primary function is to match {@code String's} that are intended to match 027 * HTTP-{@code URL's}. This Regular Expression matches: 028 * 029 * <BR /><BR /><UL CLASS=JDUL> 030 * <LI>{@code http(s)://...<any-text>.../}</LI> 031 * <LI>{@code http(s)://...<any-text, not front-slash>...}</LI> 032 * <LI>{@code http(s)://...<any-text>.../...<any-text, not front-slash>...}</LI> 033 * </UL> 034 * 035 * <BR /><BR /><B CLASS=JDDescLabel>Primarily used in:</B> 036 * 037 * <BR /><UL CLASS=JDUL> 038 * <LI>{@link #toProperURLV3(String)}</LI> 039 * <LI>{@link #toProperURLV4(String)}</LI> 040 * </UL> 041 * 042 * @see #P1 043 */ 044 protected static final String RE1 = 045 "^(http[s]?:\\/\\/.*?\\/$|http[s]?:\\/\\/[^\\/]*$|http[s]?:\\/\\/.*?\\/[^\\/]+)"; 046 047 /** 048 * {@code P1 = Pattern.compile(RE1);} 049 * 050 * @see #RE1 051 */ 052 protected static final Pattern P1 = Pattern.compile(RE1); 053 054 /** 055 * Java Help Messag Explaining {@code class java.net.URL} - and the specific output of its 056 * methods. 057 * 058 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_HELP_MSG> 059 * 060 * @param sw An instance of class StorageWriter. This parameter may be null, and if it is 061 * text-output will be sent to Standard-Output. 062 */ 063 protected static final void javaURLHelpMessage(StorageWriter sw) 064 { 065 if (sw == null) sw = new StorageWriter(); 066 067 String[] urlStrArr = { 068 "https://DALLASCITYHALL.com", "https://dallascityhall.com/", 069 "https://dallascityhall.com/news", 070 "https://dallascityhall.com/news/", "http://DALLASCITYHALL.com/news/ARTICLE-1.html", 071 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue", 072 "https://DallasCityHall.com/news/ARTICLE-1.html#subpart1", 073 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue", 074 "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue#LocalRef" 075 }; 076 077 URL[] urlArr = new URL[urlStrArr.length]; 078 079 try 080 { for (int i=0; i < urlStrArr.length; i++) urlArr[i] = new URL(urlStrArr[i]); } 081 082 catch (Exception e) 083 { 084 sw.println( 085 "Broke a URL, and it generated an exception.\n" + 086 "Sorry, fix the URL's in this method.\n" + 087 "Did you change them?" 088 ); 089 090 e.printStackTrace(); 091 return; 092 } 093 094 for (URL u : urlArr) 095 { 096 System.out.println( 097 "u.toString():\t\t" + BCYAN + u.toString() + RESET + '\n' + 098 "u.getProtocol():\t" + u.getProtocol() + '\n' + 099 "u.getHost():\t\t" + u.getHost() + '\n' + 100 "u.getPath():\t\t" + u.getPath() + '\n' + 101 "u.getFile():\t\t" + u.getFile() + '\n' + 102 "u.getQuery():\t\t" + u.getQuery() + '\n' + 103 "u.getRef():\t\t" + u.getRef() + '\n' + 104 "u.getAuthority():\t" + u.getAuthority() + '\n' + 105 "u.getUserInfo():\t" + u.getUserInfo() + '\n' + 106 "urlToString(u):\t\t" + urlToString(u) 107 ); 108 } 109 } 110 111 112 // ******************************************************************************************** 113 // ******************************************************************************************** 114 // Helper function for making URL address readable by web-servers. 115 //********************************************************************************************* 116 // ******************************************************************************************** 117 118 119 /** 120 * When scraping Spanish {@code URL's}, these characters can / should be escaped. 121 * 122 * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B> 123 * 124 * <BR />This array shall be considered parallel to the <B><I>replacement</I></B> 125 * {@code String[]}-Array {@link #VOWELS_URL}. 126 * 127 * @see #toProperURLV1(String) 128 * @see #VOWELS_URL 129 */ 130 protected static final char[] VOWELS = { 131 'á', 'É', 'é', 'Í', 'í', 'Ó', 'ó', 'Ú', 'ú', 'Ü', 'ü', 132 'Ñ', 'ñ', 'Ý', 'ý', '¿', '¡' 133 }; 134 135 /** 136 * When scraping Spanish {@code URL's}, these {@code String's} are the 137 * <B>URL Escape Sequences</B> for the Spanish Vowel Characters listed in {@link #VOWELS}. 138 * 139 * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B> 140 * 141 * <BR />This array shall be considered parallel to {@code String[]}-Array {@link #VOWELS}. 142 * 143 * @see #toProperURLV1(String) 144 * @see #VOWELS 145 */ 146 protected static final String[] VOWELS_URL = 147 { 148 "%C3%A1", "%C3%89", "%C3%A9", "%C3%8D", "%C3%AD", "%C3%93", "%C3%B3", "%C3%9A", 149 "%C3%BA", "%C3%9C", "%C3%BC", "%C3%91", "%C3%B1", "%C3%9D", "%C3%BD", "%C2%BF", 150 "%C2%A1" 151 }; 152 153 /** 154 * This will substitute many of the Spanish-characters that can make a web-query difficult. 155 * These are the substitutions listed: 156 * 157 * <BR /><BR /><TABLE CLASS=JDBriefTable> 158 * <TR><TH>Spanish Language Character</TH><TH>URL Escape Sequence</TH></TR> 159 * <TR><TD>{@code Á}</TD><TD>{@code %C3%81}</TD></TR> 160 * <TR><TD>{@code á}</TD><TD>{@code %C3%A1}</TD></TR> 161 * <TR><TD>{@code É}</TD><TD>{@code %C3%89}</TD></TR> 162 * <TR><TD>{@code é}</TD><TD>{@code %C3%A9}</TD></TR> 163 * <TR><TD>{@code Í}</TD><TD>{@code %C3%8D}</TD></TR> 164 * <TR><TD>{@code í}</TD><TD>{@code %C3%AD}</TD></TR> 165 * <TR><TD>{@code Ó}</TD><TD>{@code %C3%93}</TD></TR> 166 * <TR><TD>{@code ó}</TD><TD>{@code %C3%B3}</TD></TR> 167 * <TR><TD>{@code Ú}</TD><TD>{@code %C3%9A}</TD></TR> 168 * <TR><TD>{@code ú}</TD><TD>{@code %C3%BA}</TD></TR> 169 * <TR><TD>{@code Ü}</TD><TD>{@code %C3%9C}</TD></TR> 170 * <TR><TD>{@code ü}</TD><TD>{@code %C3%BC}</TD></TR> 171 * <TR><TD>{@code Ñ}</TD><TD>{@code %C3%91}</TD></TR> 172 * <TR><TD>{@code ñ}</TD><TD>{@code %C3%B1}</TD></TR> 173 * <TR><TD>{@code Ý}</TD><TD>{@code %C3%9D}</TD></TR> 174 * <TR><TD>{@code ý}</TD><TD>{@code %C3%BD}</TD></TR> 175 * </TABLE> 176 * 177 * <BR /><BR /><B CLASS=JDDescLabel>Historical Note:</B> 178 * 179 * <BR />This method was written the very first time that a {@code URL} needed to be escaped 180 * during the writing of the Java-HTML {@code '.jar'}. 181 * 182 * @param url Any website {@code URL} query. 183 * 184 * @return The same {@code URL} with substitutions made. 185 * 186 * @see #VOWELS 187 * @see #VOWELS_URL 188 * @see StrReplace#r(String, char[], String[]) 189 */ 190 public static String toProperURLV1(String url) 191 { return StrReplace.r(url, VOWELS, VOWELS_URL); } 192 193 /** 194 * This list of java {@code char's} are characters that are better off escaped when passing 195 * them through a {@code URL}. 196 * 197 * @see #toProperURLV2(String) 198 */ 199 protected static final char[] URL_ESC_CHARS = 200 { 201 '%', ' ', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\', 202 ']', '^', '{', '|', '}', '~', '\'', '+', ',' 203 }; 204 205 /** 206 * This method will clobber the leading Domain-Name and Protocol - 207 * {@code http://domain.name.something/} stuff. It is best to use this method on 208 * {@code String's} that will be inserted into a {@code URL} after the {@code '?'} 209 * question-mark, inside the Query-String. 210 * 211 * <BR /><BR />This can be very useful when sending JSON Arguments, for instance, inside a 212 * {@code URL's} Query-String, instead of the GET / POST part of a request. 213 * 214 * <BR /><BR />Note that this method should not be used to escape characters outside of the 215 * range of Standard-ASCII (characters {@code 0 ... 255}). 216 * 217 * <BR /><BR /><B CLASS=JDDescLabel>State of the Experiment:</B> 218 * 219 * <BR />It seems to help to escape these characters: 220 * 221 * <BR /><B STYLE="color:red;">{@code # $ % & @ ` / : ; < = > ? [ \ ] ^ | ~ " ' + ,} 222 * <CODE> { } </CODE></B> 223 * 224 * @param urlStuff Any information that is intended to be sent via an HTTP-{@code URL}, and 225 * needs to be escaped. 226 * 227 * @return An escaped version of this {@code URL-String} 228 * 229 * @see #URL_ESC_CHARS 230 * @see StrReplace#r(String, char[], IntCharFunction) 231 */ 232 public static String toProperURLV2(String urlStuff) 233 { 234 return StrReplace.r( 235 urlStuff, URL_ESC_CHARS, 236 (int i, char c) -> '%' + Integer.toHexString((int) c) 237 ); 238 } 239 240 /** 241 * This leaves out the actual domain name before starting HTTP-URL Escape Sequences. If this 242 * starts with the words "http://domain.something/" then the initial colon, forward-slash and 243 * periods won't be escaped. Everything after the first front-slash will include URL-HTTP 244 * Escape characters. 245 * 246 * <BR /><BR />This does the same thing as {@code toProperURLV2(String)}, but skips the initial 247 * part of the URL text/string - IF PRESENT! 248 * 249 * <BR /><BR />{@code http(s?)://domain.something/} is skipped by the Regular Expression, 250 * everything else from {@code URLV2} is escaped. 251 * 252 * @param url This may be any internet {@code URL}, represented as a {@code String}. It will 253 * be escaped with the {@code %INT} format. 254 * 255 * @return An escaped {@code URL String} 256 * 257 * @see #toProperURLV2(String) 258 * @see #P1 259 */ 260 public static String toProperURLV3(String url) 261 { 262 String beginsWith = null; 263 Matcher m = P1.matcher(url); 264 265 if (m.find()) 266 { 267 beginsWith = m.group(1); 268 url = url.substring(beginsWith.length()); 269 } 270 271 return ((beginsWith != null) ? beginsWith : "") + toProperURLV2(url); 272 } 273 274 /** 275 * This is a (shortened) list of characters that <I>should</I> be escaped before being used 276 * within a {@code URL}. 277 * 278 * <BR /><BR />This version differs from {@link #URL_ESC_CHARS} in that it does not include the 279 * {@code '&'} (ampersand), the {@code '?'} (question-mark) or the {@code '/'} (forward-slash). 280 * 281 * @see #URL_ESC_CHARS 282 * @see #toProperURLV4(String) 283 */ 284 protected static final char[] URL_ESC_CHARS_ABBREV = 285 { 286 '%', ' ', '#', '$', '@', '`', ':', ';', '<', '=', '>', '[', '\\', ']', 287 '^', '{', '|', '}', '~', '\'', '+', ',' 288 }; 289 290 /** 291 * This does the same thing as V3, but it also will avoid escaping any {@code '?'} 292 * (question-mark) or {@code '&'} (ampersand) or {@code '/'} (forward-slash) symbols anywhere 293 * in the entire {@code String}. It also "skips" escaping the initial 294 * {@code HTTP(s)://domain.net.something/} as well - just like {@code toProperURLV3} 295 * 296 * @return This does the same thing as {@code toProperURLV3(String)}, but leaves out 100% 297 * of the instances of Ampersand, Question-Mark, and Forward-Slash symbols. 298 * 299 * @see #toProperURLV3(String) 300 * @see #P1 301 * @see #URL_ESC_CHARS_ABBREV 302 * @see StrReplace#r(String, char[], IntCharFunction) 303 */ 304 public static String toProperURLV4(String url) 305 { 306 String beginsWith = null; 307 Matcher m = P1.matcher(url); 308 309 if (m.find()) 310 { 311 beginsWith = m.group(1); 312 url = url.substring(beginsWith.length()); 313 } 314 315 return ((beginsWith != null) ? beginsWith : "") + 316 StrReplace.r 317 (url, URL_ESC_CHARS_ABBREV, (int i, char c) -> '%' + Integer.toHexString((int) c)); 318 } 319 320 /** 321 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_PRP_URL_V5> 322 * 323 * @param url This is the URL to be encoded, properly 324 * 325 * @return A properly encoded URL String. Important, if calling the {@code java.net.URL} 326 * constructor generates a {@code MalformedURLException}, then this method shall return. 327 * The {@code java.net.URL} constructor will be called if the {@code String} passed begins with 328 * the characters {@code 'http://'} or {@code 'https://'}. 329 */ 330 public static String toProperURLV5(String url) 331 { 332 url = url.trim(); 333 334 URL u = null; 335 String[] sArr = null; 336 String tlc = url.toLowerCase(); 337 338 if (tlc.startsWith("http://") || tlc.startsWith("https://")) 339 { try { u = new URL(url); } catch (Exception e) { return null; } } 340 341 if (u == null) sArr = url.split("/"); 342 else sArr = u.getPath().split("/"); 343 344 String slash = ""; 345 StringBuilder sb = new StringBuilder(); 346 347 for (String s : sArr) 348 { 349 try 350 { sb.append(slash + java.net.URLEncoder.encode(s, "UTF-8")); } 351 352 catch (UnsupportedEncodingException e) 353 { /* This really cannot happen, and I don't know what to put here! */ } 354 355 slash = "/"; 356 } 357 358 if (u == null) 359 return sb.toString(); 360 else 361 return 362 u.getProtocol() + "://" + u.getHost() + sb.toString() + 363 ((u.getQuery() != null) ? ("?" + u.getQuery()) : "") + 364 ((u.getRef() != null) ? ("#" + u.getRef()) : ""); 365 } 366 367 /** 368 * Rather than trying to explain what is escaped and what is left alone, please review the 369 * exact code here. 370 * 371 * <BR /><BR /><B CLASS=JDDescLabel>Another One:</B> 372 * 373 * <BR />Well, I just wrote another one, they told me to. This, newest version of 374 * {@code URL}-Encoding is actually pretty successful. It handles all Extra-Characters and is 375 * capable of dealing with {@code URL's} that contain the {@code '?' '=' '&'} operators of 376 * {@code GET}-Requests. 377 * 378 * <BR /><BR />Realize that though the out-of-the-box JDK, there is a class called 379 * "URI Encoder" - but that class expects that the {@code URL} to have already been separated 380 * out into it's distinct parts. 381 * 382 * <BR /><BR />This method does the the {@code URL}-Separating into disparate parts 383 * before performing the Character-Escaping. 384 * 385 * @param url This is any java {@code URL}. 386 * 387 * @return a new {@code String} version of the input parameter {@code 'url'} 388 */ 389 public static String toProperURLV6(String url) 390 { 391 URL u = null; 392 393 try 394 { u = new URL(url); } 395 396 catch (Exception e) { return null; } 397 398 StringBuilder sb = new StringBuilder(); 399 400 sb.append(u.getProtocol()); 401 sb.append("://"); 402 sb.append(u.getHost()); 403 sb.append(toProperURLV5(u.getPath())); 404 405 if (u.getQuery() != null) 406 { 407 String[] sArr = u.getQuery().split("&"); 408 StringBuilder sb2 = new StringBuilder(); 409 String ampersand = ""; 410 411 for (String s : sArr) 412 { 413 String[] s2Arr = s.split("="); 414 StringBuilder sb3 = new StringBuilder(); 415 String equals = ""; 416 417 for (String s2: s2Arr) 418 { 419 try 420 { sb3.append(equals + java.net.URLEncoder.encode(s2, "UTF-8")); } 421 422 // This should never happen - UTF-8 is (sort-of) the only encoding. 423 catch (UnsupportedEncodingException e) { } 424 425 equals = "="; 426 } 427 428 sb2.append(ampersand + sb3.toString()); 429 ampersand = "&"; 430 } 431 432 sb.append("?" + sb2.toString()); 433 } 434 435 // Not really a clue, because a the "#" operator and the "?" probably shouldn't be used 436 // together. Java's java.net.URL class will parse a URL that has both the ? and the #, but 437 // I have no idea which Web-Sites would allow this, or encourage this... 438 439 if (u.getRef() != null) 440 441 try 442 { sb.append("#" + java.net.URLEncoder.encode(u.getRef(), "UTF-8")); } 443 444 catch (UnsupportedEncodingException e) { } 445 446 return sb.toString(); 447 } 448 449 /** 450 * These strictly use Java's URI Encoding Mechanism. They seem to work the same as "V6" 451 * Internally, these are now used. This as of November, 2019. 452 * 453 * @param url A Complete Java {@code URL}, as a {@code String}. Any specialized 454 * Escape-Characters that need to be escaped, will be. 455 * 456 * @throws URISyntaxException This will throw if building the {@code URI} generates an 457 * exception. Internally, all this method does is build a {@code URI}, and then call the Java 458 * Method {@code 'toASCIIString()'} 459 */ 460 public static String toProperURLV7(String url) throws URISyntaxException, MalformedURLException 461 { return toProperURLV8(new URL(url)); } 462 463 /** 464 * These strictly use Java's URI Encoding Mechanism. They seem to work the same as "V6" 465 * Internally, these are now used. This as of November, 2019. 466 * 467 * @param url A Complete Java {@code URL}. Any specialized Escape-Characters that need to be 468 * escaped, will be. 469 * 470 * @throws URISyntaxException This will throw if building the URI generates an exception. 471 * Internally, all this method does is build a URI, and then call the Java Method 472 * {@code 'toASCIIString()'} 473 */ 474 public static String toProperURLV8(URL url) throws URISyntaxException, MalformedURLException 475 { 476 return new URI( 477 url.getProtocol(), 478 url.getUserInfo(), 479 url.getHost(), 480 url.getPort(), 481 url.getPath(), 482 url.getQuery(), 483 url.getRef() 484 ).toASCIIString(); 485 } 486 487 488 // ******************************************************************************************** 489 // ******************************************************************************************** 490 // The original "URLs" class 491 //********************************************************************************************* 492 // ******************************************************************************************** 493 494 495 /** 496 * If you have a list of {@code URL's}, and want to quickly remove any 497 * duplicate-{@code URL's} found in the list - this will remove them. 498 * 499 * <BR /><BR /><B CLASS=JDDescLabel>Case Sensitivity:</B> 500 * 501 * <BR />This method will perform a few "to-lower-case" operations on the protocol and 502 * Web-Domain parts, but not on the file, directory, or Query-String portion of the 503 * {@code URL}. 504 * 505 * <BR /><BR />This should hilite what is Case-Sensitive, and what is not: 506 * 507 * <BR /><BR /><UL CLASS=JDUL> 508 * <LI> These are considered duplicate URL's: 509 * <BR /> 510 * <BR /><CODE>http://some.company.com/index.html</CODE> 511 * <BR /><CODE>HTTP://SOME.COMPANY.COM/index.html</CODE> 512 * <BR /><BR /> 513 * </LI> 514 * 515 * <LI> These are <I>not</I> considered duplicate URL's: 516 * <BR /> 517 * <BR /><CODE>http://other.company.com/Directory/Ben-Bitdiddle.html</CODE> 518 * <BR /><CODE>http://other.company.com/DIRECTORY/BE.html</CODE> 519 * </LI> 520 * </UL> 521 * 522 * @param urls Any list of {@code URL's}, some of which might have been duplicated. The 523 * difference between this {@code 'removeDuplicates'} and the other {@code 'removeDuplicates'} 524 * available in this class is that this one only removes multiple instances of the same 525 * {@code URL} in this {@code Vector}, while the other one iterates through a list of 526 * {@code URL's} already visited in a previous-session. 527 * 528 * <BR /><BR /><B>NOTE:</B> <I>Null {@code Vector}-values are skipped outright, they are 529 * neither removed nor changed.</i> 530 * 531 * @return The number of {@code Vector} elements that were removed. (i.e. <I>The size by which 532 * the {@code Vector} was shrunk.</I>) 533 */ 534 public static int removeDuplicates(Vector<URL> urls) 535 { 536 TreeSet<String> dups = new TreeSet<>(); 537 int count = 0; 538 int size = urls.size(); 539 URL url = null; 540 541 for (int i=0; i < size; i++) 542 543 if ((url = urls.elementAt(i)) != null) 544 if (! dups.add(urlToString(url))) 545 { 546 count++; 547 size--; 548 i--; 549 urls.removeElementAt(i); 550 } 551 552 return count; 553 } 554 555 /** 556 * This simple method will remove any {@code URL's} from the input {@code Vector} parameter 557 * {@code 'potentiallyNewURLs'} which are also present-members of the input {@code Vector} 558 * parameter {@code 'visitedURLs'}. 559 * 560 * <BR /><BR />This may seem trivial, and it is, but it worries about things like the 561 * {@code String's} Case for you. 562 * 563 * @param visitedURLs This parameter is a list of {@code URL's} that have already 564 * "been visited." 565 * 566 * @param potentiallyNewURLs This parameter is a list of {@code URL's} that are possibly 567 * "un-visited" - meaning whatever scrape, crawl or search being performed needs to know which 568 * {@code URL's} are listed in the previous parameter's contents. This may seem trivial, just 569 * use the java {@code url1.equals(url2)} command, but, alas, java doesn't exactly take into 570 * account upper-case and lower-case domain-names. This worries about case. 571 * 572 * @return The number of {@code URL's} that were removed from the input {@code Vector} 573 * parameter {@code 'potentiallyNewURLs'}. 574 */ 575 public static int removeDuplicates(Vector<URL> visitedURLs, Vector<URL> potentiallyNewURLs) 576 { 577 // The easiest way to check for duplicates is to build a tree-set of all the URL's as a 578 // String. Java's TreeSet<> generic already (automatically) scans for duplicates 579 // (efficiently) and will tell you if you have tried to add a duplicate 580 581 TreeSet<String> dups = new TreeSet<>(); 582 583 // Build a TreeSet of the url's from the "Visited URLs" parameter 584 visitedURLs.forEach(url -> dups.add(urlToString(url))); 585 586 // Add the "Possibly New URLs", one-by-one, and remove them if they are already in the 587 // visited list. 588 589 int count = 0; 590 int size = potentiallyNewURLs.size(); 591 URL url = null; 592 593 for (int i=0; i < size; i++) 594 595 if ((url = potentiallyNewURLs.elementAt(i)) != null) 596 597 if (! dups.add(urlToString(url))) 598 { 599 count++; 600 size--; 601 i--; 602 potentiallyNewURLs.removeElementAt(i); 603 } 604 605 return count; 606 } 607 608 /** 609 * Removes any Fragment-{@code URL} {@code '#'} symbols from a {@code URL}. 610 * 611 * <BR /><BR />If this {@code URL} contains a pound-sign Anchor-Name according to the Standard 612 * JDK's {@code URL.getRef()} method. Specifically, if {@code URL.getRef()} returns a non-null 613 * value, this method rebuilds the URL, without any Anchor-Name / Fragment information. 614 * 615 * <BR /><BR />The intention is to return a {@code URL} where any / all {@code String}-data 616 * that occurs after a {@code '#'} Hash-Tab / Pound-Sign is removed. 617 * 618 * @param url Any standard HTTP {@code URL}. If this {@code 'url'} contains a {@code '#'} 619 * (Pound Sign, Partial Reference) - according to the standard JDK {@code URL.getRef()} method, 620 * then it shall be removed. 621 * 622 * @return The {@code URL} without the partial-reference, or the original {@code URL} if there 623 * was no partial reference. Null is returned if there is an error instantiating the new 624 * {@code URL} without the partial-reference. 625 */ 626 public static URL shortenPoundREF(URL url) 627 { 628 try 629 { 630 if (url.getRef() != null) return new URL( 631 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 632 "://" + 633 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 634 ((url.getFile() != null) ? url.getFile() : "") 635 ); 636 637 else return url; 638 } 639 640 catch (MalformedURLException e) { return null; } 641 } 642 643 /** 644 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS> 645 * 646 * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}. 647 * 648 * @param ifExceptionSetNull If this parameter is passed {@code TRUE}, if there is ever an 649 * exception-throw while building the new {@code URL's} (without the fragment / pound-sign), 650 * then that position in the {@code Vector} will be replaced with a null. 651 * 652 * <BR /><BR />When this parameter is passed {@code FALSE}, if an exception is thrown, then 653 * it will be caught and silently ignored. 654 * 655 * @return The number / count of {@code URL's} in this list that were modified. Whenever a 656 * {@code URL} Named-Anchor is encountered, it will be removed from the {@code URL}, and a 657 * new {@code URL} without the fragment-part will be inserted to replace the old one. 658 * 659 * <BR /><BR />The integer that is returned here is the number of times that a replacement 660 * was made to the input {@code Vector}-parameter {@code 'urls'}. 661 */ 662 public static int shortenPoundREFs(Vector<URL> urls, boolean ifExceptionSetNull) 663 { 664 int pos = 0; 665 int shortenCount = 0; 666 667 for (int i = (urls.size() - 1); i >= 0; i--) 668 { 669 URL url = urls.elementAt(i); 670 671 try 672 { 673 if (url.getRef() != null) 674 { 675 URL newURL = new URL( 676 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 677 "://" + 678 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 679 ((url.getFile() != null) ? url.getFile() : "") 680 ); 681 682 urls.setElementAt(newURL, i); 683 shortenCount++; 684 } 685 } 686 687 catch (MalformedURLException e) 688 { if (ifExceptionSetNull) urls.setElementAt(null, i); } 689 } 690 691 return shortenCount; 692 } 693 694 /** 695 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS> 696 * 697 * <BR /><BR /><B CLASS=JDDescLabel>KE: Keep Exceptions</B> 698 * 699 * <BR />This method is identical to the previous method, defined above, except that it 700 * allows a programmer to keep / retain any {@code MalformedURLException's} that are thrown 701 * while re-building them. 702 * 703 * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}. 704 * 705 * @param ifExceptionSetNull If this is {@code TRUE} then if there is ever an exception building 706 * a new {@code URL} without a "Relative {@code URL '#'}" (Pound-Sign), then that position in 707 * the {@code Vector} will be replaced with 'null.' 708 * 709 * @return The number/count of {@code URL's} in this list that were modified. If a {@code URL} 710 * was modified, it was because it had a partial-page reference in it. If in the process of 711 * generating a new {@code URL} out of an old one, a {@code MalformedURLException} occurs, the 712 * exception will be placed in the {@code Ret2.b} position, which is a 713 * {@code Vector<MalformedURLException>}. 714 * 715 * <BR /><BR /><B>SPECIFICALLY:</B> 716 * 717 * <BR /><BR /><UL CLASS=JDUL> 718 * 719 * <LI> {@code Ret2.a = 'Integer'} number of {@code URL's} shortened for having a {@code '#'} 720 * partial-reference. 721 * </LI> 722 * 723 * <LI> {@code Ret2.b = Vector<MalformedURLException>} where each element of this 724 * {@code Vector} is null if there were no problems converting the {@code URL}, or the 725 * exception reference if there were exceptions thrown. 726 * </LI> 727 * 728 * </UL> 729 */ 730 public static Ret2<Integer, Vector<MalformedURLException>> shortenPoundREFs_KE 731 (Vector<URL> urls, boolean ifExceptionSetNull) 732 { 733 int pos = 0; 734 int shortenCount = 0; 735 Vector<MalformedURLException> v = new Vector<>(); 736 737 for (int i=0; i < urls.size(); i++) v.setElementAt(null, i); 738 739 for (int i = (urls.size() - 1); i >= 0; i--) 740 { 741 URL url = urls.elementAt(i); 742 743 try 744 { 745 if (url.getRef() != null) 746 { 747 URL newURL = new URL( 748 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + 749 "://" + 750 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 751 ((url.getFile() != null) ? url.getFile() : "") 752 ); 753 754 urls.setElementAt(newURL, i); 755 shortenCount++; 756 } 757 } 758 759 catch (MalformedURLException e) 760 { 761 if (ifExceptionSetNull) urls.setElementAt(null, i); 762 v.setElementAt(e, i); 763 } 764 } 765 766 return new Ret2<Integer, Vector<MalformedURLException>>(Integer.valueOf(shortenCount), v); 767 } 768 769 /** 770 * On the internet, a {@code URL} is part case-sensitive, and part case-insensitive. The 771 * Domain-Name and Protocol ({@code http://}, and {@code 'some.company.com'}) portions of the 772 * {@code URL} <I>are Case-Insensitive - they may be in any combination of upper or lower 773 * case</I>. 774 * 775 * <BR /><BR />However, the directory, file-name, and (optional) Query-{@code String} portion 776 * of a {@code URL} are (often, but not always) Case-Sensitive. The sensitivity to case in 777 * these three parts of a {@code URL} is dependent upon the individual Web-Server that is 778 * providing the content for the {@code URL}. 779 * 780 * <BR /><BR />To summarize, DNS servers which monitor the Domain-Name part of a {@code URL} 781 * treat upper & lower case English-Letters as the same. Web-Server that utilize the File 782 * Directory part of a {@code URL} will sometimes care about case, and sometimes won't. This 783 * behavior is dependent upon how the Web-Master has configured his system. 784 * 785 * @param url This may be any Internet-Domain {@code URL} 786 * 787 * @return A {@code String} version of this {@code URL}, but the domain and protocol portions 788 * of the {@code URL} will be a "consistent" lower case. The case of the directory, file and 789 * (possibly, but not guaranteed to be present) {@code query-string} portion will not have 790 * their case modified either way. 791 * 792 * <BR /><BR /><B>NOTE:</B> This type of information is pretty important is you are attempting 793 * to scan for duplicate {@code URL's} or check their equality. 794 */ 795 public static String urlToString(URL url) 796 { 797 return 798 ((url.getProtocol() != null) ? url.getProtocol().toLowerCase() : "") + "://" + 799 ((url.getHost() != null) ? url.getHost().toLowerCase() : "") + 800 ((url.getPath() != null) ? url.getPath() : "") + 801 ((url.getQuery() != null) ? ('?' + url.getQuery()) : "") + 802 ((url.getRef() != null) ? ('#' + url.getRef()) : ""); 803 } 804 805 /** 806 * As of today, the version of UNIX {@code curl} command does not seem to be downloading 807 * everything properly. It downloaded an image {@code '.png'} file just fine, but seemed to 808 * have botched a zip-file. This does what UNIX {@code 'curl'} command, <I>but does not 809 * actually invoke the UNIX operating system to do it.</I> It just does this... 810 * 811 * @param url This may be any URL, but it is intended to be a downloadable file. It will 812 * download {@code '.html'} files fine, but you may try images, data-files, zip-files, 813 * tar-archives, and movies. 814 * 815 * @param outFileName You must specify a file-name, and if this parameter is null, a 816 * {@code NullPointerException} will be thrown immediately. If you would like your program 817 * to guess the filename - <I>based on the file named in the URL</I>, please use the method 818 * {@code URL.getFile()}, or something to that effect. 819 * 820 * @param userAgent A User-Agent, as a {@code String}. If this parameter is passed null, 821 * it will be silently ignored, and a User-Agent won't be used. 822 * 823 * @throws IOException If there are I/O Errors when using the {@code HttpURLConnection}. 824 */ 825 public static void CURL(URL url, String outFileName, String userAgent) throws IOException 826 { 827 HttpURLConnection con = (HttpURLConnection) url.openConnection(); 828 829 con.setRequestMethod("GET"); 830 831 if (userAgent != null) con.setRequestProperty("User-Agent", userAgent); 832 833 InputStream is = con.getInputStream(); 834 FileOutputStream fos = new FileOutputStream(outFileName); 835 byte[] b = new byte[5000]; 836 int result = 0; 837 838 while ((result = is.read(b)) != -1) fos.write(b, 0, result); 839 840 fos.flush(); fos.close(); is.close(); 841 } 842}