Source code

001package Torello.Java.Additional;
002
003import java.util.*;
004import java.net.*;
005import java.util.regex.*;
006import java.io.*;
007
008import static Torello.Java.C.*;
009
010import Torello.Java.StorageWriter;
011import Torello.Java.StrReplace;
012
013/**
014 * A class that plays-with URL's, no more, no less.
015 * 
016 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS>
017 */
018@Torello.JavaDoc.StaticFunctional
019public class URLs
020{
    // Private, empty constructor: prevents code outside this class from instantiating it.
    private URLs() { }
022
    /**
     * This is a Regular-Expression Pattern {@code (java.util.regex.Pattern)} - saved as a 
     * {@code String}.  It is subsequently compiled into {@link #P1}.
     *
     * <BR /><BR />The primary function is to match the leading portion of {@code String's} that
     * are intended to be HTTP-{@code URL's}.  The expression is anchored to the beginning of the
     * input ({@code '^'}), has a single capturing group (group #1), and that group is an
     * alternation of three cases:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code http(s)://...<any-text>.../} - the <B>entire</B> input, when it ends with a
     *     front-slash</LI>
     * <LI>{@code http(s)://...<any-text, not front-slash>...} - the <B>entire</B> input, when
     *     nothing after the {@code "//"} contains a front-slash</LI>
     * <LI>{@code http(s)://...<any-text>.../...<any-text, not front-slash>...} - everything up
     *     to the first front-slash that follows the {@code "//"}, plus the run of
     *     non-front-slash characters immediately after it</LI>
     * </UL>
     * 
     * <BR /><BR />The scheme is written in lower-case inside the expression, and {@link #P1} is
     * compiled without any flags, so {@code "HTTP://"} (upper-case) is not matched.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Primarily used in:</B>
     * 
     * <BR /><UL CLASS=JDUL>
     * <LI>{@link #toProperURLV3(String)}</LI>
     * <LI>{@link #toProperURLV4(String)}</LI>
     * </UL>
     * 
     * @see #P1
     */
    protected static final String RE1 =
         "^(http[s]?:\\/\\/.*?\\/$|http[s]?:\\/\\/[^\\/]*$|http[s]?:\\/\\/.*?\\/[^\\/]+)";

    /**
     * {@code P1 = Pattern.compile(RE1);}
     * 
     * <BR /><BR />Compiled once, as a constant, so that the methods which use it do not have to
     * re-compile the expression on every invocation.
     * 
     * @see #RE1
     */
    protected static final Pattern P1 = Pattern.compile(RE1);
054
055    /**
056     * Java Help Messag Explaining {@code class java.net.URL} - and the specific output of its
057     * methods.
058     *
059     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_HELP_MSG>
060     *
061     * @param sw An instance of class StorageWriter.  This parameter may be null, and if it is
062     * text-output will be sent to Standard-Output.
063     */
064    protected static final void javaURLHelpMessage(StorageWriter sw)
065    {
066        if (sw == null) sw = new StorageWriter();
067
068        String[] urlStrArr = {
069            "https://DALLASCITYHALL.com", "https://dallascityhall.com/",
070            "https://dallascityhall.com/news",
071            "https://dallascityhall.com/news/", "http://DALLASCITYHALL.com/news/ARTICLE-1.html",
072            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue",
073            "https://DallasCityHall.com/news/ARTICLE-1.html#subpart1",
074            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue",
075            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue#LocalRef"
076        };
077
078        URL[] urlArr = new URL[urlStrArr.length];
079
080        try
081            { for (int i=0; i < urlStrArr.length; i++) urlArr[i] = new URL(urlStrArr[i]); }
082
083        catch (Exception e)
084        {
085            sw.println(
086                "Broke a URL, and it generated an exception.\n" +
087                "Sorry, fix the URL's in this method.\n" + 
088                "Did you change them?"
089            );
090
091            e.printStackTrace();
092            return;
093        }
094
095        for (URL u : urlArr)
096        {
097            System.out.println(
098                "u.toString():\t\t"     + BCYAN + u.toString() + RESET + '\n' +
099                "u.getProtocol():\t"    + u.getProtocol() + '\n' +
100                "u.getHost():\t\t"      + u.getHost() + '\n' +
101                "u.getPath():\t\t"      + u.getPath() + '\n' +
102                "u.getFile():\t\t"      + u.getFile() + '\n' +
103                "u.getQuery():\t\t"     + u.getQuery() + '\n' +
104                "u.getRef():\t\t"       + u.getRef() + '\n' +
105                "u.getAuthority():\t"   + u.getAuthority() + '\n' +
106                "u.getUserInfo():\t"    + u.getUserInfo() + '\n' +
107                "urlToString(u):\t\t"   + urlToString(u)
108            );
109        }
110    }
111
112
113    // ********************************************************************************************
114    // ********************************************************************************************
115    // Helper function for making URL address readable by web-servers.
116    //*********************************************************************************************
117    // ********************************************************************************************
118
119
120    /**
121     * When scraping Spanish {@code URL's}, these characters can / should be escaped.
122     * 
123     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
124     * 
125     * <BR />This array shall be considered parallel to the <B><I>replacement</I></B>
126     * {@code String[]}-Array {@link #VOWELS_URL}.
127     * 
128     * @see #toProperURLV1(String)
129     * @see #VOWELS_URL
130     */
131    protected static final char[] VOWELS = {
132        'á', 'É', 'é', 'Í', 'í', 'Ó', 'ó', 'Ú', 'ú', 'Ü', 'ü',
133        'Ñ', 'ñ', 'Ý', 'ý', '¿', '¡'
134    };
135
136    /**
137     * When scraping Spanish {@code URL's}, these {@code String's} are the
138     * <B>URL Escape Sequences</B> for the Spanish Vowel Characters listed in {@link #VOWELS}.
139     * 
140     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
141     * 
142     * <BR />This array shall be considered parallel to {@code String[]}-Array {@link #VOWELS}.
143     * 
144     * @see #toProperURLV1(String)
145     * @see #VOWELS
146     */
147    protected static final String[] VOWELS_URL =
148    {
149        "%C3%A1", "%C3%89", "%C3%A9", "%C3%8D", "%C3%AD", "%C3%93", "%C3%B3", "%C3%9A",
150        "%C3%BA", "%C3%9C", "%C3%BC", "%C3%91", "%C3%B1", "%C3%9D", "%C3%BD", "%C2%BF",
151        "%C2%A1"
152    };
153
    /**
     * This will substitute many of the Spanish-characters that can make a web-query difficult.
     * The substitutions actually performed are exactly those defined by the parallel arrays
     * {@link #VOWELS} and {@link #VOWELS_URL}; every other character of the input is left
     * unchanged.  These are the substitutions listed:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Spanish Language Character</TH><TH>URL Escape Sequence</TH></TR>
     * <TR><TD>{@code Á}</TD><TD>{@code %C3%81}</TD></TR>
     * <TR><TD>{@code á}</TD><TD>{@code %C3%A1}</TD></TR>
     * <TR><TD>{@code É}</TD><TD>{@code %C3%89}</TD></TR>
     * <TR><TD>{@code é}</TD><TD>{@code %C3%A9}</TD></TR>
     * <TR><TD>{@code Í}</TD><TD>{@code %C3%8D}</TD></TR>
     * <TR><TD>{@code í}</TD><TD>{@code %C3%AD}</TD></TR>
     * <TR><TD>{@code Ó}</TD><TD>{@code %C3%93}</TD></TR>
     * <TR><TD>{@code ó}</TD><TD>{@code %C3%B3}</TD></TR>
     * <TR><TD>{@code Ú}</TD><TD>{@code %C3%9A}</TD></TR>
     * <TR><TD>{@code ú}</TD><TD>{@code %C3%BA}</TD></TR>
     * <TR><TD>{@code Ü}</TD><TD>{@code %C3%9C}</TD></TR>
     * <TR><TD>{@code ü}</TD><TD>{@code %C3%BC}</TD></TR>
     * <TR><TD>{@code Ñ}</TD><TD>{@code %C3%91}</TD></TR>
     * <TR><TD>{@code ñ}</TD><TD>{@code %C3%B1}</TD></TR>
     * <TR><TD>{@code Ý}</TD><TD>{@code %C3%9D}</TD></TR>
     * <TR><TD>{@code ý}</TD><TD>{@code %C3%BD}</TD></TR>
     * <TR><TD>{@code ¿}</TD><TD>{@code %C2%BF}</TD></TR>
     * <TR><TD>{@code ¡}</TD><TD>{@code %C2%A1}</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Historical Note:</B>
     * 
     * <BR />This method was written the very first time that a {@code URL} needed to be escaped
     * during the writing of the Java-HTML {@code '.jar'}.
     *
     * @param url  Any website {@code URL} query.
     *
     * @return The same {@code URL} with substitutions made.
     * 
     * @see #VOWELS
     * @see #VOWELS_URL
     * @see StrReplace#r(String, char[], String[])
     */
    public static String toProperURLV1(String url)
    { return StrReplace.r(url, VOWELS, VOWELS_URL); }
193
    /**
     * This list of java {@code char's} are characters that are better off escaped when passing
     * them through a {@code URL}.
     * 
     * <BR /><BR />The list includes the percent-sign itself, the space character, and the
     * {@code URL} structural characters {@code '/'}, {@code ':'}, {@code '?'}, {@code '&'},
     * {@code '='} and {@code '#'}.  Because of that, applying it to a complete {@code URL} also
     * escapes the protocol and domain separators.
     * 
     * @see #toProperURLV2(String)
     * @see #URL_ESC_CHARS_ABBREV
     */
    protected static final char[] URL_ESC_CHARS =
    {
        '%', ' ', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\',
        ']', '^', '{', '|', '}', '~', '\'', '+', ','
    };
205
    /**
     * Escapes every occurrence of a character listed in {@link #URL_ESC_CHARS} by replacing it
     * with a percent-sign followed by the character's code in (lower-case) hexadecimal.
     * 
     * <BR /><BR />This method will clobber the leading Domain-Name and Protocol -
     * {@code http://domain.name.something/} stuff.  It is best to use this method on
     * {@code String's} that will be inserted into a {@code URL} after the {@code '?'}
     * question-mark, inside the Query-String.
     * 
     * <BR /><BR />This can be very useful when sending JSON Arguments, for instance, inside a
     * {@code URL's} Query-String, instead of the GET / POST part of a request.
     * 
     * <BR /><BR />Note that this method should not be used to escape characters outside of the
     * range of Standard-ASCII (characters {@code 0 ... 255}).
     *
     * <BR /><BR /><B CLASS=JDDescLabel>State of the Experiment:</B>
     * 
     * <BR />It seems to help to escape these characters:
     * 
     * <BR /><B STYLE="color:red;">{@code # $ % & @ ` / : ; < = > ? [ \ ] ^ | ~ " ' + ,}
     * <CODE> { } </CODE></B>
     * 
     * <BR /><BR /><B>NOTE:</B> The characters that are actually escaped are those in
     * {@link #URL_ESC_CHARS}.  That array also contains the space character, and it does
     * <I>not</I> contain the double-quote {@code '"'} shown in the list above.
     * 
     * @param urlStuff Any information that is intended to be sent via an HTTP-{@code URL}, and
     * needs to be escaped.
     *
     * @return An escaped version of this {@code URL-String}
     * 
     * @see #URL_ESC_CHARS
     * @see StrReplace#r(String, char[], IntCharFunction)
     */
    public static String toProperURLV2(String urlStuff)
    {
        return StrReplace.r(
            urlStuff, URL_ESC_CHARS,

            // char + String is String-concatenation: yields "%" followed by the hex-digits
            // of the character's code (Integer.toHexString produces lower-case digits).
            (int i, char c) -> '%' + Integer.toHexString((int) c)
        );
    }
240
241    /**
242     * This leaves out the actual domain name before starting HTTP-URL Escape Sequences.  If this
243     * starts with the words "http://domain.something/" then the initial colon, forward-slash and
244     * periods won't be escaped.  Everything after the first front-slash will include URL-HTTP
245     * Escape characters.
246     *
247     * <BR /><BR />This does the same thing as {@code toProperURLV2(String)}, but skips the initial
248     * part of the URL text/string - IF PRESENT!
249     * 
250     * <BR /><BR />{@code http(s?)://domain.something/} is skipped by the Regular Expression, 
251     * everything else from {@code URLV2} is escaped.
252     *
253     * @param url This may be any internet {@code URL}, represented as a {@code String}.  It will
254     * be escaped with the {@code %INT} format.
255     *
256     * @return An escaped {@code URL String}
257     *
258     * @see #toProperURLV2(String)
259     * @see #P1
260     */
261    public static String toProperURLV3(String url)
262    {
263        String  beginsWith  = null;
264        Matcher m           = P1.matcher(url);
265
266        if (m.find())
267        {
268            beginsWith = m.group(1); 
269            url = url.substring(beginsWith.length());
270        }
271
272        return ((beginsWith != null) ? beginsWith : "") + toProperURLV2(url);
273    }
274
    /**
     * This is a (shortened) list of characters that <I>should</I> be escaped before being used
     * within a {@code URL}.
     * 
     * <BR /><BR />This version differs from {@link #URL_ESC_CHARS} in that it does not include the
     * {@code '&'} (ampersand), the {@code '?'} (question-mark) or the {@code '/'} (forward-slash).
     * Every other character of {@code URL_ESC_CHARS} is present here - including the colon
     * {@code ':'}, the equals-sign {@code '='} and the pound-sign {@code '#'}.
     * 
     * @see #URL_ESC_CHARS
     * @see #toProperURLV4(String)
     */
    protected static final char[] URL_ESC_CHARS_ABBREV =
    {
        '%', ' ', '#', '$', '@', '`', ':', ';', '<', '=', '>', '[', '\\', ']',
        '^', '{', '|', '}', '~', '\'', '+', ','
    };
290
291    /**
292     * This does the same thing as V3, but it also will avoid escaping any {@code '?'} 
293     * (question-mark) or {@code '&'} (ampersand) or {@code '/'} (forward-slash) symbols anywhere
294     * in the entire {@code String}.  It also "skips" escaping the initial
295     * {@code HTTP(s)://domain.net.something/} as well - just like {@code toProperURLV3}
296     *
297     * @return This does the same thing as {@code toProperURLV3(String)}, but leaves out 100%
298     * of the instances of Ampersand, Question-Mark, and Forward-Slash symbols. 
299     *
300     * @see #toProperURLV3(String)
301     * @see #P1
302     * @see #URL_ESC_CHARS_ABBREV
303     * @see StrReplace#r(String, char[], IntCharFunction)
304     */
305    public static String toProperURLV4(String url)
306    {
307        String  beginsWith  = null;
308        Matcher m           = P1.matcher(url);
309
310        if (m.find())
311        {
312            beginsWith = m.group(1); 
313            url = url.substring(beginsWith.length());
314        }
315
316        return ((beginsWith != null) ? beginsWith : "") +
317            StrReplace.r
318                (url, URL_ESC_CHARS_ABBREV, (int i, char c) -> '%' + Integer.toHexString((int) c));
319    }
320
321    /**
322     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_PRP_URL_V5>
323     *
324     * @param url This is the URL to be encoded, properly
325     *
326     * @return A properly encoded URL String.  Important, if calling the {@code java.net.URL}
327     * constructor generates a {@code MalformedURLException}, then this method shall return.
328     * The {@code java.net.URL} constructor will be called if the {@code String} passed begins with
329     * the characters {@code 'http://'} or {@code 'https://'}.
330     */
331    public static String toProperURLV5(String url)
332    {
333        url = url.trim();
334
335        URL         u       = null;
336        String[]    sArr    = null;
337        String      tlc     = url.toLowerCase();
338
339        if (tlc.startsWith("http://") || tlc.startsWith("https://"))
340        { try { u = new URL(url); } catch (Exception e) { return null; } }
341
342        if (u == null)  sArr = url.split("/");
343        else            sArr = u.getPath().split("/");
344
345        String          slash   = "";
346        StringBuilder   sb      = new StringBuilder();
347
348        for (String s : sArr)
349        {
350            try
351                { sb.append(slash + java.net.URLEncoder.encode(s, "UTF-8")); }
352
353            catch (UnsupportedEncodingException e)
354                { /* This really cannot happen, and I don't know what to put here! */ }
355
356            slash = "/";
357        }
358
359        if (u == null)
360            return sb.toString();
361        else
362            return
363                u.getProtocol() + "://" + u.getHost() + sb.toString() +
364                ((u.getQuery() != null) ? ("?" + u.getQuery())  : "") +
365                ((u.getRef() != null)   ? ("#" + u.getRef())    : "");
366    }
367
368    /**
369     * Rather than trying to explain what is escaped and what is left alone, please review the
370     * exact code here.
371     *
372     * <BR /><BR /><B CLASS=JDDescLabel>Another One:</B>
373     * 
374     * <BR />Well, I just wrote another one, they told me to.  This, newest version of
375     * {@code URL}-Encoding is actually pretty successful.  It handles all Extra-Characters and is
376     * capable of dealing with {@code URL's} that contain the {@code '?'  '='  '&'} operators of
377     * {@code GET}-Requests.
378     *
379     * <BR /><BR />Realize that though the out-of-the-box JDK, there is a class called
380     * "URI Encoder" - but that class expects that the {@code URL} to have already been separated
381     * out into it's distinct parts.
382     * 
383     * <BR /><BR />This method does the the {@code URL}-Separating into disparate parts
384     * before performing the Character-Escaping.
385     *
386     * @param url This is any java {@code URL}.
387     *
388     * @return a new {@code String} version of the input parameter {@code 'url'}
389     */
390    public static String toProperURLV6(String url)
391    {
392        URL u = null;
393
394        try
395            { u = new URL(url); }
396
397        catch (Exception e) { return null; }
398
399        StringBuilder sb = new StringBuilder();
400
401        sb.append(u.getProtocol());
402        sb.append("://");
403        sb.append(u.getHost());
404        sb.append(toProperURLV5(u.getPath()));
405
406        if (u.getQuery() != null)
407        {
408            String[]            sArr        = u.getQuery().split("&");
409            StringBuilder       sb2         = new StringBuilder();
410            String              ampersand   = "";
411
412            for (String s : sArr)
413            {
414                String[]        s2Arr       = s.split("=");
415                StringBuilder   sb3         = new StringBuilder();    
416                String          equals      = "";
417
418                for (String s2: s2Arr)
419                {
420                    try
421                        { sb3.append(equals + java.net.URLEncoder.encode(s2, "UTF-8")); }
422
423                    // This should never happen - UTF-8 is (sort-of) the only encoding.
424                    catch (UnsupportedEncodingException e) { }
425
426                    equals = "=";
427                }
428
429                sb2.append(ampersand + sb3.toString());
430                ampersand = "&";
431            }
432
433            sb.append("?" + sb2.toString());
434        }
435
436        // Not really a clue, because a the "#" operator and the "?" probably shouldn't be used
437        // together.  Java's java.net.URL class will parse a URL that has both the ? and the #, but
438        // I have no idea which Web-Sites would allow this, or encourage this...
439
440        if (u.getRef() != null)
441
442            try
443                { sb.append("#" + java.net.URLEncoder.encode(u.getRef(), "UTF-8")); }
444
445            catch (UnsupportedEncodingException e) { }
446
447        return sb.toString();        
448    }
449
    /**
     * These strictly use Java's URI Encoding Mechanism.  They seem to work the same as "V6"
     * Internally, these are now used.  This as of November, 2019.
     *
     * <BR /><BR />This is a convenience overload: the {@code String} is converted to a
     * {@code java.net.URL}, and that {@code URL} is passed to {@link #toProperURLV8(URL)}.
     *
     * @param url A Complete Java {@code URL}, as a {@code String}.  Any specialized
     * Escape-Characters that need to be escaped, will be.
     *
     * @return The value returned by {@link #toProperURLV8(URL)} for this {@code URL}.
     *
     * @throws URISyntaxException This will throw if building the {@code URI} generates an
     * exception.  Internally, all this method does is build a {@code URI}, and then call the Java
     * Method {@code 'toASCIIString()'}
     *
     * @throws MalformedURLException If the {@code java.net.URL} constructor rejects
     * {@code 'url'}.
     */
    public static String toProperURLV7(String url) throws URISyntaxException, MalformedURLException
    { return toProperURLV8(new URL(url)); }
463
464    /**
465     * These strictly use Java's URI Encoding Mechanism.  They seem to work the same as "V6"
466     * Internally, these are now used.  This as of November, 2019.
467     *
468     * @param url A Complete Java {@code URL}.  Any specialized Escape-Characters that need to be
469     * escaped, will be.
470     *
471     * @throws URISyntaxException This will throw if building the URI generates an exception.
472     * Internally, all this method does is build a URI, and then call the Java Method
473     * {@code 'toASCIIString()'}
474     */
475    public static String toProperURLV8(URL url) throws URISyntaxException, MalformedURLException
476    {
477        return new URI(
478            url.getProtocol(),
479            url.getUserInfo(),
480            url.getHost(),
481            url.getPort(),
482            url.getPath(),
483            url.getQuery(),
484            url.getRef()
485        ).toASCIIString();
486    }
487
488
489    // ********************************************************************************************
490    // ********************************************************************************************
491    // The original "URLs" class
492    //*********************************************************************************************
493    // ********************************************************************************************
494
495
496    /**
497     * If you have a list of {@code URL's}, and want to quickly remove any
498     * duplicate-{@code URL's} found in the list - this will remove them.
499     *
500     * <BR /><BR /><B CLASS=JDDescLabel>Case Sensitivity:</B>
501     * 
502     * <BR />This method will perform a few "to-lower-case" operations on the protocol and
503     * Web-Domain parts, but not on the file, directory, or Query-String portion of the
504     * {@code URL}.
505     *
506     * <BR /><BR />This should hilite what is Case-Sensitive, and what is not:
507     * 
508     * <BR /><BR /><UL CLASS=JDUL>
509     * <LI> These are considered duplicate URL's:
510     *      <BR />
511     *      <BR /><CODE>http://some.company.com/index.html</CODE>
512     *      <BR /><CODE>HTTP://SOME.COMPANY.COM/index.html</CODE>
513     *      <BR /><BR />
514     *      </LI>
515     * 
516     * <LI> These are <I>not</I> considered duplicate URL's:
517     *      <BR />
518     *      <BR /><CODE>http://other.company.com/Directory/Ben-Bitdiddle.html</CODE>
519     *      <BR /><CODE>http://other.company.com/DIRECTORY/BE.html</CODE>
520     *      </LI>
521     * </UL>
522     *
523     * @param urls Any list of {@code URL's}, some of which might have been duplicated.  The
524     * difference between this {@code 'removeDuplicates'} and the other {@code 'removeDuplicates'}
525     * available in this class is that this one only removes multiple instances of the same 
526     * {@code URL} in this {@code Vector}, while the other one iterates through a list of 
527     * {@code URL's} already visited in a previous-session.
528     * 
529     * <BR /><BR /><B>NOTE:</B> <I>Null {@code Vector}-values are skipped outright, they are
530     * neither removed nor changed.</i>
531     *
532     * @return The number of {@code Vector} elements that were removed.  (i.e. <I>The size by which
533     * the {@code Vector} was shrunk.</I>)
534     */
535    public static int removeDuplicates(Vector<URL> urls)
536    {
537        TreeSet<String> dups    = new TreeSet<>();
538        int             count   = 0;
539        int             size    = urls.size();
540        URL             url     = null;
541
542        for (int i=0; i < size; i++)
543
544            if ((url = urls.elementAt(i)) != null)
545                if (! dups.add(urlToString(url)))
546                {
547                    count++;
548                    size--;
549                    i--;
550                    urls.removeElementAt(i);
551                }
552
553        return count;
554    }
555
556    /**
557     * This simple method will remove any {@code URL's} from the input {@code Vector} parameter
558     * {@code 'potentiallyNewURLs'} which are also present-members of the input {@code Vector} 
559     * parameter {@code 'visitedURLs'}.
560     * 
561     * <BR /><BR />This may seem trivial, and it is, but it worries about things like the
562     * {@code String's} Case for you.
563     *
564     * @param visitedURLs This parameter is a list of {@code URL's} that have already
565     * "been visited."
566     *
567     * @param potentiallyNewURLs This parameter is a list of {@code URL's} that are possibly
568     * "un-visited" - meaning whatever scrape, crawl or search being performed needs to know which
569     * {@code URL's} are listed in the previous parameter's contents.  This may seem trivial, just
570     * use the java {@code url1.equals(url2)} command, but, alas, java doesn't exactly take into
571     * account upper-case and lower-case domain-names.  This worries about case.
572     *
573     * @return The number of {@code URL's} that were removed from the input {@code Vector}
574     * parameter {@code 'potentiallyNewURLs'}.
575     */
576    public static int removeDuplicates(Vector<URL> visitedURLs, Vector<URL> potentiallyNewURLs)
577    {
578        // The easiest way to check for duplicates is to build a tree-set of all the URL's as a
579        // String.  Java's TreeSet<> generic already (automatically) scans for duplicates
580        // (efficiently) and will tell you if you have tried to add a duplicate
581
582        TreeSet<String> dups = new TreeSet<>();
583
584        // Build a TreeSet of the url's from the "Visited URLs" parameter
585        visitedURLs.forEach(url -> dups.add(urlToString(url)));
586
587        // Add the "Possibly New URLs", one-by-one, and remove them if they are already in the
588        // visited list.
589
590        int count   = 0;
591        int size    = potentiallyNewURLs.size();
592        URL url     = null;
593
594        for (int i=0; i < size; i++)
595
596            if ((url = potentiallyNewURLs.elementAt(i)) != null)
597
598                if (! dups.add(urlToString(url)))
599                {
600                    count++;
601                    size--;
602                    i--;
603                    potentiallyNewURLs.removeElementAt(i);
604                }
605
606        return count;
607    }
608
609    /**
610     * Removes any Fragment-{@code URL} {@code '#'} symbols from a {@code URL}.
611     * 
612     * <BR /><BR />If this {@code URL} contains a pound-sign Anchor-Name according to the Standard
613     * JDK's {@code URL.getRef()} method.  Specifically, if {@code URL.getRef()} returns a non-null
614     * value, this method rebuilds the URL, without any Anchor-Name / Fragment information.
615     * 
616     * <BR /><BR />The intention is to return a {@code URL} where any / all {@code String}-data 
617     * that occurs after a {@code '#'} Hash-Tab / Pound-Sign is removed.
618     * 
619     * @param url Any standard HTTP {@code URL}.  If this {@code 'url'} contains a {@code '#'}
620     * (Pound Sign, Partial Reference) - according to the standard JDK {@code URL.getRef()} method,
621     * then it shall be removed.
622     * 
623     * @return The {@code URL} without the partial-reference, or the original {@code URL} if there
624     * was no partial reference.  Null is returned if there is an error instantiating the new
625     * {@code URL} without the partial-reference.
626     */
627    public static URL shortenPoundREF(URL url)
628    {
629        try
630        {
631            if (url.getRef() != null) return new URL(
632                ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
633                    "://" +
634                ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
635                ((url.getFile()     != null) ? url.getFile()                    : "")
636            );
637
638            else return url;
639        }
640
641        catch (MalformedURLException e) { return null; }
642    }
643
644    /**
645     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
646     *
647     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.
648     *
649     * @param ifExceptionSetNull If this parameter is passed {@code TRUE}, if there is ever an
650     * exception-throw while building the new {@code URL's} (without the fragment / pound-sign),
651     * then that position in the {@code Vector} will be replaced with a null.
652     * 
653     * <BR /><BR />When this parameter is passed {@code FALSE}, if an exception is thrown, then
654     * it will be caught and silently ignored.
655     *
656     * @return The number / count of {@code URL's} in this list that were modified.  Whenever a
657     * {@code URL} Named-Anchor is encountered, it will be removed from the {@code URL}, and a
658     * new {@code URL} without the fragment-part will be inserted to replace the old one.
659     * 
660     * <BR /><BR />The integer that is returned here is the number of times that a replacement
661     * was made to the input {@code Vector}-parameter {@code 'urls'}.
662     */
663    public static int shortenPoundREFs(Vector<URL> urls, boolean ifExceptionSetNull)
664    {
665        int pos             = 0;
666        int shortenCount    = 0;
667
668        for (int i = (urls.size() - 1); i >= 0; i--)
669        {
670            URL url = urls.elementAt(i);
671
672            try
673            {
674                if (url.getRef() != null)
675                {
676                    URL newURL = new URL(
677                        ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
678                            "://" +
679                        ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
680                        ((url.getFile()     != null) ? url.getFile()                    : "")
681                    );
682
683                    urls.setElementAt(newURL, i);
684                    shortenCount++;
685                }
686            }
687
688            catch (MalformedURLException e)
689                { if (ifExceptionSetNull) urls.setElementAt(null, i); }
690        }
691
692        return shortenCount;
693    }
694
695    /**
696     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
697     *
698     * <BR /><BR /><B CLASS=JDDescLabel>KE: Keep Exceptions</B>
699     *
700     * <BR />This method is identical to the previous method, defined above, except that it
701     * allows a programmer to keep / retain any {@code MalformedURLException's} that are thrown
702     * while re-building them.
703     *
704     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.
705     *
706     * @param ifExceptionSetNull If this is {@code TRUE} then if there is ever an exception building
707     * a new {@code URL} without a "Relative {@code URL '#'}" (Pound-Sign), then that position in
708     * the {@code Vector} will be replaced with 'null.'
709     *
710     * @return The number/count of {@code URL's} in this list that were modified.  If a {@code URL}
711     * was modified, it was because it had a partial-page reference in it.  If in the process of
712     * generating a new {@code URL} out of an old one, a {@code MalformedURLException} occurs, the
713     * exception will be placed in the {@code Ret2.b} position, which is a 
714     * {@code Vector<MalformedURLException>}.
715     *
716     * <BR /><BR /><B>SPECIFICALLY:</B>
717     *
718     * <BR /><BR /><UL CLASS=JDUL>
719     * 
720     * <LI> {@code Ret2.a = 'Integer'} number of {@code URL's} shortened for having a {@code '#'}
721     *      partial-reference.
722     *      </LI>
723     * 
724     * <LI> {@code Ret2.b = Vector<MalformedURLException>} where each element of this
725     *      {@code Vector} is null if there were no problems converting the {@code URL}, or the
726     *      exception reference if there were exceptions thrown.
727     *      </LI>
728     * 
729     * </UL>
730     */
731    public static Ret2<Integer, Vector<MalformedURLException>> shortenPoundREFs_KE
732        (Vector<URL> urls, boolean ifExceptionSetNull)
733    {
734        int                             pos             = 0;
735        int                             shortenCount    = 0;
736        Vector<MalformedURLException>   v               = new Vector<>();
737
738        for (int i=0; i < urls.size(); i++) v.setElementAt(null, i);
739
740        for (int i = (urls.size() - 1); i >= 0; i--)
741        {
742            URL url = urls.elementAt(i);
743 
744            try
745            {
746                if (url.getRef() != null)
747                {
748                    URL newURL = new URL(
749                        ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
750                            "://" +
751                        ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
752                        ((url.getFile()     != null) ? url.getFile()                    : "")
753                    );
754
755                    urls.setElementAt(newURL, i);
756                    shortenCount++;
757                }
758            }
759
760            catch (MalformedURLException e)
761            {
762                if (ifExceptionSetNull) urls.setElementAt(null, i);
763                v.setElementAt(e, i);
764            }
765        }
766
767        return new Ret2<Integer, Vector<MalformedURLException>>(Integer.valueOf(shortenCount), v);
768    }
769
770    /**
771     * On the internet, a {@code URL} is part case-sensitive, and part case-insensitive.  The
772     * Domain-Name and Protocol ({@code http://}, and {@code 'some.company.com'}) portions of the
773     * {@code URL} <I>are Case-Insensitive - they may be in any combination of upper or lower
774     * case</I>.
775     *
776     * <BR /><BR />However, the directory, file-name, and (optional) Query-{@code String} portion
777     * of a {@code URL} are (often, but not always) Case-Sensitive.  The sensitivity to case in
778     * these three parts of a {@code URL} is dependent upon the individual Web-Server that is 
779     * providing the content for the {@code URL}.
780     *
781     * <BR /><BR />To summarize, DNS servers which monitor the Domain-Name part of a {@code URL}
782     * treat upper &amp; lower case English-Letters as the same.  Web-Server that utilize the File
783     * Directory part of a {@code URL} will sometimes care about case, and sometimes won't.  This
784     * behavior is dependent upon how the Web-Master has configured his system.
785     *
786     * @param url This may be any Internet-Domain {@code URL}
787     *
788     * @return A {@code String} version of this {@code URL}, but the domain and protocol portions
789     * of the {@code URL} will be a "consistent" lower case.  The case of the directory, file and
790     * (possibly, but not guaranteed to be present) {@code query-string} portion will not have
791     * their case modified either way.
792     *
793     * <BR /><BR /><B>NOTE:</B> This type of information is pretty important is you are attempting
794     * to scan for duplicate {@code URL's} or check their equality.
795     */
796    public static String urlToString(URL url)
797    {
798        return
799            ((url.getProtocol() != null)    ? url.getProtocol().toLowerCase()   : "") + "://" +
800            ((url.getHost()     != null)    ? url.getHost().toLowerCase()       : "") +
801            ((url.getPath()     != null)    ? url.getPath()                     : "") +
802            ((url.getQuery()    != null)    ? ('?' + url.getQuery())            : "") +
803            ((url.getRef()      != null)    ? ('#' + url.getRef())              : "");
804    }
805
806    /**
807     * As of today, the version of UNIX {@code curl} command does not seem to be downloading
808     * everything properly.  It downloaded an image {@code '.png'} file just fine, but seemed to
809     * have botched a zip-file.  This does what UNIX {@code 'curl'} command, <I>but does not
810     * actually invoke the UNIX operating system to do it.</I>  It just does this...
811     *
812     * @param url This may be any URL, but it is intended to be a downloadable file.  It will
813     * download {@code '.html'} files fine, but you may try images, data-files, zip-files,
814     * tar-archives, and movies.
815     *
816     * @param outFileName You must specify a file-name, and if this parameter is null, a
817     * {@code NullPointerException} will be thrown immediately.  If you would like your program
818     * to guess the filename - <I>based on the file named in the URL</I>, please use the method
819     * {@code URL.getFile()}, or something to that effect.
820     * 
821     * @param userAgent A User-Agent, as a {@code String}.  If this parameter is passed null,
822     * it will be silently ignored, and a User-Agent won't be used.
823     * 
824     * @throws IOException If there are I/O Errors when using the {@code HttpURLConnection}.
825     */
826    public static void CURL(URL url, String outFileName, String userAgent) throws IOException
827    {   
828        HttpURLConnection con = (HttpURLConnection) url.openConnection();
829
830        con.setRequestMethod("GET");
831
832        if (userAgent != null) con.setRequestProperty("User-Agent", userAgent);
833
834        InputStream         is      = con.getInputStream();
835        FileOutputStream    fos     = new FileOutputStream(outFileName);
836        byte[]              b       = new byte[5000];
837        int                 result  = 0;
838
839        while ((result = is.read(b)) != -1) fos.write(b, 0, result);
840 
841        fos.flush();    fos.close();    is.close();
842    }
843}