001package Torello.Java;
002
003import java.util.*;
004import java.net.*;
005import java.util.regex.*;
006import java.io.*;
007
008import static Torello.Java.C.*;
009
010import Torello.Java.Additional.Ret2;
011
012/**
013 * A class that plays-with URL's, no more, no less.
014 * 
015 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS>
016 */
017@Torello.JavaDoc.StaticFunctional
018public class URLs
019{
020    private URLs() { }
021
022    /**
023     * This is a Regular-Expression Pattern {@code (java.util.regex.Pattern)} - saved as a 
024     * {@code String}.  It is subsequently compiled.
025     *
026     * <BR /><BR />The primary function is to match {@code String's} that are intended to match
027     * HTTP-{@code URL's}.  This Regular Expression matches:
028     * 
029     * <BR /><BR /><UL CLASS=JDUL>
030     * <LI>{@code http(s)://...<any-text>.../}</LI>
031     * <LI>{@code http(s)://...<any-text, not front-slash>...}</LI>
032     * <LI>{@code http(s)://...<any-text>.../...<any-text, not front-slash>...}</LI>
033     * </UL>
034     * 
035     * <BR /><BR /><B CLASS=JDDescLabel>Primarily used in:</B>
036     * 
037     * <BR /><UL CLASS=JDUL>
038     * <LI>{@link #toProperURLV3(String)}</LI>
039     * <LI>{@link #toProperURLV4(String)}</LI>
040     * </UL>
041     * 
042     * @see #P1
043     */
044    protected static final String RE1 =
045         "^(http[s]?:\\/\\/.*?\\/$|http[s]?:\\/\\/[^\\/]*$|http[s]?:\\/\\/.*?\\/[^\\/]+)";
046
047    /**
048     * {@code P1 = Pattern.compile(RE1);}
049     * 
050     * @see #RE1
051     */
052    protected static final Pattern P1 = Pattern.compile(RE1);
053
054    /**
055     * Java Help Message Explaining {@code class java.net.URL} - and the specific output of its
056     * methods.
057     *
058     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_HELP_MSG>
059     *
060     * @param sw An instance of class StorageWriter.  This parameter may be null, and if it is
061     * text-output will be sent to Standard-Output.
062     */
063    protected static final void javaURLHelpMessage(StorageWriter sw)
064    {
065        if (sw == null) sw = new StorageWriter();
066
067        String[] urlStrArr = {
068            "https://DALLASCITYHALL.com", "https://dallascityhall.com/",
069            "https://dallascityhall.com/news",
070            "https://dallascityhall.com/news/", "http://DALLASCITYHALL.com/news/ARTICLE-1.html",
071            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue",
072            "https://DallasCityHall.com/news/ARTICLE-1.html#subpart1",
073            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue",
074            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue#LocalRef"
075        };
076
077        URL[] urlArr = new URL[urlStrArr.length];
078
079        try
080            { for (int i=0; i < urlStrArr.length; i++) urlArr[i] = new URL(urlStrArr[i]); }
081
082        catch (Exception e)
083        {
084            sw.println(
085                "Broke a URL, and it generated an exception.\n" +
086                "Sorry, fix the URL's in this method.\n" + 
087                "Did you change them?"
088            );
089
090            e.printStackTrace();
091            return;
092        }
093
094        for (URL u : urlArr)
095        {
096            System.out.println(
097                "u.toString():\t\t"     + BCYAN + u.toString() + RESET + '\n' +
098                "u.getProtocol():\t"    + u.getProtocol() + '\n' +
099                "u.getHost():\t\t"      + u.getHost() + '\n' +
100                "u.getPath():\t\t"      + u.getPath() + '\n' +
101                "u.getFile():\t\t"      + u.getFile() + '\n' +
102                "u.getQuery():\t\t"     + u.getQuery() + '\n' +
103                "u.getRef():\t\t"       + u.getRef() + '\n' +
104                "u.getAuthority():\t"   + u.getAuthority() + '\n' +
105                "u.getUserInfo():\t"    + u.getUserInfo() + '\n' +
106                "urlToString(u):\t\t"   + urlToString(u)
107            );
108        }
109    }
110
111
112    // ********************************************************************************************
113    // ********************************************************************************************
114    // Helper function for making URL address readable by web-servers.
115    //*********************************************************************************************
116    // ********************************************************************************************
117
118
119    /**
120     * When scraping Spanish {@code URL's}, these characters can / should be escaped.
121     * 
122     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
123     * 
124     * <BR />This array shall be considered parallel to the <B><I>replacement</I></B>
125     * {@code String[]}-Array {@link #VOWELS_URL}.
126     * 
127     * @see #toProperURLV1(String)
128     * @see #VOWELS_URL
129     */
130    protected static final char[] VOWELS = {
131        'á', 'É', 'é', 'Í', 'í', 'Ó', 'ó', 'Ú', 'ú', 'Ü', 'ü',
132        'Ñ', 'ñ', 'Ý', 'ý', '¿', '¡'
133    };
134
135    /**
136     * When scraping Spanish {@code URL's}, these {@code String's} are the
137     * <B>URL Escape Sequences</B> for the Spanish Vowel Characters listed in {@link #VOWELS}.
138     * 
139     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
140     * 
141     * <BR />This array shall be considered parallel to {@code String[]}-Array {@link #VOWELS}.
142     * 
143     * @see #toProperURLV1(String)
144     * @see #VOWELS
145     */
146    protected static final String[] VOWELS_URL =
147    {
148        "%C3%A1", "%C3%89", "%C3%A9", "%C3%8D", "%C3%AD", "%C3%93", "%C3%B3", "%C3%9A",
149        "%C3%BA", "%C3%9C", "%C3%BC", "%C3%91", "%C3%B1", "%C3%9D", "%C3%BD", "%C2%BF",
150        "%C2%A1"
151    };
152
153    /**
154     * This will substitute many of the Spanish-characters that can make a web-query difficult.
155     * These are the substitutions listed:
156     *
157     * <BR /><BR /><TABLE CLASS=JDBriefTable>
158     * <TR><TH>Spanish Language Character</TH><TH>URL Escape Sequence</TH></TR>
159     * <TR><TD>{@code Á}</TD><TD>{@code %C3%81}</TD></TR>
160     * <TR><TD>{@code á}</TD><TD>{@code %C3%A1}</TD></TR>
161     * <TR><TD>{@code É}</TD><TD>{@code %C3%89}</TD></TR>
162     * <TR><TD>{@code é}</TD><TD>{@code %C3%A9}</TD></TR>
163     * <TR><TD>{@code Í}</TD><TD>{@code %C3%8D}</TD></TR>
164     * <TR><TD>{@code í}</TD><TD>{@code %C3%AD}</TD></TR>
165     * <TR><TD>{@code Ó}</TD><TD>{@code %C3%93}</TD></TR>
166     * <TR><TD>{@code ó}</TD><TD>{@code %C3%B3}</TD></TR>
167     * <TR><TD>{@code Ú}</TD><TD>{@code %C3%9A}</TD></TR>
168     * <TR><TD>{@code ú}</TD><TD>{@code %C3%BA}</TD></TR>
169     * <TR><TD>{@code Ü}</TD><TD>{@code %C3%9C}</TD></TR>
170     * <TR><TD>{@code ü}</TD><TD>{@code %C3%BC}</TD></TR>
171     * <TR><TD>{@code Ñ}</TD><TD>{@code %C3%91}</TD></TR>
172     * <TR><TD>{@code ñ}</TD><TD>{@code %C3%B1}</TD></TR>
173     * <TR><TD>{@code Ý}</TD><TD>{@code %C3%9D}</TD></TR>
174     * <TR><TD>{@code ý}</TD><TD>{@code %C3%BD}</TD></TR>
175     * </TABLE>
176     *
177     * <BR /><BR /><B CLASS=JDDescLabel>Historical Note:</B>
178     * 
179     * <BR />This method was written the very first time that a {@code URL} needed to be escaped
180     * during the writing of the Java-HTML {@code '.jar'}.
181     *
182     * @param url  Any website {@code URL} query.
183     *
184     * @return The same {@code URL} with substitutions made.
185     * 
186     * @see #VOWELS
187     * @see #VOWELS_URL
188     * @see StrReplace#r(String, char[], String[])
189     */
190    public static String toProperURLV1(String url)
191    { return StrReplace.r(url, VOWELS, VOWELS_URL); }
192
193    /**
194     * This list of java {@code char's} are characters that are better off escaped when passing
195     * them through a {@code URL}.
196     * 
197     * @see #toProperURLV2(String)
198     */
199    protected static final char[] URL_ESC_CHARS =
200    {
201        '%', ' ', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\',
202        ']', '^', '{', '|', '}', '~', '\'', '+', ','
203    };
204
205    /**
206     * This method will clobber the leading Domain-Name and Protocol -
207     * {@code http://domain.name.something/} stuff.  It is best to use this method on
208     * {@code String's} that will be inserted into a {@code URL} after the {@code '?'}
209     * question-mark, inside the Query-String.
210     * 
211     * <BR /><BR />This can be very useful when sending JSON Arguments, for instance, inside a
212     * {@code URL's} Query-String, instead of the GET / POST part of a request.
213     * 
214     * <BR /><BR />Note that this method should not be used to escape characters outside of the
215     * range of Standard-ASCII (characters {@code 0 ... 255}).
216     *
217     * <BR /><BR /><B CLASS=JDDescLabel>State of the Experiment:</B>
218     * 
219     * <BR />It seems to help to escape these characters:
220     * 
221     * <BR /><B STYLE="color:red;">{@code # $ % & @ ` / : ; < = > ? [ \ ] ^ | ~ " ' + ,}
222     * <CODE> { } </CODE></B>
223     * 
224     * @param urlStuff Any information that is intended to be sent via an HTTP-{@code URL}, and
225     * needs to be escaped.
226     *
227     * @return An escaped version of this {@code URL-String}
228     * 
229     * @see #URL_ESC_CHARS
230     * @see StrReplace#r(String, char[], IntCharFunction)
231     */
232    public static String toProperURLV2(String urlStuff)
233    {
234        return StrReplace.r(
235            urlStuff, URL_ESC_CHARS,
236            (int i, char c) -> '%' + Integer.toHexString((int) c)
237        );
238    }
239
240    /**
241     * This leaves out the actual domain name before starting HTTP-URL Escape Sequences.  If this
242     * starts with the words "http://domain.something/" then the initial colon, forward-slash and
243     * periods won't be escaped.  Everything after the first front-slash will include URL-HTTP
244     * Escape characters.
245     *
246     * <BR /><BR />This does the same thing as {@code toProperURLV2(String)}, but skips the initial
247     * part of the URL text/string - IF PRESENT!
248     * 
249     * <BR /><BR />{@code http(s?)://domain.something/} is skipped by the Regular Expression, 
250     * everything else from {@code URLV2} is escaped.
251     *
252     * @param url This may be any internet {@code URL}, represented as a {@code String}.  It will
253     * be escaped with the {@code %INT} format.
254     *
255     * @return An escaped {@code URL String}
256     *
257     * @see #toProperURLV2(String)
258     * @see #P1
259     */
260    public static String toProperURLV3(String url)
261    {
262        String  beginsWith  = null;
263        Matcher m           = P1.matcher(url);
264
265        if (m.find())
266        {
267            beginsWith = m.group(1); 
268            url = url.substring(beginsWith.length());
269        }
270
271        return ((beginsWith != null) ? beginsWith : "") + toProperURLV2(url);
272    }
273
274    /**
275     * This is a (shortened) list of characters that <I>should</I> be escaped before being used
276     * within a {@code URL}.
277     * 
278     * <BR /><BR />This version differs from {@link #URL_ESC_CHARS} in that it does not include the
279     * {@code '&'} (ampersand), the {@code '?'} (question-mark) or the {@code '/'} (forward-slash).
280     * 
281     * @see #URL_ESC_CHARS
282     * @see #toProperURLV4(String)
283     */
284    protected static final char[] URL_ESC_CHARS_ABBREV =
285    {
286        '%', ' ', '#', '$', '@', '`', ':', ';', '<', '=', '>', '[', '\\', ']',
287        '^', '{', '|', '}', '~', '\'', '+', ','
288    };
289
290    /**
291     * This does the same thing as V3, but it also will avoid escaping any {@code '?'} 
292     * (question-mark) or {@code '&'} (ampersand) or {@code '/'} (forward-slash) symbols anywhere
293     * in the entire {@code String}.  It also "skips" escaping the initial
294     * {@code HTTP(s)://domain.net.something/} as well - just like {@code toProperURLV3}
295     *
296     * @return This does the same thing as {@code toProperURLV3(String)}, but leaves out 100%
297     * of the instances of Ampersand, Question-Mark, and Forward-Slash symbols. 
298     *
299     * @see #toProperURLV3(String)
300     * @see #P1
301     * @see #URL_ESC_CHARS_ABBREV
302     * @see StrReplace#r(String, char[], IntCharFunction)
303     */
304    public static String toProperURLV4(String url)
305    {
306        String  beginsWith  = null;
307        Matcher m           = P1.matcher(url);
308
309        if (m.find())
310        {
311            beginsWith = m.group(1); 
312            url = url.substring(beginsWith.length());
313        }
314
315        return ((beginsWith != null) ? beginsWith : "") +
316            StrReplace.r
317                (url, URL_ESC_CHARS_ABBREV, (int i, char c) -> '%' + Integer.toHexString((int) c));
318    }
319
320    /**
321     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_PRP_URL_V5>
322     *
323     * @param url This is the URL to be encoded, properly
324     *
325     * @return A properly encoded URL String.  Important, if calling the {@code java.net.URL}
326     * constructor generates a {@code MalformedURLException}, then this method shall return
327     * {@code null}.  The {@code java.net.URL} constructor will be called if the {@code String}
328     * passed begins with the characters {@code 'http://'} or {@code 'https://'}.
329     */
330    public static String toProperURLV5(String url)
331    {
332        url = url.trim();
333
334        URL         u       = null;
335        String[]    sArr    = null;
336        String      tlc     = url.toLowerCase();
337
338        if (tlc.startsWith("http://") || tlc.startsWith("https://"))
339        { try { u = new URL(url); } catch (Exception e) { return null; } }
340
341        if (u == null)  sArr = url.split("/");
342        else            sArr = u.getPath().split("/");
343
344        String          slash   = "";
345        StringBuilder   sb      = new StringBuilder();
346
347        for (String s : sArr)
348        {
349            try
350                { sb.append(slash + java.net.URLEncoder.encode(s, "UTF-8")); }
351
352            catch (UnsupportedEncodingException e)
353                { /* This really cannot happen, and I don't know what to put here! */ }
354
355            slash = "/";
356        }
357
358        if (u == null)
359            return sb.toString();
360        else
361            return
362                u.getProtocol() + "://" + u.getHost() + sb.toString() +
363                ((u.getQuery() != null) ? ("?" + u.getQuery())  : "") +
364                ((u.getRef() != null)   ? ("#" + u.getRef())    : "");
365    }
366
367    /**
368     * Rather than trying to explain what is escaped and what is left alone, please review the
369     * exact code here.
370     *
371     * <BR /><BR /><B CLASS=JDDescLabel>Another One:</B>
372     * 
373     * <BR />Well, I just wrote another one, they told me to.  This, newest version of
374     * {@code URL}-Encoding is actually pretty successful.  It handles all Extra-Characters and is
375     * capable of dealing with {@code URL's} that contain the {@code '?'  '='  '&'} operators of
376     * {@code GET}-Requests.
377     *
378     * <BR /><BR />Realize that, in the out-of-the-box JDK, there is a class called
379     * "URI Encoder" - but that class expects the {@code URL} to have already been separated
380     * out into its distinct parts.
381     * 
382     * <BR /><BR />This method does the {@code URL}-Separating into disparate parts
383     * before performing the Character-Escaping.
384     *
385     * @param url This is any java {@code URL}.
386     *
387     * @return a new {@code String} version of the input parameter {@code 'url'}
388     */
389    public static String toProperURLV6(String url)
390    {
391        URL u = null;
392
393        try
394            { u = new URL(url); }
395
396        catch (Exception e) { return null; }
397
398        StringBuilder sb = new StringBuilder();
399
400        sb.append(u.getProtocol());
401        sb.append("://");
402        sb.append(u.getHost());
403        sb.append(toProperURLV5(u.getPath()));
404
405        if (u.getQuery() != null)
406        {
407            String[]            sArr        = u.getQuery().split("&");
408            StringBuilder       sb2         = new StringBuilder();
409            String              ampersand   = "";
410
411            for (String s : sArr)
412            {
413                String[]        s2Arr       = s.split("=");
414                StringBuilder   sb3         = new StringBuilder();    
415                String          equals      = "";
416
417                for (String s2: s2Arr)
418                {
419                    try
420                        { sb3.append(equals + java.net.URLEncoder.encode(s2, "UTF-8")); }
421
422                    // This should never happen - UTF-8 is (sort-of) the only encoding.
423                    catch (UnsupportedEncodingException e) { }
424
425                    equals = "=";
426                }
427
428                sb2.append(ampersand + sb3.toString());
429                ampersand = "&";
430            }
431
432            sb.append("?" + sb2.toString());
433        }
434
435        // Not really a clue, because a the "#" operator and the "?" probably shouldn't be used
436        // together.  Java's java.net.URL class will parse a URL that has both the ? and the #, but
437        // I have no idea which Web-Sites would allow this, or encourage this...
438
439        if (u.getRef() != null)
440
441            try
442                { sb.append("#" + java.net.URLEncoder.encode(u.getRef(), "UTF-8")); }
443
444            catch (UnsupportedEncodingException e) { }
445
446        return sb.toString();        
447    }
448
449    /**
450     * These strictly use Java's URI Encoding Mechanism.  They seem to work the same as "V6"
451     * Internally, these are now used.  This as of November, 2019.
452     *
453     * @param url A Complete Java {@code URL}, as a {@code String}.  Any specialized
454     * Escape-Characters that need to be escaped, will be.
455     *
456     * @throws URISyntaxException This will throw if building the {@code URI} generates an
457     * exception.  Internally, all this method does is build a {@code URI}, and then call the Java
458     * Method {@code 'toASCIIString()'}
459     */
460    public static String toProperURLV7(String url) throws URISyntaxException, MalformedURLException
461    { return toProperURLV8(new URL(url)); }
462
463    /**
464     * These strictly use Java's URI Encoding Mechanism.  They seem to work the same as "V6"
465     * Internally, these are now used.  This as of November, 2019.
466     *
467     * @param url A Complete Java {@code URL}.  Any specialized Escape-Characters that need to be
468     * escaped, will be.
469     *
470     * @throws URISyntaxException This will throw if building the URI generates an exception.
471     * Internally, all this method does is build a URI, and then call the Java Method
472     * {@code 'toASCIIString()'}
473     */
474    public static String toProperURLV8(URL url) throws URISyntaxException, MalformedURLException
475    {
476        return new URI(
477            url.getProtocol(),
478            url.getUserInfo(),
479            url.getHost(),
480            url.getPort(),
481            url.getPath(),
482            url.getQuery(),
483            url.getRef()
484        ).toASCIIString();
485    }
486
487
488    // ********************************************************************************************
489    // ********************************************************************************************
490    // The original "URLs" class
491    //*********************************************************************************************
492    // ********************************************************************************************
493
494
495    /**
496     * If you have a list of {@code URL's}, and want to quickly remove any
497     * duplicate-{@code URL's} found in the list - this will remove them.
498     *
499     * <BR /><BR /><B CLASS=JDDescLabel>Case Sensitivity:</B>
500     * 
501     * <BR />This method will perform a few "to-lower-case" operations on the protocol and
502     * Web-Domain parts, but not on the file, directory, or Query-String portion of the
503     * {@code URL}.
504     *
505     * <BR /><BR />This should highlight what is Case-Sensitive, and what is not:
506     * 
507     * <BR /><BR /><UL CLASS=JDUL>
508     * <LI> These are considered duplicate URL's:
509     *      <BR />
510     *      <BR /><CODE>http://some.company.com/index.html</CODE>
511     *      <BR /><CODE>HTTP://SOME.COMPANY.COM/index.html</CODE>
512     *      <BR /><BR />
513     *      </LI>
514     * 
515     * <LI> These are <I>not</I> considered duplicate URL's:
516     *      <BR />
517     *      <BR /><CODE>http://other.company.com/Directory/Ben-Bitdiddle.html</CODE>
518     *      <BR /><CODE>http://other.company.com/DIRECTORY/BE.html</CODE>
519     *      </LI>
520     * </UL>
521     *
522     * @param urls Any list of {@code URL's}, some of which might have been duplicated.  The
523     * difference between this {@code 'removeDuplicates'} and the other {@code 'removeDuplicates'}
524     * available in this class is that this one only removes multiple instances of the same 
525     * {@code URL} in this {@code Vector}, while the other one iterates through a list of 
526     * {@code URL's} already visited in a previous-session.
527     * 
528     * <BR /><BR /><B>NOTE:</B> <I>Null {@code Vector}-values are skipped outright, they are
529     * neither removed nor changed.</i>
530     *
531     * @return The number of {@code Vector} elements that were removed.  (i.e. <I>The size by which
532     * the {@code Vector} was shrunk.</I>)
533     */
534    public static int removeDuplicates(Vector<URL> urls)
535    {
536        TreeSet<String> dups    = new TreeSet<>();
537        int             count   = 0;
538        int             size    = urls.size();
539        URL             url     = null;
540
541        for (int i=0; i < size; i++)
542
543            if ((url = urls.elementAt(i)) != null)
544                if (! dups.add(urlToString(url)))
545                {
546                    count++;
547                    size--;
548                    i--;
549                    urls.removeElementAt(i);
550                }
551
552        return count;
553    }
554
555    /**
556     * This simple method will remove any {@code URL's} from the input {@code Vector} parameter
557     * {@code 'potentiallyNewURLs'} which are also present-members of the input {@code Vector} 
558     * parameter {@code 'visitedURLs'}.
559     * 
560     * <BR /><BR />This may seem trivial, and it is, but it worries about things like the
561     * {@code String's} Case for you.
562     *
563     * @param visitedURLs This parameter is a list of {@code URL's} that have already
564     * "been visited."
565     *
566     * @param potentiallyNewURLs This parameter is a list of {@code URL's} that are possibly
567     * "un-visited" - meaning whatever scrape, crawl or search being performed needs to know which
568     * {@code URL's} are listed in the previous parameter's contents.  This may seem trivial, just
569     * use the java {@code url1.equals(url2)} command, but, alas, java doesn't exactly take into
570     * account upper-case and lower-case domain-names.  This worries about case.
571     *
572     * @return The number of {@code URL's} that were removed from the input {@code Vector}
573     * parameter {@code 'potentiallyNewURLs'}.
574     */
575    public static int removeDuplicates(Vector<URL> visitedURLs, Vector<URL> potentiallyNewURLs)
576    {
577        // The easiest way to check for duplicates is to build a tree-set of all the URL's as a
578        // String.  Java's TreeSet<> generic already (automatically) scans for duplicates
579        // (efficiently) and will tell you if you have tried to add a duplicate
580
581        TreeSet<String> dups = new TreeSet<>();
582
583        // Build a TreeSet of the url's from the "Visited URLs" parameter
584        visitedURLs.forEach(url -> dups.add(urlToString(url)));
585
586        // Add the "Possibly New URLs", one-by-one, and remove them if they are already in the
587        // visited list.
588
589        int count   = 0;
590        int size    = potentiallyNewURLs.size();
591        URL url     = null;
592
593        for (int i=0; i < size; i++)
594
595            if ((url = potentiallyNewURLs.elementAt(i)) != null)
596
597                if (! dups.add(urlToString(url)))
598                {
599                    count++;
600                    size--;
601                    i--;
602                    potentiallyNewURLs.removeElementAt(i);
603                }
604
605        return count;
606    }
607
608    /**
609     * Removes any Fragment-{@code URL} {@code '#'} symbols from a {@code URL}.
610     * 
611     * <BR /><BR />A {@code URL} is considered to contain a pound-sign Anchor-Name according to the
612     * Standard JDK's {@code URL.getRef()} method.  Specifically, if {@code URL.getRef()} returns a
613     * non-null value, this method rebuilds the URL, without any Anchor-Name / Fragment information.
614     * 
615     * <BR /><BR />The intention is to return a {@code URL} where any / all {@code String}-data 
616     * that occurs after a {@code '#'} Hash-Tag / Pound-Sign is removed.
617     * 
618     * @param url Any standard HTTP {@code URL}.  If this {@code 'url'} contains a {@code '#'}
619     * (Pound Sign, Partial Reference) - according to the standard JDK {@code URL.getRef()} method,
620     * then it shall be removed.
621     * 
622     * @return The {@code URL} without the partial-reference, or the original {@code URL} if there
623     * was no partial reference.  Null is returned if there is an error instantiating the new
624     * {@code URL} without the partial-reference.
625     */
626    public static URL shortenPoundREF(URL url)
627    {
628        try
629        {
630            if (url.getRef() != null) return new URL(
631                ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
632                    "://" +
633                ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
634                ((url.getFile()     != null) ? url.getFile()                    : "")
635            );
636
637            else return url;
638        }
639
640        catch (MalformedURLException e) { return null; }
641    }
642
643    /**
644     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
645     *
646     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.
647     *
648     * @param ifExceptionSetNull If this parameter is passed {@code TRUE}, if there is ever an
649     * exception-throw while building the new {@code URL's} (without the fragment / pound-sign),
650     * then that position in the {@code Vector} will be replaced with a null.
651     * 
652     * <BR /><BR />When this parameter is passed {@code FALSE}, if an exception is thrown, then
653     * it will be caught and silently ignored.
654     *
655     * @return The number / count of {@code URL's} in this list that were modified.  Whenever a
656     * {@code URL} Named-Anchor is encountered, it will be removed from the {@code URL}, and a
657     * new {@code URL} without the fragment-part will be inserted to replace the old one.
658     * 
659     * <BR /><BR />The integer that is returned here is the number of times that a replacement
660     * was made to the input {@code Vector}-parameter {@code 'urls'}.
661     */
662    public static int shortenPoundREFs(Vector<URL> urls, boolean ifExceptionSetNull)
663    {
664        int pos             = 0;
665        int shortenCount    = 0;
666
667        for (int i = (urls.size() - 1); i >= 0; i--)
668        {
669            URL url = urls.elementAt(i);
670
671            try
672            {
673                if (url.getRef() != null)
674                {
675                    URL newURL = new URL(
676                        ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
677                            "://" +
678                        ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
679                        ((url.getFile()     != null) ? url.getFile()                    : "")
680                    );
681
682                    urls.setElementAt(newURL, i);
683                    shortenCount++;
684                }
685            }
686
687            catch (MalformedURLException e)
688                { if (ifExceptionSetNull) urls.setElementAt(null, i); }
689        }
690
691        return shortenCount;
692    }
693
694    /**
695     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
696     *
697     * <BR /><BR /><B CLASS=JDDescLabel>KE: Keep Exceptions</B>
698     *
699     * <BR />This method is identical to the previous method, defined above, except that it
700     * allows a programmer to keep / retain any {@code MalformedURLException's} that are thrown
701     * while re-building them.
702     *
703     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.
704     *
705     * @param ifExceptionSetNull If this is {@code TRUE} then if there is ever an exception building
706     * a new {@code URL} without a "Relative {@code URL '#'}" (Pound-Sign), then that position in
707     * the {@code Vector} will be replaced with 'null.'
708     *
709     * @return The number/count of {@code URL's} in this list that were modified.  If a {@code URL}
710     * was modified, it was because it had a partial-page reference in it.  If in the process of
711     * generating a new {@code URL} out of an old one, a {@code MalformedURLException} occurs, the
712     * exception will be placed in the {@code Ret2.b} position, which is a 
713     * {@code Vector<MalformedURLException>}.
714     *
715     * <BR /><BR /><B>SPECIFICALLY:</B>
716     *
717     * <BR /><BR /><UL CLASS=JDUL>
718     * 
719     * <LI> {@code Ret2.a = 'Integer'} number of {@code URL's} shortened for having a {@code '#'}
720     *      partial-reference.
721     *      </LI>
722     * 
723     * <LI> {@code Ret2.b = Vector<MalformedURLException>} where each element of this
724     *      {@code Vector} is null if there were no problems converting the {@code URL}, or the
725     *      exception reference if there were exceptions thrown.
726     *      </LI>
727     * 
728     * </UL>
729     */
730    public static Ret2<Integer, Vector<MalformedURLException>> shortenPoundREFs_KE
731        (Vector<URL> urls, boolean ifExceptionSetNull)
732    {
733        int                             pos             = 0;
734        int                             shortenCount    = 0;
735        Vector<MalformedURLException>   v               = new Vector<>();
736
737        for (int i=0; i < urls.size(); i++) v.setElementAt(null, i);
738
739        for (int i = (urls.size() - 1); i >= 0; i--)
740        {
741            URL url = urls.elementAt(i);
742 
743            try
744            {
745                if (url.getRef() != null)
746                {
747                    URL newURL = new URL(
748                        ((url.getProtocol() != null) ? url.getProtocol().toLowerCase()  : "") +
749                            "://" +
750                        ((url.getHost()     != null) ? url.getHost().toLowerCase()      : "") +
751                        ((url.getFile()     != null) ? url.getFile()                    : "")
752                    );
753
754                    urls.setElementAt(newURL, i);
755                    shortenCount++;
756                }
757            }
758
759            catch (MalformedURLException e)
760            {
761                if (ifExceptionSetNull) urls.setElementAt(null, i);
762                v.setElementAt(e, i);
763            }
764        }
765
766        return new Ret2<Integer, Vector<MalformedURLException>>(Integer.valueOf(shortenCount), v);
767    }
768
769    /**
770     * On the internet, a {@code URL} is part case-sensitive, and part case-insensitive.  The
771     * Domain-Name and Protocol ({@code http://}, and {@code 'some.company.com'}) portions of the
772     * {@code URL} <I>are Case-Insensitive - they may be in any combination of upper or lower
773     * case</I>.
774     *
775     * <BR /><BR />However, the directory, file-name, and (optional) Query-{@code String} portion
776     * of a {@code URL} are (often, but not always) Case-Sensitive.  The sensitivity to case in
777     * these three parts of a {@code URL} is dependent upon the individual Web-Server that is 
778     * providing the content for the {@code URL}.
779     *
780     * <BR /><BR />To summarize, DNS servers which monitor the Domain-Name part of a {@code URL}
781     * treat upper &amp; lower case English-Letters as the same.  Web-Server that utilize the File
782     * Directory part of a {@code URL} will sometimes care about case, and sometimes won't.  This
783     * behavior is dependent upon how the Web-Master has configured his system.
784     *
785     * @param url This may be any Internet-Domain {@code URL}
786     *
787     * @return A {@code String} version of this {@code URL}, but the domain and protocol portions
788     * of the {@code URL} will be a "consistent" lower case.  The case of the directory, file and
789     * (possibly, but not guaranteed to be present) {@code query-string} portion will not have
790     * their case modified either way.
791     *
792     * <BR /><BR /><B>NOTE:</B> This type of information is pretty important is you are attempting
793     * to scan for duplicate {@code URL's} or check their equality.
794     */
795    public static String urlToString(URL url)
796    {
797        return
798            ((url.getProtocol() != null)    ? url.getProtocol().toLowerCase()   : "") + "://" +
799            ((url.getHost()     != null)    ? url.getHost().toLowerCase()       : "") +
800            ((url.getPath()     != null)    ? url.getPath()                     : "") +
801            ((url.getQuery()    != null)    ? ('?' + url.getQuery())            : "") +
802            ((url.getRef()      != null)    ? ('#' + url.getRef())              : "");
803    }
804
805    /**
806     * As of today, the version of UNIX {@code curl} command does not seem to be downloading
807     * everything properly.  It downloaded an image {@code '.png'} file just fine, but seemed to
808     * have botched a zip-file.  This does what UNIX {@code 'curl'} command, <I>but does not
809     * actually invoke the UNIX operating system to do it.</I>  It just does this...
810     *
811     * @param url This may be any URL, but it is intended to be a downloadable file.  It will
812     * download {@code '.html'} files fine, but you may try images, data-files, zip-files,
813     * tar-archives, and movies.
814     *
815     * @param outFileName You must specify a file-name, and if this parameter is null, a
816     * {@code NullPointerException} will be thrown immediately.  If you would like your program
817     * to guess the filename - <I>based on the file named in the URL</I>, please use the method
818     * {@code URL.getFile()}, or something to that effect.
819     * 
820     * @param userAgent A User-Agent, as a {@code String}.  If this parameter is passed null,
821     * it will be silently ignored, and a User-Agent won't be used.
822     * 
823     * @throws IOException If there are I/O Errors when using the {@code HttpURLConnection}.
824     */
825    public static void CURL(URL url, String outFileName, String userAgent) throws IOException
826    {   
827        HttpURLConnection con = (HttpURLConnection) url.openConnection();
828
829        con.setRequestMethod("GET");
830
831        if (userAgent != null) con.setRequestProperty("User-Agent", userAgent);
832
833        InputStream         is      = con.getInputStream();
834        FileOutputStream    fos     = new FileOutputStream(outFileName);
835        byte[]              b       = new byte[5000];
836        int                 result  = 0;
837
838        while ((result = is.read(b)) != -1) fos.write(b, 0, result);
839 
840        fos.flush();    fos.close();    is.close();
841    }
842}