// URLs.java.html

package Torello.Java.Additional;

import java.util.*;
import java.net.*;
import java.util.regex.*;
import java.io.*;

import static Torello.Java.C.*;

import Torello.Java.StorageWriter;
import Torello.Java.StrReplace;

/**
 * A class that plays-with URL's, no more, no less.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=URLS>
 */
@Torello.JavaDoc.StaticFunctional
public class URLs
{
    // Private, empty constructor: prevents instantiation from outside this class.
    private URLs() { }

    /**
     * This is a Regular-Expression Pattern {@code (java.util.regex.Pattern)} - saved as a 
     * {@code String}.  It is subsequently compiled into {@link #P1}.
     *
     * <BR /><BR />The primary function is to match the leading portion of {@code String's} that
     * are intended to be HTTP-{@code URL's}.  The expression is anchored to the start of the
     * input ({@code '^'}) and consists of a single capturing group having three alternatives,
     * which are tried in this order:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>{@code http(s)://...<any-text>.../} - text that ends with a front-slash</LI>
     * <LI>{@code http(s)://...<any-text, not front-slash>...} - text that has no front-slash
     *     after the protocol</LI>
     * <LI>{@code http(s)://...<any-text>.../...<any-text, not front-slash>...} - text up to, and
     *     including, the first run of non-slash characters that follows a front-slash</LI>
     * </UL>
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Third Alternative:</B>
     * 
     * <BR />Because the third alternative ends with {@code [^\/]+}, the text captured by the group
     * extends past the domain and through the first non-empty path segment.  For the input
     * {@code http://domain.com/news/article.html}, the captured text is
     * {@code http://domain.com/news}.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Primarily used in:</B>
     * 
     * <BR /><UL CLASS=JDUL>
     * <LI>{@link #toProperURLV3(String)}</LI>
     * <LI>{@link #toProperURLV4(String)}</LI>
     * </UL>
     * 
     * @see #P1
     */
    protected static final String RE1 =
         "^(http[s]?:\\/\\/.*?\\/$|http[s]?:\\/\\/[^\\/]*$|http[s]?:\\/\\/.*?\\/[^\\/]+)";

    /**
     * The compiled form of the Regular-Expression {@code String} {@link #RE1}:
     * {@code P1 = Pattern.compile(RE1);}
     * 
     * <BR /><BR />It is compiled once, when this class is initialized, and is used by
     * {@link #toProperURLV3(String)} and {@link #toProperURLV4(String)}.
     * 
     * @see #RE1
     */
    protected static final Pattern P1 = Pattern.compile(RE1);

    /**
     * Java Help Message Explaining {@code class java.net.URL} - and the specific output of its
     * methods.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_HELP_MSG>
     *
     * <BR /><BR />A fixed list of sample {@code URL's} is parsed, and for each of them the values
     * returned by the {@code java.net.URL} getters (plus {@link #urlToString(URL)}) are printed.
     * All of the text - both the report and any error message - is written to {@code 'sw'}.
     *
     * @param sw An instance of class StorageWriter.  This parameter may be null, and if it is
     * a {@code StorageWriter} built with the no-argument constructor is used instead
     * (presumably one that sends its text to Standard-Output - confirm against
     * {@code StorageWriter}).
     */
    protected static final void javaURLHelpMessage(StorageWriter sw)
    {
        if (sw == null) sw = new StorageWriter();

        String[] urlStrArr = {
            "https://DALLASCITYHALL.com", "https://dallascityhall.com/",
            "https://dallascityhall.com/news",
            "https://dallascityhall.com/news/", "http://DALLASCITYHALL.com/news/ARTICLE-1.html",
            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue",
            "https://DallasCityHall.com/news/ARTICLE-1.html#subpart1",
            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue",
            "https://DallasCityHall.com/NEWS/article1.html?q=somevalue&q2=someOtherValue#LocalRef"
        };

        URL[] urlArr = new URL[urlStrArr.length];

        try
            { for (int i=0; i < urlStrArr.length; i++) urlArr[i] = new URL(urlStrArr[i]); }

        // The URL constructor only declares MalformedURLException
        catch (MalformedURLException e)
        {
            sw.println(
                "Broke a URL, and it generated an exception.\n" +
                "Sorry, fix the URL's in this method.\n" + 
                "Did you change them?"
            );

            e.printStackTrace();
            return;
        }

        // The report is sent to 'sw' (rather than directly to System.out), so that a writer
        // supplied by the caller actually receives the text.
        for (URL u : urlArr)
        {
            sw.println(
                "u.toString():\t\t"     + BCYAN + u.toString() + RESET + '\n' +
                "u.getProtocol():\t"    + u.getProtocol() + '\n' +
                "u.getHost():\t\t"      + u.getHost() + '\n' +
                "u.getPath():\t\t"      + u.getPath() + '\n' +
                "u.getFile():\t\t"      + u.getFile() + '\n' +
                "u.getQuery():\t\t"     + u.getQuery() + '\n' +
                "u.getRef():\t\t"       + u.getRef() + '\n' +
                "u.getAuthority():\t"   + u.getAuthority() + '\n' +
                "u.getUserInfo():\t"    + u.getUserInfo() + '\n' +
                "urlToString(u):\t\t"   + urlToString(u)
            );
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Helper function for making URL address readable by web-servers.
    //*********************************************************************************************
    // ********************************************************************************************


    /**
     * When scraping Spanish {@code URL's}, these characters can / should be escaped.  The list
     * holds the accented upper and lower case letters, and the inverted question-mark and
     * exclamation-point.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
     * 
     * <BR />This array shall be considered parallel to the <B><I>replacement</I></B>
     * {@code String[]}-Array {@link #VOWELS_URL}.  The character at index {@code 'i'} of this
     * array is escaped by the {@code String} at index {@code 'i'} of {@code VOWELS_URL}; the two
     * arrays must therefore always have the same length and ordering.
     * 
     * @see #toProperURLV1(String)
     * @see #VOWELS_URL
     */
    protected static final char[] VOWELS = {
        'Á', 'á', 'É', 'é', 'Í', 'í', 'Ó', 'ó', 'Ú', 'ú', 'Ü', 'ü',
        'Ñ', 'ñ', 'Ý', 'ý', '¿', '¡'
    };

    /**
     * When scraping Spanish {@code URL's}, these {@code String's} are the
     * <B>URL Escape Sequences</B> for the Spanish Vowel Characters listed in {@link #VOWELS}.
     * Each entry is the percent-encoded UTF-8 byte sequence of the corresponding character.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Parallel Array Note:</B>
     * 
     * <BR />This array shall be considered parallel to {@code char[]}-Array {@link #VOWELS}.
     * 
     * @see #toProperURLV1(String)
     * @see #VOWELS
     */
    protected static final String[] VOWELS_URL =
    {
        "%C3%81", "%C3%A1", "%C3%89", "%C3%A9", "%C3%8D", "%C3%AD", "%C3%93", "%C3%B3",
        "%C3%9A", "%C3%BA", "%C3%9C", "%C3%BC", "%C3%91", "%C3%B1", "%C3%9D", "%C3%BD",
        "%C2%BF", "%C2%A1"
    };

    /**
     * Substitutes the Spanish-Language characters that can make a web-query difficult with
     * their URL Escape Sequences.  The exact list of characters that are replaced is the content
     * of the array {@link #VOWELS}; the replacement for each character is the entry at the same
     * index of {@link #VOWELS_URL}.  The substitutions made include:
     *
     * <BR /><BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Spanish Language Character</TH><TH>URL Escape Sequence</TH></TR>
     * <TR><TD>{@code á}</TD><TD>{@code %C3%A1}</TD></TR>
     * <TR><TD>{@code É}</TD><TD>{@code %C3%89}</TD></TR>
     * <TR><TD>{@code é}</TD><TD>{@code %C3%A9}</TD></TR>
     * <TR><TD>{@code Í}</TD><TD>{@code %C3%8D}</TD></TR>
     * <TR><TD>{@code í}</TD><TD>{@code %C3%AD}</TD></TR>
     * <TR><TD>{@code Ó}</TD><TD>{@code %C3%93}</TD></TR>
     * <TR><TD>{@code ó}</TD><TD>{@code %C3%B3}</TD></TR>
     * <TR><TD>{@code Ú}</TD><TD>{@code %C3%9A}</TD></TR>
     * <TR><TD>{@code ú}</TD><TD>{@code %C3%BA}</TD></TR>
     * <TR><TD>{@code Ü}</TD><TD>{@code %C3%9C}</TD></TR>
     * <TR><TD>{@code ü}</TD><TD>{@code %C3%BC}</TD></TR>
     * <TR><TD>{@code Ñ}</TD><TD>{@code %C3%91}</TD></TR>
     * <TR><TD>{@code ñ}</TD><TD>{@code %C3%B1}</TD></TR>
     * <TR><TD>{@code Ý}</TD><TD>{@code %C3%9D}</TD></TR>
     * <TR><TD>{@code ý}</TD><TD>{@code %C3%BD}</TD></TR>
     * <TR><TD>{@code ¿}</TD><TD>{@code %C2%BF}</TD></TR>
     * <TR><TD>{@code ¡}</TD><TD>{@code %C2%A1}</TD></TR>
     * </TABLE>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Historical Note:</B>
     * 
     * <BR />This was the first {@code URL}-Escaping method written for the Java-HTML
     * {@code '.jar'}.
     *
     * @param url Any website {@code URL} query.
     *
     * @return The result of passing {@code 'url'}, {@link #VOWELS} and {@link #VOWELS_URL} to
     * {@code StrReplace.r} - the same {@code URL}, with the substitutions made.
     * 
     * @see #VOWELS
     * @see #VOWELS_URL
     * @see StrReplace#r(String, char[], String[])
     */
    public static String toProperURLV1(String url)
    {
        final String escaped = StrReplace.r(url, VOWELS, VOWELS_URL);

        return escaped;
    }

    /**
     * This list of java {@code char's} are characters that are better off escaped when passing
     * them through a {@code URL}.
     * 
     * <BR /><BR />The list contains the percent-sign, the space character, and a collection of
     * ASCII punctuation characters.  Note that the double-quote character {@code '"'} is
     * <B>not</B> a member of this array.
     * 
     * @see #toProperURLV2(String)
     */
    protected static final char[] URL_ESC_CHARS =
    {
        '%', ' ', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\',
        ']', '^', '{', '|', '}', '~', '\'', '+', ','
    };

    /**
     * Escapes every occurrence of a character listed in {@link #URL_ESC_CHARS}, replacing it
     * with a percent-sign followed by the character's (lower-case) hexadecimal code.
     * 
     * <BR /><BR />Since the colon, front-slash and period-adjacent punctuation of a leading
     * {@code http://domain.name.something/} are members of that list, this method will clobber
     * the Protocol and Domain-Name of a complete {@code URL}.  It is best used on
     * {@code String's} that are going to be inserted into a {@code URL} after the {@code '?'}
     * question-mark, inside the Query-String.
     * 
     * <BR /><BR />This can be very useful when sending JSON Arguments, for instance, inside a
     * {@code URL's} Query-String, instead of the GET / POST part of a request.
     * 
     * <BR /><BR />Note that this method should not be used to escape characters outside of the
     * range of Standard-ASCII.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>State of the Experiment:</B>
     * 
     * <BR />The characters that are escaped are exactly those in {@link #URL_ESC_CHARS}: the
     * space character, and
     * 
     * <BR /><B STYLE="color:red;">{@code # $ % & @ ` / : ; < = > ? [ \ ] ^ | ~ ' + ,}
     * <CODE> { } </CODE></B>
     * 
     * @param urlStuff Any information that is intended to be sent via an HTTP-{@code URL}, and
     * needs to be escaped.
     *
     * @return An escaped version of this {@code URL-String}
     * 
     * @see #URL_ESC_CHARS
     * @see StrReplace#r(String, char[], IntCharFunction)
     */
    public static String toProperURLV2(String urlStuff)
    {
        // Each matched character becomes "%" + its hexadecimal character-code
        final String escaped = StrReplace.r(
            urlStuff,
            URL_ESC_CHARS,
            (int index, char ch) -> "%" + Integer.toHexString(ch)
        );

        return escaped;
    }

    /**
     * Applies {@link #toProperURLV2(String)} to a {@code URL}, but leaves its leading portion
     * un-escaped - <I>if such a leading portion is present</I>.
     * 
     * <BR /><BR />The leading portion is whatever text the Regular-Expression {@link #P1}
     * captures at the start of the input: the {@code http(s)://domain.something} prefix.  See
     * {@link #RE1} for precisely what that expression consumes.  Its colons, front-slashes and
     * other punctuation are copied to the output unchanged.
     *
     * <BR /><BR />The remainder of the {@code String} (or the entire {@code String}, when
     * {@code P1} finds no match) is escaped exactly as {@code toProperURLV2(String)} would.
     *
     * @param url This may be any internet {@code URL}, represented as a {@code String}.  It will
     * be escaped with the {@code %INT} format.
     *
     * @return An escaped {@code URL String}
     *
     * @see #toProperURLV2(String)
     * @see #P1
     */
    public static String toProperURLV3(String url)
    {
        final Matcher matcher = P1.matcher(url);

        // No "http(s)://..." prefix was recognized: the whole String is escaped.
        if (! matcher.find()) return toProperURLV2(url);

        final String prefix     = matcher.group(1);
        final String remainder  = url.substring(prefix.length());

        return prefix + toProperURLV2(remainder);
    }

    /**
     * This is a (shortened) list of characters that <I>should</I> be escaped before being used
     * within a {@code URL}.
     * 
     * <BR /><BR />This version differs from {@link #URL_ESC_CHARS} in that it does not include the
     * {@code '&'} (ampersand), the {@code '?'} (question-mark) or the {@code '/'} (forward-slash).
     * Every other member of {@code URL_ESC_CHARS} is present in this array, too.
     * 
     * @see #URL_ESC_CHARS
     * @see #toProperURLV4(String)
     */
    protected static final char[] URL_ESC_CHARS_ABBREV =
    {
        '%', ' ', '#', '$', '@', '`', ':', ';', '<', '=', '>', '[', '\\', ']',
        '^', '{', '|', '}', '~', '\'', '+', ','
    };

    /**
     * Escapes a {@code URL} in the manner of {@link #toProperURLV3(String)}, but uses the
     * shortened character list {@link #URL_ESC_CHARS_ABBREV}.  As a result the {@code '?'}
     * (question-mark), {@code '&'} (ampersand) and {@code '/'} (forward-slash) symbols are never
     * escaped, no matter where in the {@code String} they are found.
     * 
     * <BR /><BR />Just like {@code toProperURLV3}, the leading text captured by {@link #P1} (the
     * {@code HTTP(s)://domain.net.something} prefix) is copied to the output without being
     * escaped at all.
     *
     * @param url This may be any internet {@code URL}, represented as a {@code String}.
     *
     * @return This does the same thing as {@code toProperURLV3(String)}, but leaves out 100%
     * of the instances of Ampersand, Question-Mark, and Forward-Slash symbols. 
     *
     * @see #toProperURLV3(String)
     * @see #P1
     * @see #URL_ESC_CHARS_ABBREV
     * @see StrReplace#r(String, char[], IntCharFunction)
     */
    public static String toProperURLV4(String url)
    {
        final Matcher matcher = P1.matcher(url);

        String prefix       = "";
        String remainder    = url;

        // Peel off the un-escaped prefix, when the Regular-Expression recognizes one
        if (matcher.find())
        {
            prefix      = matcher.group(1);
            remainder   = url.substring(prefix.length());
        }

        final String escaped = StrReplace.r(
            remainder,
            URL_ESC_CHARS_ABBREV,
            (int index, char ch) -> "%" + Integer.toHexString(ch)
        );

        return prefix + escaped;
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_PRP_URL_V5>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Implementation Summary:</B>
     * 
     * <BR />The input is trimmed.  If it begins with {@code 'http://'} or {@code 'https://'}
     * (in any case), it is parsed with the {@code java.net.URL} constructor, and only the path
     * is encoded; otherwise the whole {@code String} is treated as a path.  The path is split on
     * {@code '/'}, each segment is encoded with {@code java.net.URLEncoder} (UTF-8), and the
     * segments are joined again with {@code '/'}.
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>Empty segments are retained, so leading, trailing and doubled front-slashes are
     *     all preserved.</LI>
     * <LI>A space is written as {@code %20}, not as the {@code '+'} which {@code URLEncoder}
     *     produces for HTML-Forms.  A literal plus-sign is written as {@code %2B}.</LI>
     * <LI>The authority (User-Info, Host and Port) of a parsed {@code URL} is copied as-is, and
     *     so are the Query-String and the Reference / Fragment.</LI>
     * </UL>
     *
     * @param url This is the URL to be encoded, properly
     *
     * @return A properly encoded URL String.  Important, if calling the {@code java.net.URL}
     * constructor generates a {@code MalformedURLException}, then this method shall return null.
     * The {@code java.net.URL} constructor will be called if the {@code String} passed begins with
     * the characters {@code 'http://'} or {@code 'https://'}.
     */
    public static String toProperURLV5(String url)
    {
        url = url.trim();

        URL u = null;

        // Case-insensitive (and Locale-independent) check for the two HTTP protocols
        if (    url.regionMatches(true, 0, "http://", 0, 7)
            ||  url.regionMatches(true, 0, "https://", 0, 8)
        )
        {
            try
                { u = new URL(url); }

            catch (MalformedURLException e)
                { return null; }
        }

        // The limit '-1' keeps trailing empty Strings, which is what preserves a trailing '/'
        String[]        sArr    = ((u == null) ? url : u.getPath()).split("/", -1);
        StringBuilder   sb      = new StringBuilder();

        for (int i=0; i < sArr.length; i++)
        {
            if (i > 0) sb.append('/');

            try
            {
                // URLEncoder emits '+' for a space (and "%2B" for a real plus-sign), so any '+'
                // in its output is a space, and may safely be rewritten as "%20"
                sb.append(URLEncoder.encode(sArr[i], "UTF-8").replace("+", "%20"));
            }

            // Every JVM is required to support UTF-8, so this is not expected to occur
            catch (UnsupportedEncodingException e)
                { throw new IllegalStateException("UTF-8 encoding is not available", e); }
        }

        if (u == null) return sb.toString();

        // The authority includes the (optional) User-Info and Port, not just the Host
        String authority = (u.getAuthority() != null) ? u.getAuthority() : u.getHost();

        return
            u.getProtocol() + "://" + authority + sb.toString() +
            ((u.getQuery() != null) ? ("?" + u.getQuery())  : "") +
            ((u.getRef() != null)   ? ("#" + u.getRef())    : "");
    }

    /**
     * Separates a {@code URL} into its distinct parts, and escapes each part individually.
     * 
     * <BR /><BR />The JDK's {@code java.net.URLEncoder} expects to be handed text that has
     * already been separated out of its {@code URL}.  This method does that separating first:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>The Protocol and the Authority (User-Info, Host and Port) are copied as-is.</LI>
     * <LI>The Path is escaped by {@link #toProperURLV5(String)}.</LI>
     * <LI>The Query-String is split on {@code '&'}, and each piece is split on {@code '='}.
     *     The resulting names and values are encoded with {@code URLEncoder} (UTF-8), and are
     *     then re-joined using the same {@code '&'} and {@code '='} separators.  Empty names and
     *     values are retained, so {@code "q="} remains {@code "q="}.</LI>
     * <LI>The Reference / Fragment (the text after {@code '#'}) is encoded with
     *     {@code URLEncoder} as a single unit.</LI>
     * </UL>
     *
     * @param url This is any java {@code URL}.
     *
     * @return a new {@code String} version of the input parameter {@code 'url'}, or null if the
     * {@code java.net.URL} constructor throws a {@code MalformedURLException} for {@code 'url'}.
     * 
     * @see #toProperURLV5(String)
     */
    public static String toProperURLV6(String url)
    {
        URL u = null;

        try
            { u = new URL(url); }

        catch (MalformedURLException e)
            { return null; }

        // The authority includes the (optional) User-Info and Port, not just the Host
        String          authority   = (u.getAuthority() != null) ? u.getAuthority() : u.getHost();
        StringBuilder   sb          = new StringBuilder();

        sb.append(u.getProtocol());
        sb.append("://");
        sb.append(authority);
        sb.append(toProperURLV5(u.getPath()));

        try
        {
            if (u.getQuery() != null)
            {
                sb.append('?');

                // The limit '-1' keeps trailing empty Strings, so that input such as "a=&b="
                // does not lose its separators
                String[] pairs = u.getQuery().split("&", -1);

                for (int i=0; i < pairs.length; i++)
                {
                    if (i > 0) sb.append('&');

                    String[] parts = pairs[i].split("=", -1);

                    for (int j=0; j < parts.length; j++)
                    {
                        if (j > 0) sb.append('=');
                        sb.append(URLEncoder.encode(parts[j], "UTF-8"));
                    }
                }
            }

            // java.net.URL will parse a URL that has both a '?' and a '#'; when a Reference is
            // present it is appended last.
            if (u.getRef() != null)
                sb.append('#').append(URLEncoder.encode(u.getRef(), "UTF-8"));
        }

        // Every JVM is required to support UTF-8, so this is not expected to occur
        catch (UnsupportedEncodingException e)
            { throw new IllegalStateException("UTF-8 encoding is not available", e); }

        return sb.toString();
    }

    /**
     * Parses a {@code String} into a {@code java.net.URL}, and hands the result to
     * {@link #toProperURLV8(URL)}, which relies strictly on Java's URI Encoding Mechanism.
     * These seem to work the same as "V6".  Internally, these are now used.  This as of
     * November, 2019.
     *
     * @param url A Complete Java {@code URL}, as a {@code String}.  Any specialized
     * Escape-Characters that need to be escaped, will be.
     * 
     * @return The {@code String} returned by {@code toProperURLV8(URL)} for the parsed
     * {@code URL}.
     *
     * @throws URISyntaxException This will throw if building the {@code URI} generates an
     * exception.  Internally, all this method does is build a {@code URI}, and then call the Java
     * Method {@code 'toASCIIString()'}
     * 
     * @throws MalformedURLException If the {@code java.net.URL} constructor rejects the input
     * {@code String}.
     * 
     * @see #toProperURLV8(URL)
     */
    public static String toProperURLV7(String url) throws URISyntaxException, MalformedURLException
    {
        final URL parsed = new URL(url);

        return toProperURLV8(parsed);
    }

    /**
     * Escapes a {@code URL} using strictly Java's URI Encoding Mechanism.  This seems to work
     * the same as "V6".  Internally, these are now used.  This as of November, 2019.
     * 
     * <BR /><BR />The Protocol, User-Info, Host, Port, Path, Query and Reference of the input
     * are passed - in that order - to the seven-argument {@code java.net.URI} constructor, and
     * the value of {@code URI.toASCIIString()} is returned.
     * 
     * <BR /><BR />NOTE(review): How text that is <I>already</I> percent-escaped is treated by
     * that constructor has not been verified here - confirm against the {@code java.net.URI}
     * documentation before passing pre-escaped {@code URL's}.
     *
     * @param url A Complete Java {@code URL}.  Any specialized Escape-Characters that need to be
     * escaped, will be.
     * 
     * @return The ASCII-{@code String} form of the {@code URI} that was built from the parts of
     * {@code 'url'}.
     *
     * @throws URISyntaxException This will throw if building the URI generates an exception.
     * Internally, all this method does is build a URI, and then call the Java Method
     * {@code 'toASCIIString()'}
     * 
     * @throws MalformedURLException This exception is part of the method's declared signature;
     * no statement in the body of this method throws it.
     */
    public static String toProperURLV8(URL url) throws URISyntaxException, MalformedURLException
    {
        final URI uri = new URI(
            url.getProtocol(), url.getUserInfo(), url.getHost(), url.getPort(),
            url.getPath(), url.getQuery(), url.getRef()
        );

        return uri.toASCIIString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The original "URLs" class
    //*********************************************************************************************
    // ********************************************************************************************


    /**
     * If you have a list of {@code URL's}, and want to quickly remove any
     * duplicate-{@code URL's} found in the list - this will remove them.  The first occurrence
     * of a {@code URL} is kept, and every later occurrence is removed.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Case Sensitivity:</B>
     * 
     * <BR />Two {@code URL's} are duplicates when {@link #urlToString(URL)} produces the same
     * {@code String} for both.  That method performs "to-lower-case" operations on the protocol
     * and Web-Domain parts, but not on the file, directory, or Query-String portion of the
     * {@code URL}.
     *
     * <BR /><BR />This should highlight what is Case-Sensitive, and what is not:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> These are considered duplicate URL's:
     *      <BR />
     *      <BR /><CODE>http://some.company.com/index.html</CODE>
     *      <BR /><CODE>HTTP://SOME.COMPANY.COM/index.html</CODE>
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> These are <I>not</I> considered duplicate URL's:
     *      <BR />
     *      <BR /><CODE>http://other.company.com/Directory/Ben-Bitdiddle.html</CODE>
     *      <BR /><CODE>http://other.company.com/DIRECTORY/BE.html</CODE>
     *      </LI>
     * </UL>
     *
     * @param urls Any list of {@code URL's}, some of which might have been duplicated.  The
     * difference between this {@code 'removeDuplicates'} and the other {@code 'removeDuplicates'}
     * available in this class is that this one only removes multiple instances of the same 
     * {@code URL} in this {@code Vector}, while the other one iterates through a list of 
     * {@code URL's} already visited in a previous-session.
     * 
     * <BR /><BR /><B>NOTE:</B> <I>Null {@code Vector}-values are skipped outright, they are
     * neither removed nor changed.</I>
     *
     * @return The number of {@code Vector} elements that were removed.  (i.e. <I>The size by which
     * the {@code Vector} was shrunk.</I>)
     * 
     * @see #urlToString(URL)
     */
    public static int removeDuplicates(Vector<URL> urls)
    {
        // TreeSet.add returns FALSE when the String is already present in the set
        TreeSet<String> dups    = new TreeSet<>();
        int             count   = 0;

        // Removing through the Iterator guarantees that it is the duplicate itself (the element
        // which was just inspected) that is removed, and that no element is skipped afterwards.
        for (Iterator<URL> iter = urls.iterator(); iter.hasNext(); )
        {
            URL url = iter.next();

            if ((url != null) && (! dups.add(urlToString(url))))
            {
                iter.remove();
                count++;
            }
        }

        return count;
    }

    /**
     * This simple method will remove any {@code URL's} from the input {@code Vector} parameter
     * {@code 'potentiallyNewURLs'} which are also present-members of the input {@code Vector} 
     * parameter {@code 'visitedURLs'}.  A {@code URL} that occurs more than once within
     * {@code 'potentiallyNewURLs'} is also reduced to its first occurrence.
     * 
     * <BR /><BR />This may seem trivial, and it is, but it worries about things like the
     * {@code String's} Case for you.  {@code URL's} are compared using the {@code String}
     * produced by {@link #urlToString(URL)}.
     *
     * @param visitedURLs This parameter is a list of {@code URL's} that have already
     * "been visited."  This {@code Vector} is not modified.  Null elements are ignored.
     *
     * @param potentiallyNewURLs This parameter is a list of {@code URL's} that are possibly
     * "un-visited" - meaning whatever scrape, crawl or search being performed needs to know which
     * {@code URL's} are listed in the previous parameter's contents.  This may seem trivial, just
     * use the java {@code url1.equals(url2)} command, but, alas, java doesn't exactly take into
     * account upper-case and lower-case domain-names.  This worries about case.  Null elements
     * are skipped, they are neither removed nor changed.
     *
     * @return The number of {@code URL's} that were removed from the input {@code Vector}
     * parameter {@code 'potentiallyNewURLs'}.
     * 
     * @see #urlToString(URL)
     */
    public static int removeDuplicates(Vector<URL> visitedURLs, Vector<URL> potentiallyNewURLs)
    {
        // The easiest way to check for duplicates is to build a tree-set of all the URL's as a
        // String.  TreeSet.add returns FALSE when the String is already present in the set.

        TreeSet<String> dups = new TreeSet<>();

        // Build a TreeSet of the url's from the "Visited URLs" parameter
        for (URL visited : visitedURLs)
            if (visited != null)
                dups.add(urlToString(visited));

        // Add the "Possibly New URLs", one-by-one, and remove them if they are already in the
        // set.  Removing through the Iterator guarantees that it is the duplicate itself (the
        // element which was just inspected) that is removed.

        int count = 0;

        for (Iterator<URL> iter = potentiallyNewURLs.iterator(); iter.hasNext(); )
        {
            URL url = iter.next();

            if ((url != null) && (! dups.add(urlToString(url))))
            {
                iter.remove();
                count++;
            }
        }

        return count;
    }

    /**
     * Removes any Fragment-{@code URL} {@code '#'} symbols from a {@code URL}.
     * 
     * <BR /><BR />If this {@code URL} contains a pound-sign Anchor-Name according to the Standard
     * JDK's {@code URL.getRef()} method - specifically, if {@code URL.getRef()} returns a non-null
     * value - this method rebuilds the URL, without any Anchor-Name / Fragment information.
     * 
     * <BR /><BR />The {@code URL} is rebuilt out of its Protocol and Host (both converted to
     * lower-case), its User-Info and Port (when they are present), and the value returned by
     * {@code URL.getFile()}.  All {@code String}-data that occurs after the {@code '#'}
     * Hash-Tag / Pound-Sign is left out.
     * 
     * @param url Any standard HTTP {@code URL}.  If this {@code 'url'} contains a {@code '#'}
     * (Pound Sign, Partial Reference) - according to the standard JDK {@code URL.getRef()} method,
     * then it shall be removed.
     * 
     * @return The {@code URL} without the partial-reference, or the original {@code URL}
     * reference if there was no partial reference.  Null is returned if there is an error
     * instantiating the new {@code URL} without the partial-reference.
     */
    public static URL shortenPoundREF(URL url)
    {
        if (url.getRef() == null) return url;

        // Locale.ROOT keeps the lower-casing independent of the default Locale of the JVM
        String protocol = (url.getProtocol() != null)
            ? url.getProtocol().toLowerCase(Locale.ROOT)
            : "";

        String host = (url.getHost() != null)
            ? url.getHost().toLowerCase(Locale.ROOT)
            : "";

        // User-Info and Port are part of the address, and must survive the rebuild.
        // URL.getPort() returns -1 when the URL does not specify a port.
        String userInfo = (url.getUserInfo() != null)   ? (url.getUserInfo() + '@') : "";
        String port     = (url.getPort() != -1)         ? (":" + url.getPort())     : "";
        String file     = (url.getFile() != null)       ? url.getFile()             : "";

        try
            { return new URL(protocol + "://" + userInfo + host + port + file); }

        catch (MalformedURLException e)
            { return null; }
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
     * 
     * <BR /><BR />Every {@code URL} in the list for which {@code URL.getRef()} returns a non-null
     * value is replaced, in place, by a {@code URL} that is rebuilt out of its Protocol and Host
     * (both converted to lower-case), its User-Info and Port (when they are present), and the
     * value returned by {@code URL.getFile()}.
     *
     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.  Null
     * elements of this {@code Vector} are skipped.
     *
     * @param ifExceptionSetNull If this parameter is passed {@code TRUE}, if there is ever an
     * exception-throw while building the new {@code URL's} (without the fragment / pound-sign),
     * then that position in the {@code Vector} will be replaced with a null.
     * 
     * <BR /><BR />When this parameter is passed {@code FALSE}, if an exception is thrown, then
     * it will be caught and silently ignored.
     *
     * @return The number / count of {@code URL's} in this list that were modified.  Whenever a
     * {@code URL} Named-Anchor is encountered, it will be removed from the {@code URL}, and a
     * new {@code URL} without the fragment-part will be inserted to replace the old one.
     * 
     * <BR /><BR />The integer that is returned here is the number of times that a replacement
     * was made to the input {@code Vector}-parameter {@code 'urls'}.
     */
    public static int shortenPoundREFs(Vector<URL> urls, boolean ifExceptionSetNull)
    {
        int shortenCount = 0;

        for (int i = (urls.size() - 1); i >= 0; i--)
        {
            URL url = urls.elementAt(i);

            // Null elements, and URL's that have no '#' reference, are left untouched
            if ((url == null) || (url.getRef() == null)) continue;

            // Locale.ROOT keeps the lower-casing independent of the default Locale of the JVM
            String protocol = (url.getProtocol() != null)
                ? url.getProtocol().toLowerCase(Locale.ROOT)
                : "";

            String host = (url.getHost() != null)
                ? url.getHost().toLowerCase(Locale.ROOT)
                : "";

            // User-Info and Port are part of the address, and must survive the rebuild.
            // URL.getPort() returns -1 when the URL does not specify a port.
            String userInfo = (url.getUserInfo() != null)   ? (url.getUserInfo() + '@') : "";
            String port     = (url.getPort() != -1)         ? (":" + url.getPort())     : "";
            String file     = (url.getFile() != null)       ? url.getFile()             : "";

            try
            {
                urls.setElementAt(new URL(protocol + "://" + userInfo + host + port + file), i);
                shortenCount++;
            }

            catch (MalformedURLException e)
                { if (ifExceptionSetNull) urls.setElementAt(null, i); }
        }

        return shortenCount;
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=URLS_NAMED_ANCHORS>
     *
     * <BR /><BR /><B CLASS=JDDescLabel>KE: Keep Exceptions</B>
     *
     * <BR />This method performs the same replacement as
     * {@link #shortenPoundREFs(Vector, boolean)}, except that it allows a programmer to keep /
     * retain any {@code MalformedURLException's} that are thrown while re-building the
     * {@code URL's}.
     *
     * @param urls Any list of completed (read: <I>fully-resolved</I>) {@code URL's}.  Null
     * elements of this {@code Vector} are skipped.
     *
     * @param ifExceptionSetNull If this is {@code TRUE} then if there is ever an exception building
     * a new {@code URL} without a "Relative {@code URL '#'}" (Pound-Sign), then that position in
     * the {@code Vector} will be replaced with 'null.'
     *
     * @return The number/count of {@code URL's} in this list that were modified.  If a {@code URL}
     * was modified, it was because it had a partial-page reference in it.  If in the process of
     * generating a new {@code URL} out of an old one, a {@code MalformedURLException} occurs, the
     * exception will be placed in the {@code Ret2.b} position, which is a 
     * {@code Vector<MalformedURLException>}.
     *
     * <BR /><BR /><B>SPECIFICALLY:</B>
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * 
     * <LI> {@code Ret2.a = 'Integer'} number of {@code URL's} shortened for having a {@code '#'}
     *      partial-reference.
     *      </LI>
     * 
     * <LI> {@code Ret2.b = Vector<MalformedURLException>}.  This {@code Vector} has the same
     *      size as {@code 'urls'}, and each of its elements is null if there were no problems
     *      converting the {@code URL} at the same index, or the exception reference if there
     *      was an exception thrown.
     *      </LI>
     * 
     * </UL>
     */
    public static Ret2<Integer, Vector<MalformedURLException>> shortenPoundREFs_KE
        (Vector<URL> urls, boolean ifExceptionSetNull)
    {
        int                             shortenCount    = 0;
        Vector<MalformedURLException>   v               = new Vector<>();

        // Vector.setSize pads the (empty) Vector with nulls.  Calling 'setElementAt' on an empty
        // Vector would throw ArrayIndexOutOfBoundsException, since no index exists yet.
        v.setSize(urls.size());

        for (int i = (urls.size() - 1); i >= 0; i--)
        {
            URL url = urls.elementAt(i);

            // Null elements, and URL's that have no '#' reference, are left untouched
            if ((url == null) || (url.getRef() == null)) continue;

            // Locale.ROOT keeps the lower-casing independent of the default Locale of the JVM
            String protocol = (url.getProtocol() != null)
                ? url.getProtocol().toLowerCase(Locale.ROOT)
                : "";

            String host = (url.getHost() != null)
                ? url.getHost().toLowerCase(Locale.ROOT)
                : "";

            // User-Info and Port are part of the address, and must survive the rebuild.
            // URL.getPort() returns -1 when the URL does not specify a port.
            String userInfo = (url.getUserInfo() != null)   ? (url.getUserInfo() + '@') : "";
            String port     = (url.getPort() != -1)         ? (":" + url.getPort())     : "";
            String file     = (url.getFile() != null)       ? url.getFile()             : "";

            try
            {
                urls.setElementAt(new URL(protocol + "://" + userInfo + host + port + file), i);
                shortenCount++;
            }

            catch (MalformedURLException e)
            {
                if (ifExceptionSetNull) urls.setElementAt(null, i);
                v.setElementAt(e, i);
            }
        }

        return new Ret2<Integer, Vector<MalformedURLException>>(Integer.valueOf(shortenCount), v);
    }

    /**
     * On the internet, a {@code URL} is part case-sensitive, and part case-insensitive.  The
     * Domain-Name and Protocol ({@code http://}, and {@code 'some.company.com'}) portions of the
     * {@code URL} <I>are Case-Insensitive - they may be in any combination of upper or lower
     * case</I>.
     *
     * <BR /><BR />However, the directory, file-name, and (optional) Query-{@code String} portion
     * of a {@code URL} are (often, but not always) Case-Sensitive.  The sensitivity to case in
     * these three parts of a {@code URL} is dependent upon the individual Web-Server that is 
     * providing the content for the {@code URL}.
     *
     * <BR /><BR />To summarize, DNS servers which monitor the Domain-Name part of a {@code URL}
     * treat upper &amp; lower case English-Letters as the same.  Web-Servers that utilize the File
     * Directory part of a {@code URL} will sometimes care about case, and sometimes won't.  This
     * behavior is dependent upon how the Web-Master has configured his system.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>User-Info and Port:</B>
     * 
     * <BR />When the {@code URL} has a User-Info part, it is included (followed by {@code '@'})
     * with its case unchanged.  A port-number is included only when the {@code URL} specifies
     * one, and it differs from the value returned by {@code URL.getDefaultPort()}.  This means
     * {@code http://a.com:80/x} and {@code http://a.com/x} produce the same {@code String},
     * while {@code http://a.com:8080/x} produces a different one.
     *
     * @param url This may be any Internet-Domain {@code URL}
     *
     * @return A {@code String} version of this {@code URL}, but the domain and protocol portions
     * of the {@code URL} will be a "consistent" lower case.  The case of the directory, file and
     * (possibly, but not guaranteed to be present) {@code query-string} portion will not have
     * their case modified either way.
     *
     * <BR /><BR /><B>NOTE:</B> This type of information is pretty important if you are attempting
     * to scan for duplicate {@code URL's} or check their equality.
     */
    public static String urlToString(URL url)
    {
        StringBuilder sb = new StringBuilder();

        // Locale.ROOT keeps the lower-casing independent of the default Locale of the JVM
        if (url.getProtocol() != null) sb.append(url.getProtocol().toLowerCase(Locale.ROOT));

        sb.append("://");

        if (url.getUserInfo() != null) sb.append(url.getUserInfo()).append('@');

        if (url.getHost() != null) sb.append(url.getHost().toLowerCase(Locale.ROOT));

        // URL.getPort() returns -1 when no port is specified.  An explicitly specified default
        // port is left out too, so that it compares equal to the same URL without any port.
        int port = url.getPort();

        if ((port != -1) && (port != url.getDefaultPort())) sb.append(':').append(port);

        if (url.getPath()   != null) sb.append(url.getPath());
        if (url.getQuery()  != null) sb.append('?').append(url.getQuery());
        if (url.getRef()    != null) sb.append('#').append(url.getRef());

        return sb.toString();
    }

    /**
     * As of today, the version of UNIX {@code curl} command does not seem to be downloading
     * everything properly.  It downloaded an image {@code '.png'} file just fine, but seemed to
     * have botched a zip-file.  This does what UNIX {@code 'curl'} command does, <I>but does not
     * actually invoke the UNIX operating system to do it.</I>
     * 
     * <BR /><BR />An HTTP {@code GET} request is sent to {@code 'url'}, and the bytes of the
     * response are copied into the file named by {@code 'outFileName'}.  Both the response
     * stream and the file are closed before this method returns - whether it returns normally,
     * or because of an exception.
     *
     * @param url This may be any URL, but it is intended to be a downloadable file.  It will
     * download {@code '.html'} files fine, but you may try images, data-files, zip-files,
     * tar-archives, and movies.  The connection that is opened for this {@code URL} is cast to
     * {@code HttpURLConnection}.
     *
     * @param outFileName You must specify a file-name, and if this parameter is null, a
     * {@code NullPointerException} will be thrown immediately - before any connection is
     * opened.  If you would like your program to guess the filename - <I>based on the file named
     * in the URL</I>, please use the method {@code URL.getFile()}, or something to that effect.
     * 
     * @param userAgent A User-Agent, as a {@code String}.  If this parameter is passed null,
     * it will be silently ignored, and a User-Agent won't be used.
     * 
     * @throws IOException If there are I/O Errors when using the {@code HttpURLConnection}, or
     * when writing the output file.
     * 
     * @throws NullPointerException If {@code 'outFileName'} is null.
     */
    public static void CURL(URL url, String outFileName, String userAgent) throws IOException
    {
        // Fail before any network activity takes place, as the documentation promises
        Objects.requireNonNull(outFileName, "outFileName");

        HttpURLConnection con = (HttpURLConnection) url.openConnection();

        con.setRequestMethod("GET");

        if (userAgent != null) con.setRequestProperty("User-Agent", userAgent);

        // try-with-resources closes both streams, even when the copy-loop throws.  The
        // InputStream is opened first, so that no file is created if the request fails.
        try (
            InputStream         is  = con.getInputStream();
            FileOutputStream    fos = new FileOutputStream(outFileName)
        )
        {
            byte[]  b       = new byte[5000];
            int     result  = 0;

            while ((result = is.read(b)) != -1) fos.write(b, 0, result);

            fos.flush();
        }
    }
}