001package Torello.HTML;
002
003import java.net.*;
004import java.util.*;
005import java.util.stream.IntStream;
006
007import Torello.Java.*;
008
009import Torello.HTML.NodeSearch.InnerTagFind; // Used for an @see reference
010import Torello.HTML.NodeSearch.TagNodeFind;  // Used in getBaseURL
011import Torello.Java.Additional.Ret2;
012import Torello.Java.Additional.Ret3;
013
014/**
 * Utilities for de-referencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}.
016 * 
017 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID=LINKS>
018 * @see ReplaceNodes
019 * @see ReplaceFunction
020 * @see HTMLPage
021 * @see InnerTagFind
022 * @see Ret2
023 */
024@Torello.HTML.Tools.JavaDoc.StaticFunctional
025public class Links
026{
    // Private constructor: no code outside this class can create an instance of it.
    private Links() { }
028
029    /**
030     * List of documented "starter-strings" that are sometimes used in Anchor URL
031     * {@code 'HREF=...'} attributes.
032     * 
033     * @see #NON_URL_HREFS
034     */
035    protected static final String[] _NON_URL_HREFS =
036        { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" };
037
038    /**
039     * This small method just returns the complete list of commonly found Anchor
040     * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.}  This method
041     * actually returns a "clone" of an internally stored {@code String[]} Array.  This is to
042     * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes
     * is not changed, doctored or modified.
044     * 
045     * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'}
046     * 
047     * @see #_NON_URL_HREFS
048     */
049    public static String[] NON_URL_HREFS()
050    { return _NON_URL_HREFS.clone(); }
051
052    /**
     * The methods in this class <B><I>will not automatically extract</I></B> any HTML
054     * {@code <BASE HREF=URL>} definitions that are found on this page.  If the user wishes to
055     * dereference partial / relative {@code URL} definitions that exist on the input page, all the
056     * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this
057     * method should be utilized.
058     *
059     * @param page This may be any HTML page or partial page.  If this page has a valid HTML
060     * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of
061     * {@code class URL}.
062     *
063     * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available
064     * within the input-page parameter {@code 'page'}.  If the page provided does not contain a
065     * {@code BASE URL} definition, then null shall be returned.
066     *
067     * <BR /><BR /><B>NOTE:</B> The HTML Specification clearly states that only one {@code URL}
068     * may be defined using the HTML Element {@code <BASE>}.  Clearly, due to the browser wars,
069     * unspecified / non-deterministic behavior is possible if multiple definitions are provided.
070     * For the purposes of this class, if such a situation arises, an exception is thrown.
071     *
072     * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of
073     * the element {@code <BASE HREF=URL>}, then this exception will throw.
074     *
     * @throws MalformedURLException If a {@code <BASE HREF=URL>} is found / identified within the
     * input page, but that {@code URL} is invalid, then this exception shall throw.
077     * 
078     * @see TagNodeFind
079     * @see Attributes#retrieve(Vector, int[], String)
080     */
081    public static URL getBaseURL(Vector<? extends HTMLNode> page)
082        throws MalformedHTMLException, MalformedURLException
083    {
084        int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base");
085
086        if (posArr.length == 0) return null;
087
088        // NOTE: The cast is all right because 'posArr' only points to TagNode's
089        // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode>
090        // Above, there will be nothing in the 'posArr' if either of those was passed.
091        @SuppressWarnings("unchecked")
092        String[]    urls    = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href");
093
094        boolean     found   = false;
095        String      ret     = null;
096
097        for (String url : urls)
098            if ((url != null) && (url.length() > 0))
099                if (found)
100                    throw new MalformedHTMLException(
101                        "The page you have provided has multiple <BASE HREF=URL> definitions.  " +
102                        "However, the HTML Specifications state that pages may provide just one " +
103                        "definition.  If you wish to proceed, retrieve the definitions manually " +
104                        "using class TagNodeFind.all and Attributes.retrieve, as explained in " +
105                        "the JavaDoc pages for this class."
106                    );
107                else 
108                {
109                    found = true;
110                    ret = url;
111                }
112
113        return new URL(ret);                    
114    }
115
116    // ********************************************************************************************
117    // Complete Vector-Resolve Methods - SRC-ATTRIBUTE
118    // ********************************************************************************************
119
120    /**
121     * Convenience Method.
122     * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)}
123     */
124    public static Ret3<int[], int[], int[]> resolveAllSRC(
125            Vector<? super TagNode> html, URL sourcePage, SD quote,
126            boolean askForReturnArraysOrReturnNull
127        )
128    { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); }
129
130    /**
131     * Convenience Method.
132     * <BR />Accepts: {@code DotPair}.
133     * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)}
134     */
135    public static Ret3<int[], int[], int[]> resolveAllSRC(
136            Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote,
137            boolean askForReturnArraysOrReturnNull
138        )
139    {
140        return resolveAllSRC
141            (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull);
142    }
143
144    /**
145     * This method shall resolve all partial {@code URL} addresses that are found within
146     * {@code TagNode} elements having {@code 'SRC=...'} attributes.  Each instance of
147     * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'}
     * attribute - if the {@code 'URL'} is only partially resolved - shall be updated and replaced
149     * with a new {@code TagNode} with a fully resolved {@code URL}.
150     * 
151     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
152     * 
153     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP">
154     * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC">
155     * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC">
156     * 
157     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
158     * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved.
159     * 
160     * @param quote A choice for the quotes to use.  In most cases, {@code URL} attribute
161     * <B STYLE="color: red;">values</B> do not contain quotation-marks.  So likely either
162     * choice would work just fine, without exceptions.
163     * 
164     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is
165     * the original quotation marks found in the {@code TagNode's 'SRC'} attribute will be
166     * reused.  Passing null to this parameter should almost always be easiest, safest.
167     * 
168     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
169     * facilitate retrieving more information from this method - <I>if necessary</I>.  When this
170     * parameter receives the following values:
171     * 
172     * <BR /><BR /><UL CLASS="JDUL">
173     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
174     *      <B>{@code Returns:}</B> section of this method's documentation.
175     * </LI>
176     * <LI><B>FALSE:</B> This method shall return null.
177     * </LI>
178     * </UL>
179     * 
180     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 
181     * <B>FALSE</B>, this method shall return null.  Otherwise, (if passed <B>TRUE</B>), then
182     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
183     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
184     *
185     * <BR /><BR />
186     * Three arrays are returned as a result of this method's invocation.  Keep in mind that
     * though the information might be superfluous, discarding these arrays is easy.
     * They are provided as a matter of convenience for cases where more detailed information is
189     * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated.
190     * 
191     * <BR /><BR /><OL CLASS="JDOL">
192     * <LI> {@code Ret3.a (int[])}
193     *      <BR /><BR />
194     *      The first {@code int[] array} shall contain a list of the index of every
     *      {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</I>
     *      </B> a non-null HTML {@code 'SRC'} Attribute.
197     *      <BR /><BR />
198     * </LI>
199     * <LI> {@code Ret3.b (int[])}
200     *      <BR /><BR />
201     *      The second {@code int[] array} will contain an index-list of the indices
202     *      which contained {@code TagNode's} that were <B><I>replaced</I></B> by the
203     *      internal-resolve logic.
204     *      <BR /><BR />
205     * </LI>
206     * <LI> {@code Ret3.c (int[])}
207     *      <BR /><BR />
208     *      The third {@code int[] array} will contain an index-list of the indices
209     *      which contained {@code TagNode's} whose {@code 'SRC=...'} attribute
     *      <B><I>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a
211     *      {@code QuotesException} to throw.
212     * </LI>
213     * </OL>
214     * 
215     * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX">
216     * 
217     * @see #resolve(String, URL)
218     * @see TagNode#AV(String)
219     * @see TagNode#setAV(String, String, SD)
220     */
221    public static Ret3<int[], int[], int[]> resolveAllSRC(
222            Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote,
223            boolean askForReturnArraysOrReturnNull
224        )
225    {
226        // Retrieve the Vector-location of any TagNode on the page that has
227        // a "SRC=..." attribute.  These are almost always HTML <IMG> elements.
228        // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time.
229        //       The @SuppressWarnings is to overcome the cast of 'html'
230        @SuppressWarnings("unchecked")
231        int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src");
232
233        // Java Stream's are convenient for keeping "Growing Lists" of return values.
234        // This builder shall keep a list of all URL's that failed to update - for any reason
235        // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL,
236        IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull
237            ? IntStream.builder() 
238            : null;
239
240        // This stream will keep a list of all URL's that were updated, and whose TagNode's
241        // were replaced inside the input HTML Vector
242        IntStream.Builder replaced = askForReturnArraysOrReturnNull
243            ? IntStream.builder()
244            : null;
245
246        for (int pos : hasSrcPosArr)
247        {
248            // Get the node at the index
249            TagNode tn = (TagNode) html.elementAt(pos);
250
251            // 1) Retrieve the SRC Attribute
252            // 2) if it is a partial-URL resolve it
253            // 3) Convert to a String
254            String  oldURL = tn.AV("src");
255            URL     newURL = resolve(oldURL, sourcePage);
256
257            // Some URL's cannot be resolved, if so, just skip this TagNode.
258            // Log the index to the stream (if requested), and continue.
259            if (newURL == null)
260            { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
261
262            // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode;
263            // No logging needed here, the URL was *already* resolved...
264            if (oldURL.length() == newURL.toString().length()) continue;
265
266            // Replace the SRC Attribute in the TagNode.  This builds a new instance of TagNode
267            // If there is an exception, log the index to the stream (if requested), and continue.
268            try
269                { tn = tn.setAV("src", newURL.toString(), quote); }
270            catch (QuotesException qex)
271                { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
272
273            // Replace the index in the Vector containing the old TagNode with the new one.
274            html.setElementAt(tn , pos);
275
276            // The Vector-Index at this position had it's old TagNode removed and replaced with a
277            // new updated one.  Log this to the stream-list so to allow the user to know.
278            if (askForReturnArraysOrReturnNull) replaced.accept(pos);
279        }
280
281        return askForReturnArraysOrReturnNull
282            ? new Ret3<int[], int[], int[]>
283                (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray())
284            : null;
285    }
286
287    // ********************************************************************************************
288    // Complete Vector-Resolve Methods - HREF-ATTRIBUTE
289    // ********************************************************************************************
290
291    /**
292     * Convenience Method.
293     * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)}
294     */
295    public static Ret3<int[], int[], int[]> resolveAllHREF(
296            Vector<? super TagNode> html, URL sourcePage, SD quote,
297            boolean askForReturnArraysOrReturnNull
298        )
299    { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); }
300
301    /**
302     * Convenience Method.
303     * <BR />Accepts: {@code DotPair}.
304     * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)}
305     */
306    public static Ret3<int[], int[], int[]> resolveAllHREF(
307            Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote,
308            boolean askForReturnArraysOrReturnNull
309        )
310    {
311        return resolveAllHREF
312            (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 
313    }
314
315    /**
316     * This method shall resolve all partial {@code URL} addresses that are found within
317     * {@code TagNode} elements having {@code 'HREF=...'} attributes.  Each instance of
318     * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'}
     * attribute - if the {@code 'URL'} is only partially resolved - shall be updated and replaced
320     * with a new {@code TagNode} with a fully resolved {@code URL}.
321     * 
322     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
323     * 
324     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP">
325     * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC">
326     * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC">
327     * 
328     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
329     * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved.
330     * 
331     * @param quote A choice for the quotes to use.  In most cases, {@code URL} attribute
332     * <B STYLE="color: red;">values</B> do not contain quotation-marks.  So likely either
333     * choice would work just fine, without exceptions.
334     * 
335     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is
336     * the original quotation marks found in the {@code TagNode's 'HREF'} attribute will be
337     * reused.  Passing null to this parameter should almost always be easiest, safest.
338     * 
339     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
340     * facilitate retrieving more information from this method - <I>if necessary</I>.  When this
341     * parameter receives the following values:
342     * 
343     * <BR /><BR /><UL CLASS="JDUL">
344     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
345     *      <B>{@code Returns:}</B> section of this method's documentation.
346     * </LI>
347     * <LI><B>FALSE:</B> This method shall return null.
348     * </LI>
349     * </UL>
350     * 
351     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 
352     * <B>FALSE</B>, this method shall return null.  Otherwise, (if passed <B>TRUE</B>), then
353     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
354     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
355     *
356     * <BR /><BR />
357     * Three arrays are returned as a result of this method's invocation.  Keep in mind that
     * though the information might be superfluous, discarding these arrays is easy.
     * They are provided as a matter of convenience for cases where more detailed information is
360     * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated.
361     * 
362     * <BR /><BR /><OL CLASS="JDOL">
363     * <LI> {@code Ret3.a (int[])}
364     *      <BR /><BR />
365     *      The first {@code int[] array} shall contain a list of the index of every
     *      {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</I>
     *      </B> a non-null HTML {@code 'HREF'} Attribute.
368     *      <BR /><BR />
369     * </LI>
370     * <LI> {@code Ret3.b (int[])}
371     *      <BR /><BR />
372     *      The second {@code int[] array} will contain an index-list of the indices
373     *      which contained {@code TagNode's} that were <B><I>replaced</I></B> by the
374     *      internal-resolve logic.
375     *      <BR /><BR />
376     * </LI>
377     * <LI> {@code Ret3.c (int[])}
378     *      <BR /><BR />
379     *      The third {@code int[] array} will contain an index-list of the indices
380     *      which contained {@code TagNode's} whose {@code 'HREF=...'} attribute
     *      <B><I>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a
382     *      {@code QuotesException} to throw.
383     * </LI>
384     * </OL>
385     * 
386     * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX">
387     * 
388     * @see #resolve(String, URL)
389     * @see TagNode#AV(String)
390     * @see TagNode#setAV(String, String, SD)
391     */
392    public static Ret3<int[], int[], int[]> resolveAllHREF(
393            Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote,
394            boolean askForReturnArraysOrReturnNull
395        )
396    {
397        // Retrieve the Vector-location of any TagNode on the page that has
398        // a "HREF=..." attribute.  These are almost always HTML <IMG> elements.
399        // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time.
400        //       The @SuppressWarnings is to overcome the cast of 'html'
401        @SuppressWarnings("unchecked")
402        int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href");
403
404        // Java Stream's are convenient for keeping "Growing Lists" of return values.
405        // This builder shall keep a list of all URL's that failed to update - for any reason
406        // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL,
407        IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull
408            ? IntStream.builder() 
409            : null;
410
411        // This stream will keep a list of all URL's that were updated, and whose TagNode's
412        // were replaced inside the input HTML Vector
413        IntStream.Builder replaced = askForReturnArraysOrReturnNull
414            ? IntStream.builder()
415            : null;
416
417        for (int pos : hasHRefPosArr)
418        {
419            // Get the node at the index
420            TagNode tn = (TagNode) html.elementAt(pos);
421
422            // 1) Retrieve the HREF Attribute
423            // 2) if it is a partial-URL resolve it
424            // 3) Convert to a String
425            String  oldURL = tn.AV("HREF");
426            URL     newURL = resolve(oldURL, sourcePage);
427
428            // Some URL's cannot be resolved, if so, just skip this TagNode.
429            // Log the index to the stream (if requested), and continue.
430            if (newURL == null)
431            { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
432
433            // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode;
434            // No logging needed here, the URL was *already* resolved...
435            if (oldURL.length() == newURL.toString().length()) continue;
436
437            // Replace the HREF Attribute in the TagNode.  This builds a new instance of TagNode
438            // If there is an exception, log the index to the stream (if requested), and continue.
439            try
440                { tn = tn.setAV("href", newURL.toString(), quote); }
441            catch (QuotesException qex)
442                { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
443
444            // Replace the index in the Vector containing the old TagNode with the new one.
445            html.setElementAt(tn , pos);
446
447            // The Vector-Index at this position had it's old TagNode removed and replaced with a
448            // new updated one.  Log this to the stream-list so to allow the user to know.
449            if (askForReturnArraysOrReturnNull) replaced.accept(pos);
450        }
451
452        return askForReturnArraysOrReturnNull
453            ? new Ret3<int[], int[], int[]>
454                (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray())
455            : null;
456    }
457
458    // ********************************************************************************************
459    // Resolve, Not Keep Exceptions
460    // ********************************************************************************************
461
462
463    /**
464     * Convenience Method.
465     * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}.
466     * <BR />And-Then: {@link TagNode#setAV(String, String, SD)}
467     */
468    public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage)
469    { 
470        URL url = resolveHREF(tnWithHREF, sourcePage);
471
472        return (url == null)
473            ? null
474            : tnWithHREF.setAV("href", url.toString(), null);
475    }
476
477
478    /**
479     * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag
480     * (attribute).
481     * 
482     * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF>
483     * 
484     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode}
485     * (possibly-relative) {@code URL} will be resolved.
486     * 
487     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
488     * directory.  Null is returned if attempting to build the {@code URL} generated a
489     * {@code MalformedURLException}.
490     * 
491     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
492     * {@code MalformedURLException's}.
493     * 
494     * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does
495     * not actually contain an {@code HREF} attribute, then this exception shall throw.
496     * 
497     * @see #resolve(String, URL)
498     * @see TagNode#AV(String)
499     */
500    public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage)
501    {
502        String href = tnWithHREF.AV("href");
503
504        if (href == null) throw new HREFException(
505            "The TagNode passed to parameter tnWithHREF does not actually contain an " +
506            "HREF attribute."
507        );
508
509        return resolve(href, sourcePage);
510    }
511
512
513    /**
514     * Convenience Method.
515     * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 
516     * <BR />And-Then: {@link TagNode#setAV(String, String, SD)}
517     */
518    public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage)
519    { 
520        URL url = resolveSRC(tnWithSRC, sourcePage);
521
522        return (url == null) 
523            ? null 
524            : tnWithSRC.setAV("src", url.toString(), null);
525    }
526
527
528    /**
529     * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag
530     * (attribute).
531     * 
532     * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC>
533     * 
534     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode}
535     * (possibly-relative) {@code URL} will be resolved.
536     * 
537     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
538     * directory.  Null is returned if attempting to build the {@code URL} generated a
539     * {@code MalformedURLException}.
540     * 
541     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
542     * {@code MalformedURLException's}.
543     * 
544     * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not
545     * actually contain a {@code SRC} attribute, then this exception shall throw.
546     * 
547     * @see #resolve(String, URL)
548     * @see TagNode#AV(String)
549     */
550    public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage)
551    {
552        String src = tnWithSRC.AV("src");
553
554        if (src == null) throw new SRCException(
555            "The TagNode passed to parameter tnWithSRC does not actually contain a " +
556            "SRC attribute."
557        );
558
559        return resolve(src, sourcePage);
560    }
561
562    /**
563     * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'}
564     * inner-tag (attribute).
565     * 
566     * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF>
567     * 
568     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
569     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
570     * 
571     * @return A list of {@code URL's}, each of which have been completed/resolved with the 
572     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
573     * result in a null value in the {@code Vector}.
574     * 
575     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
576     * 
577     * @see #resolve(String, URL)
578     * @see TagNode#AV(String)
579     */
580    public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage)
581    {
582        Vector<URL> ret = new Vector<>();
583
584        for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage));
585
586        return ret;
587    }
588
589
590    /**
591     * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'}
592     * inner-tag (attribute).
593     * 
594     * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC>
595     * 
596     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
597     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
598     * 
599     * @return A list of {@code URL's}, each of which have been completed/resolved with the
600     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
601     * result in a null value in the {@code Vector.}
602     * 
603     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
604     * 
605     * @see #resolve(String, URL)
606     * @see TagNode#AV(String)
607     */
608    public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage)
609    {
610        Vector<URL> ret = new Vector<>();
611
612        for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage));
613
614        return ret;
615    }
616
617
618    /**
619     * This will use a "pointer array" - an array containing indexes into the downloaded page to
620     * retrieve {@code TagNode's}.  The {@code TagNode's} to which this pointer-array points -
621     * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}.
622     * 
623     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
624     * 
625     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
626     * 
627     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
628     * reference {@code TagNode's} that contain {@code HREF} attributes.  Integer-pointer Arrays
629     * are usually returned from the {@code package 'NodeSearch'} "Find" methods.
630     *
631     * <DIV CLASS="EXAMPLE">{@code 
632     * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's.  The term 'pointer' refers to
633     * // integer-indices into the vectorized-html variable 'page'
634     * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a");
635     * 
     * // Extract each HREF inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
637     * // the URL is only partially-resolved
638     * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage);
639     * }</DIV>
640     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
641     * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
642     * {@code 'html'}, and then resolve any shortened {@code URL's}. 
643     *
644     * @param sourcePage This is the source page {@code URL} from whence the (possibly relative)
645     * {@code TagNode URL's} in the {@code Vector} are to be resolved.
646     *
647     * @return A list of {@code URL's}, each of which have been completed/resolved with the
648     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
649     * result in a null value in the {@code Vector}.  However, if any of the nodes pointed to by
650     * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then
651     * this mistake shall generate {@code TagNodeExpectedException's}.
652     *
653     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
654     *
655     * @throws ArrayIndexOutOfBoundsException
656     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
657     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
658     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
659     *
660     * @see #resolve(String, URL)
661     * @see TagNode#AV(String)
662     */
663    public static Vector<URL> resolveHREFs
664        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
665    {
666        Vector<URL> ret = new Vector<>();                           // Return Vector
667
668        for (int nodePos : nodePosArr)
669        {
670            HTMLNode n = html.elementAt(nodePos);
671            if (! n.isTagNode())                                    // Must be an HTML TagNode
672                throw new TagNodeExpectedException(nodePos);
673
674            TagNode tn = (TagNode) n;
675            if (tn.isClosing)                                       // Must be an "Opening" HTML TagNode
676                throw new OpeningTagNodeExpectedException(nodePos);
677
678            ret.addElement(resolve(tn.AV("href"), sourcePage));     // Resolve the 'HREF', save the URL
679        }
680
681        return ret;
682    }
683 
684
685    /**
686     * This will use a "pointer array" - an array containing indexes into the downloaded page to
687     * retrieve {@code TagNode's}.  The {@code TagNode's} to which this pointer-array points - must
688     * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}.
689     * 
690     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
691     *
692     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
693     * 
694     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
695     * reference {@code TagNode's} that contain {@code SRC} attributes.  Integer-pointer Arrays are
696     * usually returned from the {@code package 'NodeSearch'} "Find" methods.
697     *
698     * <DIV CLASS="EXAMPLE">{@code 
699     * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's.  The term 'pointer' refers to
700     * // integer-indices into the vectorized-html variable 'page'
701     * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
702     * 
     * // Extract each SRC inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
704     * // the URL is only partially-resolved
705     * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage);
706     * }</DIV>
707     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
708     * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened image {@code URL's}.
710     *
711     * @param sourcePage This is the source page {@code URL} from whence the (possibly relative)
712     * {@code TagNode URL's} in the {@code Vector} are to be resolved.
713     *
714     * @return A list of {@code URL's}, each of which have been completed/resolved with the
715     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
716     * result in a null value in the {@code Vector}.  However, if any of the nodes pointed to by
717     * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then
718     * this mistake shall generate {@code TagNodeExpectedException's}.
719     *
720     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
721     *
722     * @throws ArrayIndexOutOfBoundsException
723     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
724     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
725     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
726     *
727     * @see #resolve(String, URL)
728     * @see TagNode#AV(String)
729     */
730    public static Vector<URL> resolveSRCs
731        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
732    {
733        Vector<URL> ret = new Vector<>();                               // Return Vector
734
735        for (int nodePos : nodePosArr)
736        {
737            HTMLNode n = html.elementAt(nodePos);
738            if (! n.isTagNode())                                        // Must be an HTML TagNode
739                throw new TagNodeExpectedException(nodePos);
740
741            TagNode tn = (TagNode) n;
742            if (tn.isClosing)                                           // Must be an "Opening" HTML TagNode
743                throw new OpeningTagNodeExpectedException(nodePos);
744
745            ret.addElement(resolve(tn.AV("src"), sourcePage));          // Resolve the "SRC", save the URL
746        }
747
748        return ret;
749    }
750
751
752    /**
753     * This will convert <I><B>a list of </B></I> simple java {@code String's} to a
754     * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the
755     * {@code 'sourcePage'} parameter.
756     * 
757     * @param src a list of strings - usually partially or totally completed Internet {@code URL's}
758     * 
759     * @param sourcePage This is the source page {@code URL} from which the {@code String's}
760     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
761     * 
762     * @return A list of {@code URL's}, each of which have been completed/resolved with the
763     * {@code 'sourcePage'} parameter.  If there were any {@code String's} that were zero-length or
     * null, then null is returned in the related {@code Vector} position.  If any
     * {@code String} causes a {@code MalformedURLException}, then that position in the
766     * {@code Vector} will be null.
767     * 
768     * @see #resolve(String, URL)
769     */
770    public static Vector<URL> resolve(Vector<String> src, URL sourcePage)
771    {
772        Vector<URL> ret = new Vector<>();
773
774        for (String s : src) ret.addElement(resolve(s, sourcePage));
775
776        return ret;
777    }
778
779    /**
780     * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing
781     * information using the {@code 'sourcePage'} parameter.
782     * 
783     * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and
784     * needs to be "completed."
785     * 
786     * @param sourcePage This is the source page {@code URL} from which the String
787     * (possibly-relative) {@code URL} will be resolved.
788     * 
789     * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'}
790     * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also
791     * return null.  If a {@code MalformedURLException} is generated, null will also be returned.
792     */
793    public static URL resolve(String src, URL sourcePage)
794    {
795        if (sourcePage == null) throw new NullPointerException(
796            "Though you may provide null to the partial-URL to dereference parameter, null " +
797            "may not be passed to the Source-Page Parameter.  The purpose of the 'resolve' " +
798            "operation is to resolve partial-URLs against a source-page (root) URL. " +
799            "Therefore this is not allowed."
800        );
801
802        if (src == null) return null;
803
804        src = src.trim();
805
806        if (src.length() == 0) return null;
807
808        String srcLC = src.toLowerCase();
809
810        if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null;
811
812        if (srcLC.startsWith("http://") || srcLC.startsWith("https://"))
813            try
814                { return new URL(src); }
815            catch (MalformedURLException e)
816                { return null; }
817
818        if (src.startsWith("//") && (src.charAt(3) != '/'))
819            try
820                { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); }
821            catch (MalformedURLException e)
822                { return null; }
823        
824        if (src.startsWith("/"))
825            try
826            { 
827                return new URL(
828                    sourcePage.getProtocol().toLowerCase() + "://" +
829                    sourcePage.getHost().toLowerCase() +
830                    src
831                );
832            }
833            catch (MalformedURLException e)
834                { return null; }
835 
836        if (src.startsWith("../"))
837        {
838            String  sourcePageStr   = sourcePage.toString();
839            short   nLevels         = 0;
840
841            do      { nLevels++;  src = src.substring(3); }
842            while   (src.startsWith("../"));
843
844            String  directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels);
845
846            try     { return new URL(directory + src); }
847            catch   (Exception e) { return null; }
848        }
849
850        String  root    = sourcePage.getProtocol().toLowerCase() + "://" + 
851                            sourcePage.getHost().toLowerCase();
852        String  path    = sourcePage.getPath().trim();
853        int     pos     = StringParse.findLastFrontSlashPos(path);
854
855        if (pos == -1) throw new StringIndexOutOfBoundsException(
856            "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " +
857            "front-slash character in it's path.  Cannot proceed resolving relative-URL's " +
858            "without this."
859        );
860
861        path = path.substring(0, pos + 1);
862
863        try     { return new URL(root + path + src); }
864        catch   (MalformedURLException e) { return null; }
865    }
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887    // ********************************************************************************************
888    // Resolve, KE - Keep Exceptions
889    // ********************************************************************************************
890
891    /**
892     * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag
893     * (attribute).
894     * 
895     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
896     * 
897     * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF>
898     * 
899     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
900     * (possibly-relative) {@code URL} will be resolved.
901     * 
     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 
     * directory.  If the {@code HREF} attribute is present, but has a zero-length value (after
     * trimming), then null is returned.  If the {@code TagNode} causes a
     * {@code MalformedURLException}, that is returned in {@code Ret2.b}
906     * 
907     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
908     * {@code MalformedURLException's}.
909     * 
910     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
911     * 
912     * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does
913     * not actually contain an {@code HREF} attribute, then this exception shall throw.
914     * 
915     * @see #resolve_KE(String, URL)
916     * @see TagNode#AV(String)
917     * @see Ret2
918     */
919    public static Ret2<URL, MalformedURLException> resolveHREF_KE
920        (TagNode tnWithHREF, URL sourcePage)
921    {
922        String href = tnWithHREF.AV("href");
923
924        if (href == null) throw new HREFException(
925            "The TagNode passed to parameter tnWithHREF does not actually contain an " +
926            "HREF attribute."
927        );
928
929        return resolve_KE(href, sourcePage);
930    }
931
932
933    /**
934     * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag
935     * (attribute).
936     * 
937     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
938     * 
939     * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC>
940     * 
941     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
942     * (possibly-relative) {@code URL} will be resolved.
943     * 
     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
     * directory.  If the {@code SRC} attribute is present, but has a zero-length value (after
     * trimming), then null is returned.  If the {@code TagNode} causes a
     * {@code MalformedURLException}, that is returned in {@code Ret2.b}
947     * 
948     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
949     * {@code MalformedURLException's}.
950     *
951     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
952     * 
953     * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not
954     * actually contain a {@code SRC} attribute, then this exception shall throw.
955     * 
956     * @see #resolve_KE(String, URL)
957     * @see TagNode#AV(String)
958     * @see Ret2
959     */
960    public static Ret2<URL, MalformedURLException> resolveSRC_KE
961        (TagNode tnWithSRC, URL sourcePage)
962    {
963        String src = tnWithSRC.AV("src");
964
965        if (src == null) throw new SRCException(
966            "The TagNode passed to parameter tnWithSRC does not actually contain a " +
967            "SRC attribute."
968        );
969
970        return resolve_KE(src, sourcePage);
971    }
972
973
974    /**
975     * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'}
976     * inner-tag (attribute).
977     * 
978     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
979     * 
980     * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF>
981     * 
982     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
983     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
984     * 
985     * @return A list of {@code URL's}, each of which have been completed/resolved with the
986     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code HREF} tag,
987     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
988     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
989     * exception in {@code Ret2.b}
990     * 
991     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
992     *
993     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
994     * 
995     * @see #resolve_KE(String, URL)
996     * @see TagNode#AV(String)
997     * @see Ret2
998     */
999    public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE
1000        (Iterable<TagNode> tnListWithHREF, URL sourcePage)
1001    {
1002        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1003
1004        for (TagNode tn : tnListWithHREF) ret.addElement(resolve_KE(tn.AV("href"), sourcePage));
1005
1006        return ret;
1007    }
1008
1009
1010    /**
1011     * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'}
1012     * inner-tag (attribute).
1013     * 
1014     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1015     * 
1016     * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC>
1017     * 
1018     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1019     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
1020     * 
1021     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1022     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code SRC} tag,
1023     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1024     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1025     * exception in {@code Ret2.b}
1026     * 
1027     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
1028     *
1029     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1030     * 
1031     * @see #resolve_KE(String, URL)
1032     * @see TagNode#AV(String)
1033     * @see Ret2
1034     */
1035    public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE
1036        (Iterable<TagNode> tnListWithSRC, URL sourcePage)
1037    {
1038        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1039
1040        for (TagNode tn : tnListWithSRC) ret.addElement(resolve_KE(tn.AV("src"), sourcePage));
1041
1042        return ret;
1043    }
1044
1045
1046    /**
1047     * This will use a "pointer array" - an array containing indexes into the downloaded page to
1048     * retrieve {@code TagNode's}.  The {@code TagNode} to which this pointer-array points - must
1049     * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}.
1050     * 
1051     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
1052     * 
1053     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1054     * 
1055     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
1056     * 
1057     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
1058     * reference {@code TagNode's} that contain {@code HREF} attributes.  Integer-pointer Arrays
     * are usually returned from the {@code package 'NodeSearch'} "Find" methods.
1060     *
1061     * <DIV CLASS="EXAMPLE">{@code 
1062     * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's.  The term 'pointer' refers to
1063     * // integer-indices into the vectorized-html variable 'page'
1064     * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a");
1065     * 
1066     * // Extract each HREF inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
1067     * // the URL is only partially-resolved.  If any URL's on the original-page are invalid, the
1068     * // method shall not crash, but save the exception instead.
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx = Links.resolveHREFs_KE(page, anchorPosArr, mySourcePage);
1070     *
1071     * // Print out any "failed" urls
1072     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
1073     *     if (r.b != null) 
1074     *         System.out.println("There was an exception: " + r.b.toString());
1075     * }</DIV>
1076     *
1077     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
1078     * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened {@code URL's}.
1080     *
1081     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1082     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1083     * 
1084     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1085     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code HREF} tag,
1086     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1087     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1088     * exception in {@code Ret2.b}
1089     *
1090     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
1091     *
1092     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1093     *
1094     * @throws ArrayIndexOutOfBoundsException
1095     * 
1096     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
1097     * 
1098     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
1099     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
1100     *
1101     * @see #resolve_KE(String, URL)
1102     * @see TagNode#AV(String)
1103     * @see Ret2
1104     */
1105    public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE
1106        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
1107    {
1108        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1109                                                                    // Return Vector
1110
1111        for (int nodePos : nodePosArr)
1112        {
1113            HTMLNode n = html.elementAt(nodePos);
1114            if (! n.isTagNode())                                    // Must be an HTML TagNode
1115                throw new TagNodeExpectedException(nodePos);
1116
1117            TagNode tn = (TagNode) n;
1118            if (tn.isClosing)                                       // Must be an "Opening" HTML TagNode
1119                throw new OpeningTagNodeExpectedException(nodePos);
1120
1121            ret.addElement(resolve_KE(tn.AV("href"), sourcePage));  // Resolve the "HREF", keep the URL
1122        }
1123
1124        return ret;
1125    }
1126 
1127    /**
1128     * This will use a "pointer array" - an array containing indexes into the downloaded page to
1129     * retrieve {@code TagNode's}.  The {@code TagNode} to which this pointer-array points - must 
1130     * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}.
1131     * 
1132     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
1133     * 
1134     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1135     *
1136     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
1137     * 
1138     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
1139     * reference {@code TagNode's} that contain {@code SRC} attributes.  Integer-pointer Arrays are
     * usually returned from the {@code package 'NodeSearch'} "Find" methods.
1141     *
1142     * <DIV CLASS="EXAMPLE">{@code 
1143     * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's.  The term 'pointer' refers to
1144     * // integer-indices into the vectorized-html variable 'page'
1145     * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
1146     * 
1147     * // Extract each SRC inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
1148     * // the URL is only partially-resolved.  If any URL's on the original-page are invalid,
1149     * // the method shall not crash, but save the exception instead.
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx = Links.resolveSRCs_KE(page, picturePosArr, mySourcePage);
1151     *
1152     * // Print out any "failed" urls
1153     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
1154     *     if (r.b != null) 
1155     *         System.out.println("There was an exception: " + r.b.toString());
1156     * }</DIV>
1157     *
1158     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
1159     * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
1160     * {@code 'html'}, and then resolve any shortened {@code URL's}.
1161     *
1162     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1163     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1164     *
1165     * @return A list of {@code URL's}, each of which have been completed/resolved with the 
1166     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code SRC} tag,
1167     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1168     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1169     * exception in {@code Ret2.b}
1170     *
1171     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
1172     *
1173     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1174     *
1175     * @throws ArrayIndexOutOfBoundsException
1176     * 
1177     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
1178     * 
1179     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
1180     * 
1181     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
1182     *
1183     * @see #resolve_KE(String, URL)
1184     * @see TagNode#AV(String)
1185     * @see Ret2
1186     */
1187    public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE
1188        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
1189    {
1190        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1191                                                                    // Return Vector
1192
1193        for (int nodePos : nodePosArr)
1194        {
1195            HTMLNode n = html.elementAt(nodePos);
1196            if (! n.isTagNode())                                    // Must be an HTML TagNode
1197                throw new TagNodeExpectedException(nodePos);
1198
1199            TagNode tn = (TagNode) n;
1200            if (tn.isClosing)                                       // Must be an "Opening" HTML TagNode
1201                throw new OpeningTagNodeExpectedException(nodePos);
1202
1203            ret.addElement(resolve_KE(tn.AV("src"), sourcePage));   // Resolve "SRC" and keep URL's
1204        }
1205
1206        return ret;
1207    }
1208
1209    /**
1210     * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}.
1211     * 
1212     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1213     * 
1214     * @param src a list of {@code String's} - usually partially or totally completed Internet
1215     * {@code URL's}
1216     * 
1217     * @param sourcePage This is the source page {@code URL} from which the {@code String's}
1218     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1219     * 
1220     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1221     * {@code 'sourcePage'} parameter.  If there were any {@code String's} that were zero-length or
     * null, then null is returned in the related {@code Vector} position.  If any {@code String}
1223     * causes a {@code MalformedURLException}, then that position in the {@code Vector} will
1224     * contain the exception in {@code Ret2.b}
1225     *
1226     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1227     * 
1228     * @see #resolve_KE(String, URL)
1229     * @see Ret2
1230     */
1231    public static Vector<Ret2<URL, MalformedURLException>> resolve_KE
1232        (Vector<String> src, URL sourcePage)
1233    {
1234        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1235
1236        for (String s : src) ret.addElement(resolve_KE(s, sourcePage));
1237
1238        return ret;
1239    }
1240
1241    /**
1242     * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing
1243     * information using the {@code 'sourcePage'} parameter.
1244     * 
1245     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1246     * 
1247     * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and
1248     * needs to be "completed."
1249     * 
1250     * @param sourcePage This is the source page {@code URL} from which the String (possibly
1251     * relative) {@code URL} will be resolved.
1252     * 
1253     * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'}
1254     * parameter. If parameter {@code 'src'} is null or zero-length, null will be returned.  If a
1255     * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>}
1256     * result.
1257     *
1258     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1259     * 
1260     * @see Ret2
1261     */
1262    public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage)
1263    {
1264        if (sourcePage == null) throw new NullPointerException(
1265            "Though you may provide null to the partial-URL to dereference parameter, null " +
1266            "may not be passed to the Source-Page Parameter.  The purpose of the 'resolve' " +
1267            "operation is to resolve partial-URLs against a source-page (root) URL. " +
1268            "Therefore this is not allowed."
1269        );
1270
1271        if (src == null) return null;
1272
1273        src = src.trim();
1274
1275        if (src.length() == 0) return null;
1276
1277        String srcLC = src.toLowerCase();
1278
1279        if (StrCmpr.startsWithXOR
1280                (srcLC, "tel:", "javascript:", "mailto:", "magnet:", "file:", "ftp:", "#"))
1281            return new Ret2<URL, MalformedURLException>
1282                (null, new MalformedURLException(
1283                    "InnerTag/Attribute begins with: " + src.substring(0, 1 + src.indexOf(":")) +
1284                    ", so it is not a hyper-link."
1285                ));
1286
1287
1288        // Includes the first few characters of the URL - for reporting/convenience. 
1289        // If this is an "image", the image-type & name will be included
1290        if (StrCmpr.startsWithXOR(srcLC, "data:", "blob:"))
1291            return new Ret2<URL, MalformedURLException>(null, new MalformedURLException(
1292                "InnerTag/Attribute begins with: " +
1293                ((src.length() > 25) ? src.substring(0, 25) : src) +
1294                ", not a URL."
1295            ));
1296
1297
1298        if (srcLC.startsWith("http://") || srcLC.startsWith("https://"))
1299            try
1300                { return new Ret2<URL, MalformedURLException>(new URL(src), null); }
1301            catch (MalformedURLException e)
1302                { return new Ret2<URL, MalformedURLException>(null, e); }
1303
1304
1305        if (src.startsWith("//") && (src.charAt(3) != '/'))
1306            try
1307            { 
1308                return new Ret2<URL, MalformedURLException>
1309                    (new URL(  sourcePage.getProtocol().toLowerCase() + ":" + src), null);
1310            }
1311            catch (MalformedURLException e)
1312                { return new Ret2<URL, MalformedURLException>(null, e); }
1313
1314
1315        if (src.startsWith("/"))
1316            try
1317            {
1318                return new Ret2<URL, MalformedURLException>(new URL(
1319                    sourcePage.getProtocol().toLowerCase() + "://" +
1320                    sourcePage.getHost().toLowerCase() +
1321                    src), null
1322                );
1323            }
1324            catch (MalformedURLException e)
1325                { return new Ret2<URL, MalformedURLException>(null, e); }
1326
1327
1328        if (src.startsWith("../"))
1329        {
1330            String  sourcePageStr   = sourcePage.toString();
1331            short   nLevels         = 0;
1332
1333            do
1334                { nLevels++;  src = src.substring(3); }
1335            while (src.startsWith("../"));
1336
1337            String  directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels);
1338
1339            try
1340                { return new Ret2<URL, MalformedURLException>(new URL(directory + src), null); }
1341            catch (MalformedURLException e)
1342                { return new Ret2<URL, MalformedURLException>(null, e); }
1343            catch (Exception e)
1344            { 
1345                return new Ret2<URL, MalformedURLException>
1346                    (null,
1347                    new MalformedURLException(e.getClass().getCanonicalName() +
1348                    ":" + e.getMessage())
1349                    );
1350            }
1351        }
1352
1353
1354        String  root    = sourcePage.getProtocol().toLowerCase() + "://" + 
1355                            sourcePage.getHost().toLowerCase();
1356        String  path    = sourcePage.getPath().trim();
1357        int     pos     = StringParse.findLastFrontSlashPos(path);
1358
1359        if (pos == -1) throw new StringIndexOutOfBoundsException(
1360            "The URL you have provided: " + sourcePage.toString() +
1361            " does not have a '/' front-slash character in it's path." +
1362            "Cannot proceed resolving relative-URL's without this."
1363        );
1364
1365        path = path.substring(0, pos + 1);
1366
1367        try
1368            { return new Ret2<URL, MalformedURLException>(new URL(root + path + src), null); }
1369        catch (MalformedURLException e)
1370            { return new Ret2<URL, MalformedURLException>(null, e); }
1371
1372    }
1373}