001package Torello.HTML;
002
003import java.net.*;
004import java.util.*;
005import java.util.stream.IntStream;
006
007import Torello.Java.*;
008
009import Torello.HTML.NodeSearch.InnerTagFind; // Used for an @see reference
010import Torello.HTML.NodeSearch.TagNodeFind;  // Used in getBaseURL
011import Torello.Java.Additional.Ret2;
012import Torello.Java.Additional.Ret3;
013
014/**
 * Utilities for de-referencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}.
016 * 
017 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID=LINKS>
018 * @see ReplaceNodes
019 * @see ReplaceFunction
020 * @see HTMLPage
021 * @see InnerTagFind
022 * @see Ret2
023 */
024@Torello.JavaDoc.StaticFunctional
025public class Links
026{
    // This class exposes only static members; the private constructor prevents instantiation.
    private Links() { }
028
029    /**
030     * List of documented "starter-strings" that are sometimes used in Anchor URL
031     * {@code 'HREF=...'} attributes.
032     * 
033     * @see #NON_URL_HREFS
034     */
    // NOTE(review): "blog:" is not a commonly-seen URI scheme; this entry may have been intended
    // to be "blob:".  Confirm before changing, since the array is part of the protected API.
    protected static final String[] _NON_URL_HREFS =
        { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" };
037
038    /**
039     * This small method just returns the complete list of commonly found Anchor
040     * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.}  This method
041     * actually returns a "clone" of an internally stored {@code String[]} Array.  This is to
042     * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes
043     * is not changed, doctored or modified
044     * 
045     * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'}
046     * 
047     * @see #_NON_URL_HREFS
048     */
049    public static String[] NON_URL_HREFS()
050    { return _NON_URL_HREFS.clone(); }
051
052    /**
     * The methods in this class <B><I>will not automatically extract</I></B> any HTML
054     * {@code <BASE HREF=URL>} definitions that are found on this page.  If the user wishes to
055     * dereference partial / relative {@code URL} definitions that exist on the input page, all the
056     * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this
057     * method should be utilized.
058     *
059     * @param page This may be any HTML page or partial page.  If this page has a valid HTML
060     * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of
061     * {@code class URL}.
062     *
063     * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available
064     * within the input-page parameter {@code 'page'}.  If the page provided does not contain a
065     * {@code BASE URL} definition, then null shall be returned.
066     *
067     * <BR /><BR /><B>NOTE:</B> The HTML Specification clearly states that only one {@code URL}
068     * may be defined using the HTML Element {@code <BASE>}.  Clearly, due to the browser wars,
069     * unspecified / non-deterministic behavior is possible if multiple definitions are provided.
070     * For the purposes of this class, if such a situation arises, an exception is thrown.
071     *
072     * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of
073     * the element {@code <BASE HREF=URL>}, then this exception will throw.
074     *
     * @throws MalformedURLException If a {@code <BASE HREF=URL>} is found / identified within the
     * input page, but that {@code URL} is invalid, then this exception shall throw.
077     * 
078     * @see TagNodeFind
079     * @see Attributes#retrieve(Vector, int[], String)
080     */
081    public static URL getBaseURL(Vector<? extends HTMLNode> page)
082        throws MalformedHTMLException, MalformedURLException
083    {
084        int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base");
085
086        if (posArr.length == 0) return null;
087
088        // NOTE: The cast is all right because 'posArr' only points to TagNode's
089        // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode>
090        // Above, there will be nothing in the 'posArr' if either of those was passed.
091
092        @SuppressWarnings("unchecked")
093        String[]    urls    = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href");
094
095        boolean     found   = false;
096        String      ret     = null;
097
098        for (String url : urls)
099            if ((url != null) && (url.length() > 0))
100                if (found)
101                    throw new MalformedHTMLException(
102                        "The page you have provided has multiple <BASE HREF=URL> definitions.  " +
103                        "However, the HTML Specifications state that pages may provide just one " +
104                        "definition.  If you wish to proceed, retrieve the definitions manually " +
105                        "using class TagNodeFind.all and Attributes.retrieve, as explained in " +
106                        "the JavaDoc pages for this class."
107                    );
108                else 
109                {
110                    found = true;
111                    ret = url;
112                }
113
114        return new URL(ret);                    
115    }
116
117    // ********************************************************************************************
118    // Complete Vector-Resolve Methods - SRC-ATTRIBUTE
119    // ********************************************************************************************
120
121    /**
122     * Convenience Method.
123     * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)}
124     */
125    public static Ret3<int[], int[], int[]> resolveAllSRC(
126            Vector<? super TagNode> html, URL sourcePage, SD quote,
127            boolean askForReturnArraysOrReturnNull
128        )
129    { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); }
130
131    /**
132     * Convenience Method.
133     * <BR />Accepts: {@code DotPair}.
134     * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)}
135     */
136    public static Ret3<int[], int[], int[]> resolveAllSRC(
137            Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote,
138            boolean askForReturnArraysOrReturnNull
139        )
140    {
141        return resolveAllSRC
142            (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull);
143    }
144
145    /**
146     * This method shall resolve all partial {@code URL} addresses that are found within
147     * {@code TagNode} elements having {@code 'SRC=...'} attributes.  Each instance of
148     * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'}
     * attribute - if the {@code 'URL'} is only partially resolved - shall be updated and replaced
150     * with a new {@code TagNode} with a fully resolved {@code URL}.
151     * 
152     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
153     * 
154     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP">
155     * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC">
156     * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC">
157     * 
158     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
159     * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved.
160     * 
161     * @param quote A choice for the quotes to use.  In most cases, {@code URL} attribute
162     * <B STYLE="color: red;">values</B> do not contain quotation-marks.  So likely either
163     * choice would work just fine, without exceptions.
164     * 
165     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is
166     * the original quotation marks found in the {@code TagNode's 'SRC'} attribute will be
167     * reused.  Passing null to this parameter should almost always be easiest, safest.
168     * 
169     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
170     * facilitate retrieving more information from this method - <I>if necessary</I>.  When this
171     * parameter receives the following values:
172     * 
173     * <BR /><BR /><UL CLASS="JDUL">
174     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
175     *      <B>{@code Returns:}</B> section of this method's documentation.
176     * </LI>
177     * <LI><B>FALSE:</B> This method shall return null.
178     * </LI>
179     * </UL>
180     * 
181     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 
182     * <B>FALSE</B>, this method shall return null.  Otherwise, (if passed <B>TRUE</B>), then
183     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
184     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
185     *
186     * <BR /><BR />
187     * Three arrays are returned as a result of this method's invocation.  Keep in mind that
     * though the information might be superfluous, discarding these arrays is easy.
     * They are provided as a matter of convenience for cases where more detailed information is
190     * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated.
191     * 
192     * <BR /><BR /><OL CLASS="JDOL">
193     * <LI> {@code Ret3.a (int[])}
194     *      <BR /><BR />
195     *      The first {@code int[] array} shall contain a list of the index of every
196     *      {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B>
197     *      </I> a non-null HTML {@code 'SRC'} Attribute.
198     *      <BR /><BR />
199     * </LI>
200     * <LI> {@code Ret3.b (int[])}
201     *      <BR /><BR />
202     *      The second {@code int[] array} will contain an index-list of the indices
203     *      which contained {@code TagNode's} that were <B><I>replaced</I></B> by the
204     *      internal-resolve logic.
205     *      <BR /><BR />
206     * </LI>
207     * <LI> {@code Ret3.c (int[])}
208     *      <BR /><BR />
209     *      The third {@code int[] array} will contain an index-list of the indices
210     *      which contained {@code TagNode's} whose {@code 'SRC=...'} attribute
211     *      <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a
212     *      {@code QuotesException} to throw.
213     * </LI>
214     * </OL>
215     * 
216     * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX">
217     * 
218     * @see #resolve(String, URL)
219     * @see TagNode#AV(String)
220     * @see TagNode#setAV(String, String, SD)
221     */
222    public static Ret3<int[], int[], int[]> resolveAllSRC(
223            Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote,
224            boolean askForReturnArraysOrReturnNull
225        )
226    {
227        // Retrieve the Vector-location of any TagNode on the page that has
228        // a "SRC=..." attribute.  These are almost always HTML <IMG> elements.
229        // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time.
230        //       The @SuppressWarnings is to overcome the cast of 'html'
231
232        @SuppressWarnings("unchecked")
233        int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src");
234
235        // Java Stream's are convenient for keeping "Growing Lists" of return values.
236        // This builder shall keep a list of all URL's that failed to update - for any reason
237        // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL
238
239        IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull
240            ? IntStream.builder() 
241            : null;
242
243        // This stream will keep a list of all URL's that were updated, and whose TagNode's
244        // were replaced inside the input HTML Vector
245
246        IntStream.Builder replaced = askForReturnArraysOrReturnNull
247            ? IntStream.builder()
248            : null;
249
250        for (int pos : hasSrcPosArr)
251        {
252            // Get the node at the index
253            TagNode tn = (TagNode) html.elementAt(pos);
254
255            // 1) Retrieve the SRC Attribute
256            // 2) if it is a partial-URL resolve it
257            // 3) Convert to a String
258
259            String  oldURL = tn.AV("src");
260            URL     newURL = resolve(oldURL, sourcePage);
261
262            // Some URL's cannot be resolved, if so, just skip this TagNode.
263            // Log the index to the stream (if requested), and continue.
264
265            if (newURL == null)
266            { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
267
268            // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode;
269            // No logging needed here, the URL was *already* resolved...
270
271            if (oldURL.length() == newURL.toString().length()) continue;
272
273            // Replace the SRC Attribute in the TagNode.  This builds a new instance of TagNode
274            // If there is an exception, log the index to the stream (if requested), and continue.
275
276            try
277                { tn = tn.setAV("src", newURL.toString(), quote); }
278
279            catch (QuotesException qex)
280                { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
281
282            // Replace the index in the Vector containing the old TagNode with the new one.
283            html.setElementAt(tn , pos);
284
285            // The Vector-Index at this position had it's old TagNode removed and replaced with a
286            // new updated one.  Log this to the stream-list so to allow the user to know.
287
288            if (askForReturnArraysOrReturnNull) replaced.accept(pos);
289        }
290
291        return askForReturnArraysOrReturnNull
292
293            ? new Ret3<int[], int[], int[]>
294                (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray())
295            : null;
296    }
297
298    // ********************************************************************************************
299    // Complete Vector-Resolve Methods - HREF-ATTRIBUTE
300    // ********************************************************************************************
301
302    /**
303     * Convenience Method.
304     * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)}
305     */
306    public static Ret3<int[], int[], int[]> resolveAllHREF(
307            Vector<? super TagNode> html, URL sourcePage, SD quote,
308            boolean askForReturnArraysOrReturnNull
309        )
310    { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); }
311
312    /**
313     * Convenience Method.
314     * <BR />Accepts: {@code DotPair}.
315     * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)}
316     */
317    public static Ret3<int[], int[], int[]> resolveAllHREF(
318            Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote,
319            boolean askForReturnArraysOrReturnNull
320        )
321    {
322        return resolveAllHREF
323            (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 
324    }
325
326    /**
327     * This method shall resolve all partial {@code URL} addresses that are found within
328     * {@code TagNode} elements having {@code 'HREF=...'} attributes.  Each instance of
329     * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'}
     * attribute - if the {@code 'URL'} is only partially resolved - shall be updated and replaced
331     * with a new {@code TagNode} with a fully resolved {@code URL}.
332     * 
333     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
334     * 
335     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP">
336     * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC">
337     * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC">
338     * 
339     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
340     * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved.
341     * 
342     * @param quote A choice for the quotes to use.  In most cases, {@code URL} attribute
343     * <B STYLE="color: red;">values</B> do not contain quotation-marks.  So likely either
344     * choice would work just fine, without exceptions.
345     * 
346     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is
347     * the original quotation marks found in the {@code TagNode's 'HREF'} attribute will be
348     * reused.  Passing null to this parameter should almost always be easiest, safest.
349     * 
350     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
351     * facilitate retrieving more information from this method - <I>if necessary</I>.  When this
352     * parameter receives the following values:
353     * 
354     * <BR /><BR /><UL CLASS="JDUL">
355     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
356     *      <B>{@code Returns:}</B> section of this method's documentation.
357     * </LI>
358     * <LI><B>FALSE:</B> This method shall return null.
359     * </LI>
360     * </UL>
361     * 
362     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 
363     * <B>FALSE</B>, this method shall return null.  Otherwise, (if passed <B>TRUE</B>), then
364     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
365     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
366     *
367     * <BR /><BR />
368     * Three arrays are returned as a result of this method's invocation.  Keep in mind that
     * though the information might be superfluous, discarding these arrays is easy.
     * They are provided as a matter of convenience for cases where more detailed information is
371     * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated.
372     * 
373     * <BR /><BR /><OL CLASS="JDOL">
374     * <LI> {@code Ret3.a (int[])}
375     *      <BR /><BR />
376     *      The first {@code int[] array} shall contain a list of the index of every
377     *      {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B>
378     *      </I> a non-null HTML {@code 'HREF'} Attribute.
379     *      <BR /><BR />
380     * </LI>
381     * <LI> {@code Ret3.b (int[])}
382     *      <BR /><BR />
383     *      The second {@code int[] array} will contain an index-list of the indices
384     *      which contained {@code TagNode's} that were <B><I>replaced</I></B> by the
385     *      internal-resolve logic.
386     *      <BR /><BR />
387     * </LI>
388     * <LI> {@code Ret3.c (int[])}
389     *      <BR /><BR />
390     *      The third {@code int[] array} will contain an index-list of the indices
391     *      which contained {@code TagNode's} whose {@code 'HREF=...'} attribute
392     *      <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a
393     *      {@code QuotesException} to throw.
394     * </LI>
395     * </OL>
396     * 
397     * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX">
398     * 
399     * @see #resolve(String, URL)
400     * @see TagNode#AV(String)
401     * @see TagNode#setAV(String, String, SD)
402     */
403    public static Ret3<int[], int[], int[]> resolveAllHREF(
404            Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote,
405            boolean askForReturnArraysOrReturnNull
406        )
407    {
408        // Retrieve the Vector-location of any TagNode on the page that has
409        // a "HREF=..." attribute.  These are almost always HTML <IMG> elements.
410        // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time.
411        //       The @SuppressWarnings is to overcome the cast of 'html'
412
413        @SuppressWarnings("unchecked")
414        int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href");
415
416        // Java Stream's are convenient for keeping "Growing Lists" of return values.
417        // This builder shall keep a list of all URL's that failed to update - for any reason
418        // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL
419
420        IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull
421            ? IntStream.builder() 
422            : null;
423
424        // This stream will keep a list of all URL's that were updated, and whose TagNode's
425        // were replaced inside the input HTML Vector
426
427        IntStream.Builder replaced = askForReturnArraysOrReturnNull
428            ? IntStream.builder()
429            : null;
430
431        for (int pos : hasHRefPosArr)
432        {
433            // Get the node at the index
434            TagNode tn = (TagNode) html.elementAt(pos);
435
436            // 1) Retrieve the HREF Attribute
437            // 2) if it is a partial-URL resolve it
438            // 3) Convert to a String
439
440            String  oldURL = tn.AV("HREF");
441            URL     newURL = resolve(oldURL, sourcePage);
442
443            // Some URL's cannot be resolved, if so, just skip this TagNode.
444            // Log the index to the stream (if requested), and continue.
445
446            if (newURL == null)
447            { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
448
449            // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode;
450            // No logging needed here, the URL was *already* resolved...
451
452            if (oldURL.length() == newURL.toString().length()) continue;
453
454            // Replace the HREF Attribute in the TagNode.  This builds a new instance of TagNode
455            // If there is an exception, log the index to the stream (if requested), and continue.
456
457            try
458                { tn = tn.setAV("href", newURL.toString(), quote); }
459
460            catch (QuotesException qex)
461                { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; }
462
463            // Replace the index in the Vector containing the old TagNode with the new one.
464            html.setElementAt(tn , pos);
465
466            // The Vector-Index at this position had it's old TagNode removed and replaced with a
467            // new updated one.  Log this to the stream-list so to allow the user to know.
468
469            if (askForReturnArraysOrReturnNull) replaced.accept(pos);
470        }
471
472        return askForReturnArraysOrReturnNull
473
474            ? new Ret3<int[], int[], int[]>
475                (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray())
476            : null;
477    }
478
479    // ********************************************************************************************
480    // Resolve, Not Keep Exceptions
481    // ********************************************************************************************
482
483
484    /**
485     * Convenience Method.
486     * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}.
487     * <BR />And-Then: {@link TagNode#setAV(String, String, SD)}
488     */
489    public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage)
490    { 
491        URL url = resolveHREF(tnWithHREF, sourcePage);
492
493        return (url == null)
494            ? null
495            : tnWithHREF.setAV("href", url.toString(), null);
496    }
497
498
499    /**
500     * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag
501     * (attribute).
502     * 
503     * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF>
504     * 
505     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode}
506     * (possibly-relative) {@code URL} will be resolved.
507     * 
508     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
509     * directory.  Null is returned if attempting to build the {@code URL} generated a
510     * {@code MalformedURLException}.
511     * 
512     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
513     * {@code MalformedURLException's}.
514     * 
515     * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does
516     * not actually contain an {@code HREF} attribute, then this exception shall throw.
517     * 
518     * @see #resolve(String, URL)
519     * @see TagNode#AV(String)
520     */
521    public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage)
522    {
523        String href = tnWithHREF.AV("href");
524
525        if (href == null) throw new HREFException(
526            "The TagNode passed to parameter tnWithHREF does not actually contain an " +
527            "HREF attribute."
528        );
529
530        return resolve(href, sourcePage);
531    }
532
533
534    /**
535     * Convenience Method.
536     * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 
537     * <BR />And-Then: {@link TagNode#setAV(String, String, SD)}
538     */
539    public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage)
540    { 
541        URL url = resolveSRC(tnWithSRC, sourcePage);
542
543        return (url == null) 
544            ? null 
545            : tnWithSRC.setAV("src", url.toString(), null);
546    }
547
548
549    /**
550     * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag
551     * (attribute).
552     * 
553     * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC>
554     * 
555     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode}
556     * (possibly-relative) {@code URL} will be resolved.
557     * 
558     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
559     * directory.  Null is returned if attempting to build the {@code URL} generated a
560     * {@code MalformedURLException}.
561     * 
562     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
563     * {@code MalformedURLException's}.
564     * 
565     * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not
566     * actually contain a {@code SRC} attribute, then this exception shall throw.
567     * 
568     * @see #resolve(String, URL)
569     * @see TagNode#AV(String)
570     */
571    public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage)
572    {
573        String src = tnWithSRC.AV("src");
574
575        if (src == null) throw new SRCException(
576            "The TagNode passed to parameter tnWithSRC does not actually contain a " +
577            "SRC attribute."
578        );
579
580        return resolve(src, sourcePage);
581    }
582
583    /**
584     * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'}
585     * inner-tag (attribute).
586     * 
587     * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF>
588     * 
589     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
590     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
591     * 
592     * @return A list of {@code URL's}, each of which have been completed/resolved with the 
593     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
594     * result in a null value in the {@code Vector}.
595     * 
596     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
597     * 
598     * @see #resolve(String, URL)
599     * @see TagNode#AV(String)
600     */
601    public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage)
602    {
603        Vector<URL> ret = new Vector<>();
604
605        for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage));
606
607        return ret;
608    }
609
610
611    /**
612     * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'}
613     * inner-tag (attribute).
614     * 
615     * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC>
616     * 
617     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
618     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
619     * 
620     * @return A list of {@code URL's}, each of which have been completed/resolved with the
621     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
622     * result in a null value in the {@code Vector.}
623     * 
624     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
625     * 
626     * @see #resolve(String, URL)
627     * @see TagNode#AV(String)
628     */
629    public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage)
630    {
631        Vector<URL> ret = new Vector<>();
632
633        for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage));
634
635        return ret;
636    }
637
638
639    /**
640     * This will use a "pointer array" - an array containing indexes into the downloaded page to
641     * retrieve {@code TagNode's}.  The {@code TagNode's} to which this pointer-array points -
642     * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}.
643     * 
644     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
645     * 
646     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC">
647     * 
648     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
649     * reference {@code TagNode's} that contain {@code HREF} attributes.  Integer-pointer Arrays
650     * are usually returned from the {@code package 'NodeSearch'} "Find" methods.
651     *
652     * <DIV CLASS="EXAMPLE">{@code 
653     * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's.  The term 'pointer' refers to
654     * // integer-indices into the vectorized-html variable 'page'
655     * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a");
656     * 
657     * // Extract each HREF inner-tag, and construct a {@code URL}.  Use the 'sourcePage' parameter
658     * // if the URL is only partially-resolved
659     * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage);
660     * }</DIV>
661     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
662     * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
663     * {@code 'html'}, and then resolve any shortened {@code URL's}. 
664     *
665     * @param sourcePage This is the source page {@code URL} from whence the (possibly relative)
666     * {@code TagNode URL's} in the {@code Vector} are to be resolved.
667     *
668     * @return A list of {@code URL's}, each of which have been completed/resolved with the
669     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
670     * result in a null value in the {@code Vector}.  However, if any of the nodes pointed to by
671     * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then
672     * this mistake shall generate {@code TagNodeExpectedException's}.
673     *
674     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
675     *
676     * @throws ArrayIndexOutOfBoundsException
677     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
678     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
679     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
680     *
681     * @see #resolve(String, URL)
682     * @see TagNode#AV(String)
683     */
684    public static Vector<URL> resolveHREFs
685        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
686    {
687        // Return Vector
688        Vector<URL> ret = new Vector<>();
689
690        for (int nodePos : nodePosArr)
691        {
692            HTMLNode n = html.elementAt(nodePos);
693
694            // Must be an HTML TagNode
695            if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos);
696
697            TagNode tn = (TagNode) n;
698
699            // Must be an "Opening" HTML TagNode
700            if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos);
701
702            // Resolve the 'HREF', save the URL
703            ret.addElement(resolve(tn.AV("href"), sourcePage));
704        }
705
706        return ret;
707    }
708 
709
710    /**
711     * This will use a "pointer array" - an array containing indexes into the downloaded page to
712     * retrieve {@code TagNode's}.  The {@code TagNode's} to which this pointer-array points - must
713     * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}.
714     * 
715     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
716     *
717     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
718     * 
719     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
720     * reference {@code TagNode's} that contain {@code SRC} attributes.  Integer-pointer Arrays are
721     * usually returned from the {@code package 'NodeSearch'} "Find" methods.
722     *
723     * <DIV CLASS="EXAMPLE">{@code 
724     * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's.  The term 'pointer' refers to
725     * // integer-indices into the vectorized-html variable 'page'
726     * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
727     * 
     * // Extract each SRC inner-tag, and construct a URL.  Use the 'sourcePage' parameter
729     * // if the URL is only partially-resolved
730     * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage);
731     * }</DIV>
732     * 
733     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
734     * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened image {@code URL's}.
736     *
737     * @param sourcePage This is the source page {@code URL} from whence the (possibly relative)
738     * {@code TagNode URL's} in the {@code Vector} are to be resolved.
739     *
740     * @return A list of {@code URL's}, each of which have been completed/resolved with the
741     * {@code 'sourcePage'} parameter.  Any {@code TagNode} which generated an exception, will
742     * result in a null value in the {@code Vector}.  However, if any of the nodes pointed to by
743     * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then
744     * this mistake shall generate {@code TagNodeExpectedException's}.
745     *
746     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
747     *
748     * @throws ArrayIndexOutOfBoundsException
749     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
750     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
751     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
752     *
753     * @see #resolve(String, URL)
754     * @see TagNode#AV(String)
755     */
756    public static Vector<URL> resolveSRCs
757        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
758    {
759        // Return Vector
760        Vector<URL> ret = new Vector<>();
761
762        for (int nodePos : nodePosArr)
763        {
764            HTMLNode n = html.elementAt(nodePos);
765
766            // Must be an HTML TagNode
767            if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos);
768
769            TagNode tn = (TagNode) n;
770
771            // Must be an "Opening" HTML TagNode
772            if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos);
773
774            // Resolve the "SRC", save the URL
775            ret.addElement(resolve(tn.AV("src"), sourcePage));
776        }
777
778        return ret;
779    }
780
781
782    /**
783     * This will convert <I><B>a list of </B></I> simple java {@code String's} to a
784     * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the
785     * {@code 'sourcePage'} parameter.
786     * 
787     * @param src a list of strings - usually partially or totally completed Internet {@code URL's}
788     * 
789     * @param sourcePage This is the source page {@code URL} from which the {@code String's}
790     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
791     * 
792     * @return A list of {@code URL's}, each of which have been completed/resolved with the
793     * {@code 'sourcePage'} parameter.  If there were any {@code String's} that were zero-length or
794     * null,  then null is returned in the related {@code Vector} position.  If any
795     * {@code TagNode} causes a {@code MalformedURLException}, then that position in the
796     * {@code Vector} will be null.
797     * 
798     * @see #resolve(String, URL)
799     */
800    public static Vector<URL> resolve(Vector<String> src, URL sourcePage)
801    {
802        Vector<URL> ret = new Vector<>();
803
804        for (String s : src) ret.addElement(resolve(s, sourcePage));
805
806        return ret;
807    }
808
809    /**
810     * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing
811     * information using the {@code 'sourcePage'} parameter.
812     * 
813     * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and
814     * needs to be "completed."
815     * 
816     * @param sourcePage This is the source page {@code URL} from which the String
817     * (possibly-relative) {@code URL} will be resolved.
818     * 
819     * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'}
820     * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also
821     * return null.  If a {@code MalformedURLException} is generated, null will also be returned.
822     */
823    public static URL resolve(String src, URL sourcePage)
824    {
825        if (sourcePage == null) throw new NullPointerException(
826            "Though you may provide null to the partial-URL to dereference parameter, null " +
827            "may not be passed to the Source-Page Parameter.  The purpose of the 'resolve' " +
828            "operation is to resolve partial-URLs against a source-page (root) URL. " +
829            "Therefore this is not allowed."
830        );
831
832        if (src == null) return null;
833
834        src = src.trim();
835
836        if (src.length() == 0) return null;
837
838        String srcLC = src.toLowerCase();
839
840        if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null;
841
842        if (srcLC.startsWith("http://") || srcLC.startsWith("https://"))
843
844            try
845                { return new URL(src); }
846
847            catch (MalformedURLException e) { return null; }
848
849        if (src.startsWith("//") && (src.charAt(3) != '/'))
850
851            try
852                { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); }
853
854            catch (MalformedURLException e) { return null; }
855        
856        if (src.startsWith("/"))
857
858            try
859            { 
860                return new URL(
861                    sourcePage.getProtocol().toLowerCase() + "://" +
862                    sourcePage.getHost().toLowerCase() +
863                    src
864                );
865            }
866
867            catch (MalformedURLException e) { return null; }
868 
869        if (src.startsWith("../"))
870        {
871            String  sourcePageStr   = sourcePage.toString();
872            short   nLevels         = 0;
873
874            do      { nLevels++;  src = src.substring(3); }
875            while   (src.startsWith("../"));
876
877            String  directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels);
878
879            try     { return new URL(directory + src); }
880            catch   (Exception e) { return null; }
881        }
882
883        String  root =
884            sourcePage.getProtocol().toLowerCase() + "://" + 
885            sourcePage.getHost().toLowerCase();
886
887        String  path    = sourcePage.getPath().trim();
888        int     pos     = StringParse.findLastFrontSlashPos(path);
889
890        if (pos == -1) throw new StringIndexOutOfBoundsException(
891            "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " +
892            "front-slash character in it's path.  Cannot proceed resolving relative-URL's " +
893            "without this."
894        );
895
896        path = path.substring(0, pos + 1);
897
898        try     { return new URL(root + path + src); }
899        catch   (MalformedURLException e) { return null; }
900    }
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922    // ********************************************************************************************
923    // Resolve, KE - Keep Exceptions
924    // ********************************************************************************************
925
926    /**
927     * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag
928     * (attribute).
929     * 
930     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
931     * 
932     * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF>
933     * 
934     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
935     * (possibly-relative) {@code URL} will be resolved.
936     * 
937     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 
938     * directory.  If there were no {@code HREF} tag, then null is returned.  If
939     * the {@code TagNode} causes a {@code MalformedURLException}, that is returned in
940     * {@code Ret2.b}
941     * 
942     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
943     * {@code MalformedURLException's}.
944     * 
945     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
946     * 
947     * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does
948     * not actually contain an {@code HREF} attribute, then this exception shall throw.
949     * 
950     * @see #resolve_KE(String, URL)
951     * @see TagNode#AV(String)
952     * @see Ret2
953     */
954    public static Ret2<URL, MalformedURLException> resolveHREF_KE
955        (TagNode tnWithHREF, URL sourcePage)
956    {
957        String href = tnWithHREF.AV("href");
958
959        if (href == null) throw new HREFException(
960            "The TagNode passed to parameter tnWithHREF does not actually contain an " +
961            "HREF attribute."
962        );
963
964        return resolve_KE(href, sourcePage);
965    }
966
967
968    /**
969     * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag
970     * (attribute).
971     * 
972     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
973     * 
974     * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC>
975     * 
976     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
977     * (possibly-relative) {@code URL} will be resolved.
978     * 
979     * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or
980     * directory.  If there were no {@code SRC} tag, then null is returned.  If the
981     * {@code TagNode} causes a {@code MalformedURLException}, that is returned in {@code Ret2.b}
982     * 
983     * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 
984     * {@code MalformedURLException's}.
985     *
986     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
987     * 
988     * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not
989     * actually contain a {@code SRC} attribute, then this exception shall throw.
990     * 
991     * @see #resolve_KE(String, URL)
992     * @see TagNode#AV(String)
993     * @see Ret2
994     */
995    public static Ret2<URL, MalformedURLException> resolveSRC_KE
996        (TagNode tnWithSRC, URL sourcePage)
997    {
998        String src = tnWithSRC.AV("src");
999
1000        if (src == null) throw new SRCException(
1001            "The TagNode passed to parameter tnWithSRC does not actually contain a " +
1002            "SRC attribute."
1003        );
1004
1005        return resolve_KE(src, sourcePage);
1006    }
1007
1008
1009    /**
1010     * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'}
1011     * inner-tag (attribute).
1012     * 
1013     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1014     * 
1015     * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF>
1016     * 
1017     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 
1018     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
1019     * 
1020     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1021     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code HREF} tag,
1022     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1023     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1024     * exception in {@code Ret2.b}
1025     * 
1026     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
1027     *
1028     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1029     * 
1030     * @see #resolve_KE(String, URL)
1031     * @see TagNode#AV(String)
1032     * @see Ret2
1033     */
1034    public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE
1035        (Iterable<TagNode> tnListWithHREF, URL sourcePage)
1036    {
1037        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1038
1039        for (TagNode tn : tnListWithHREF) ret.addElement(resolve_KE(tn.AV("href"), sourcePage));
1040
1041        return ret;
1042    }
1043
1044
1045    /**
1046     * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'}
1047     * inner-tag (attribute).
1048     * 
1049     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1050     * 
1051     * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC>
1052     * 
1053     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1054     * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved.
1055     * 
1056     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1057     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code SRC} tag,
1058     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1059     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1060     * exception in {@code Ret2.b}
1061     * 
1062     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
1063     *
1064     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1065     * 
1066     * @see #resolve_KE(String, URL)
1067     * @see TagNode#AV(String)
1068     * @see Ret2
1069     */
1070    public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE
1071        (Iterable<TagNode> tnListWithSRC, URL sourcePage)
1072    {
1073        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1074
1075        for (TagNode tn : tnListWithSRC) ret.addElement(resolve_KE(tn.AV("src"), sourcePage));
1076
1077        return ret;
1078    }
1079
1080
1081    /**
1082     * This will use a "pointer array" - an array containing indexes into the downloaded page to
1083     * retrieve {@code TagNode's}.  The {@code TagNode} to which this pointer-array points - must
1084     * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}.
1085     * 
1086     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
1087     * 
1088     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1089     * 
1090     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
1091     * 
1092     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
1093     * reference {@code TagNode's} that contain {@code HREF} attributes.  Integer-pointer Arrays
     * are usually returned from the {@code package 'NodeSearch'} "Find" methods.
1095     *
1096     * <DIV CLASS="EXAMPLE">{@code 
1097     * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's.  The term 'pointer' refers to
1098     * // integer-indices into the vectorized-html variable 'page'
1099     * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a");
1100     * 
1101     * // Extract each HREF inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
1102     * // the URL is only partially-resolved.  If any URL's on the original-page are invalid, the
1103     * // method shall not crash, but save the exception instead.
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx =
     *     Links.resolveHREFs_KE(page, anchorPosArr, mySourcePage);
1106     *
1107     * // Print out any "failed" urls
1108     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
1109     *     if (r.b != null) 
1110     *         System.out.println("There was an exception: " + r.b.toString());
1111     * }</DIV>
1112     *
1113     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
1114     * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened {@code URL's}.
1116     *
1117     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1118     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1119     * 
1120     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1121     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code HREF} tag,
1122     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1123     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1124     * exception in {@code Ret2.b}
1125     *
1126     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF>
1127     *
1128     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1129     *
1130     * @throws ArrayIndexOutOfBoundsException
1131     * 
1132     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
1133     * 
1134     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
1135     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
1136     *
1137     * @see #resolve_KE(String, URL)
1138     * @see TagNode#AV(String)
1139     * @see Ret2
1140     */
1141    public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE
1142        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
1143    {
1144         // Return Vector
1145        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1146
1147        for (int nodePos : nodePosArr)
1148        {
1149            HTMLNode n = html.elementAt(nodePos);
1150
1151            // Must be an HTML TagNode
1152            if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos);
1153
1154            TagNode tn = (TagNode) n;
1155
1156            // Must be an "Opening" HTML TagNode
1157            if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos);
1158
1159            // Resolve the "HREF", keep the URL
1160            ret.addElement(resolve_KE(tn.AV("href"), sourcePage));
1161        }
1162
1163        return ret;
1164    }
1165 
1166    /**
1167     * This will use a "pointer array" - an array containing indexes into the downloaded page to
1168     * retrieve {@code TagNode's}.  The {@code TagNode} to which this pointer-array points - must 
1169     * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}.
1170     * 
1171     * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE>
1172     * 
1173     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1174     *
1175     * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page)
1176     * 
1177     * @param nodePosArr An array of pointers into the page or sub-page.  The pointers must
1178     * reference {@code TagNode's} that contain {@code SRC} attributes.  Integer-pointer Arrays are
     * usually returned from the {@code package 'NodeSearch'} "Find" methods.
1180     *
1181     * <DIV CLASS="EXAMPLE">{@code 
1182     * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's.  The term 'pointer' refers to
1183     * // integer-indices into the vectorized-html variable 'page'
1184     * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
1185     * 
1186     * // Extract each SRC inner-tag, and construct a URL.  Use the 'sourcePage' parameter if
1187     * // the URL is only partially-resolved.  If any URL's on the original-page are invalid,
1188     * // the method shall not crash, but save the exception instead.
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx =
1190     *      Links.resolveSRCs_KE(page, picturePosArr, mySourcePage);
1191     *
1192     * // Print out any "failed" urls
1193     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
1194     *     if (r.b != null) 
1195     *         System.out.println("There was an exception: " + r.b.toString());
1196     * }</DIV>
1197     *
1198     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
1199     * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
1200     * {@code 'html'}, and then resolve any shortened {@code URL's}.
1201     *
1202     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
1203     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1204     *
1205     * @return A list of {@code URL's}, each of which have been completed/resolved with the 
1206     * {@code 'sourcePage'} parameter.  If there were any {@code TagNode} with no {@code SRC} tag,
1207     * then null is returned in the related {@code Vector} position.  If any {@code TagNode} causes
1208     * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the
1209     * exception in {@code Ret2.b}
1210     *
1211     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC>
1212     *
1213     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1214     *
1215     * @throws ArrayIndexOutOfBoundsException
1216     * 
1217     * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX">
1218     * 
1219     * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX">
1220     * 
1221     * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX">
1222     *
1223     * @see #resolve_KE(String, URL)
1224     * @see TagNode#AV(String)
1225     * @see Ret2
1226     */
1227    public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE
1228        (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage)
1229    {
1230         // Return Vector
1231        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();                                         
1232
1233        for (int nodePos : nodePosArr)
1234        {
1235            HTMLNode n = html.elementAt(nodePos);
1236
1237            // Must be an HTML TagNode
1238            if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos);
1239
1240            TagNode tn = (TagNode) n;
1241
1242            // Must be an "Opening" HTML TagNode
1243            if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos);
1244
1245            // Resolve "SRC" and keep URL's
1246            ret.addElement(resolve_KE(tn.AV("src"), sourcePage));
1247        }
1248
1249        return ret;
1250    }
1251
1252    /**
1253     * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}.
1254     * 
1255     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1256     * 
1257     * @param src a list of {@code String's} - usually partially or totally completed Internet
1258     * {@code URL's}
1259     * 
1260     * @param sourcePage This is the source page {@code URL} from which the {@code String's}
1261     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1262     * 
1263     * @return A list of {@code URL's}, each of which have been completed/resolved with the
1264     * {@code 'sourcePage'} parameter.  If there were any {@code String's} that were zero-length or
1265     * null, then null is returned in the related {@code Vector} position.  If any {@code TagNode} 
1266     * causes a {@code MalformedURLException}, then that position in the {@code Vector} will
1267     * contain the exception in {@code Ret2.b}
1268     *
1269     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1270     * 
1271     * @see #resolve_KE(String, URL)
1272     * @see Ret2
1273     */
1274    public static Vector<Ret2<URL, MalformedURLException>> resolve_KE
1275        (Vector<String> src, URL sourcePage)
1276    {
1277        Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>();
1278
1279        for (String s : src) ret.addElement(resolve_KE(s, sourcePage));
1280
1281        return ret;
1282    }
1283
1284    /**
1285     * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing
1286     * information using the {@code 'sourcePage'} parameter.
1287     * 
1288     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE>
1289     * 
1290     * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and
1291     * needs to be "completed."
1292     * 
1293     * @param sourcePage This is the source page {@code URL} from which the String (possibly
1294     * relative) {@code URL} will be resolved.
1295     * 
1296     * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'}
1297     * parameter. If parameter {@code 'src'} is null or zero-length, null will be returned.  If a
1298     * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>}
1299     * result.
1300     *
1301     * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2>
1302     * 
1303     * @see Ret2
1304     */
1305    public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage)
1306    {
1307        if (sourcePage == null) throw new NullPointerException(
1308            "Though you may provide null to the partial-URL to dereference parameter, null " +
1309            "may not be passed to the Source-Page Parameter.  The purpose of the 'resolve' " +
1310            "operation is to resolve partial-URLs against a source-page (root) URL. " +
1311            "Therefore this is not allowed."
1312        );
1313
1314        if (src == null) return null;
1315
1316        src = src.trim();
1317
1318        if (src.length() == 0) return null;
1319
1320        String srcLC = src.toLowerCase();
1321
1322        if (StrCmpr.startsWithXOR
1323                (srcLC, "tel:", "javascript:", "mailto:", "magnet:", "file:", "ftp:", "#"))
1324
1325            return new Ret2<URL, MalformedURLException>
1326                (null, new MalformedURLException(
1327                    "InnerTag/Attribute begins with: " + src.substring(0, 1 + src.indexOf(":")) +
1328                    ", so it is not a hyper-link."
1329                ));
1330
1331
1332        // Includes the first few characters of the URL - for reporting/convenience. 
1333        // If this is an "image", the image-type & name will be included
1334
1335        if (StrCmpr.startsWithXOR(srcLC, "data:", "blob:"))
1336
1337            return new Ret2<URL, MalformedURLException>(null, new MalformedURLException(
1338                "InnerTag/Attribute begins with: " +
1339                ((src.length() > 25) ? src.substring(0, 25) : src) +
1340                ", not a URL."
1341            ));
1342
1343
1344        if (srcLC.startsWith("http://") || srcLC.startsWith("https://"))
1345
1346            try
1347                { return new Ret2<URL, MalformedURLException>(new URL(src), null); }
1348
1349            catch (MalformedURLException e)
1350                { return new Ret2<URL, MalformedURLException>(null, e); }
1351
1352
1353        if (src.startsWith("//") && (src.charAt(3) != '/'))
1354
1355            try
1356            { 
1357                return new Ret2<URL, MalformedURLException>
1358                    (new URL(  sourcePage.getProtocol().toLowerCase() + ":" + src), null);
1359            }
1360
1361            catch (MalformedURLException e)
1362                { return new Ret2<URL, MalformedURLException>(null, e); }
1363
1364
1365        if (src.startsWith("/"))
1366
1367            try
1368            {
1369                return new Ret2<URL, MalformedURLException>(new URL(
1370                    sourcePage.getProtocol().toLowerCase() + "://" +
1371                    sourcePage.getHost().toLowerCase() +
1372                    src), null
1373                );
1374            }
1375
1376            catch (MalformedURLException e)
1377                { return new Ret2<URL, MalformedURLException>(null, e); }
1378
1379
1380        if (src.startsWith("../"))
1381        {
1382            String  sourcePageStr   = sourcePage.toString();
1383            short   nLevels         = 0;
1384
1385            do
1386                { nLevels++;  src = src.substring(3); }
1387            while (src.startsWith("../"));
1388
1389            String  directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels);
1390
1391            try
1392                { return new Ret2<URL, MalformedURLException>(new URL(directory + src), null); }
1393
1394            catch (MalformedURLException e)
1395                { return new Ret2<URL, MalformedURLException>(null, e); }
1396
1397            catch (Exception e)
1398            { 
1399                return new Ret2<URL, MalformedURLException>
1400                    (null,
1401                    new MalformedURLException(e.getClass().getCanonicalName() +
1402                    ":" + e.getMessage())
1403                    );
1404            }
1405        }
1406
1407
1408        String  root =
1409            sourcePage.getProtocol().toLowerCase() + "://" + 
1410            sourcePage.getHost().toLowerCase();
1411
1412        String  path    = sourcePage.getPath().trim();
1413        int     pos     = StringParse.findLastFrontSlashPos(path);
1414
1415        if (pos == -1) throw new StringIndexOutOfBoundsException(
1416            "The URL you have provided: " + sourcePage.toString() +
1417            " does not have a '/' front-slash character in it's path." +
1418            "Cannot proceed resolving relative-URL's without this."
1419        );
1420
1421        path = path.substring(0, pos + 1);
1422
1423        try
1424            { return new Ret2<URL, MalformedURLException>(new URL(root + path + src), null); }
1425
1426        catch (MalformedURLException e)
1427            { return new Ret2<URL, MalformedURLException>(null, e); }
1428    }
1429}