package Torello.HTML;

import java.net.*;
import java.util.*;
import java.util.stream.IntStream;

import Torello.Java.*;

import Torello.HTML.NodeSearch.InnerTagFind;    // Used for an @see reference
import Torello.HTML.NodeSearch.TagNodeFind;     // Used in getBaseURL
import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.Ret3;

/**
 * Utilities for de-referencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}.
 *
 * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID=LINKS>
 * @see ReplaceNodes
 * @see ReplaceFunction
 * @see HTMLPage
 * @see InnerTagFind
 * @see Ret2
 */
@Torello.JavaDoc.StaticFunctional
public class Links
{
    // Static utility class - the private constructor prevents instantiation.
    private Links() { }

    /**
     * List of documented "starter-strings" that are sometimes used in Anchor URL
     * {@code 'HREF=...'} attributes, but which do not denote a resolvable web-page {@code URL}.
     *
     * <BR /><BR />All entries are lower-case.  This array is consulted by
     * {@link #resolve(String, URL)}, which passes a lower-cased copy of its input to
     * {@code StrCmpr.startsWithXOR} and returns null when that check reports a match.
     *
     * @see #NON_URL_HREFS()
     */
    protected static final String[] _NON_URL_HREFS =
    { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" };

    /**
     * This small method just returns the complete list of commonly found Anchor
     * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.}  This method
     * actually returns a "clone" of an internally stored {@code String[]} Array.  This is to
     * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes
     * is not changed, doctored or modified by the caller.
     *
     * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'}.  A new array is
     * returned on every invocation.
     *
     * @see #_NON_URL_HREFS
     */
    public static String[] NON_URL_HREFS()
    { return _NON_URL_HREFS.clone(); }

    /**
     * The methods in this class <B><I>will not automatically extract</I></B> any HTML
     * {@code <BASE HREF=URL>} definitions that are found on this page.
If the user wishes to 055 * dereference partial / relative {@code URL} definitions that exist on the input page, all the 056 * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this 057 * method should be utilized. 058 * 059 * @param page This may be any HTML page or partial page. If this page has a valid HTML 060 * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of 061 * {@code class URL}. 062 * 063 * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available 064 * within the input-page parameter {@code 'page'}. If the page provided does not contain a 065 * {@code BASE URL} definition, then null shall be returned. 066 * 067 * <BR /><BR /><B>NOTE:</B> The HTML Specification clearly states that only one {@code URL} 068 * may be defined using the HTML Element {@code <BASE>}. Clearly, due to the browser wars, 069 * unspecified / non-deterministic behavior is possible if multiple definitions are provided. 070 * For the purposes of this class, if such a situation arises, an exception is thrown. 071 * 072 * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of 073 * the element {@code <BASE HREF=URL>}, then this exception will throw. 074 * 075 * @throws MalformedURLException If the {@code <BASE HREF=URL>} found / identified within the 076 * input page, but that {@code URL} is invalid, then this exception shall throw. 077 * 078 * @see TagNodeFind 079 * @see Attributes#retrieve(Vector, int[], String) 080 */ 081 public static URL getBaseURL(Vector<? 
extends HTMLNode> page) 082 throws MalformedHTMLException, MalformedURLException 083 { 084 int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base"); 085 086 if (posArr.length == 0) return null; 087 088 // NOTE: The cast is all right because 'posArr' only points to TagNode's 089 // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode> 090 // Above, there will be nothing in the 'posArr' if either of those was passed. 091 092 @SuppressWarnings("unchecked") 093 String[] urls = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href"); 094 095 boolean found = false; 096 String ret = null; 097 098 for (String url : urls) 099 if ((url != null) && (url.length() > 0)) 100 if (found) 101 throw new MalformedHTMLException( 102 "The page you have provided has multiple <BASE HREF=URL> definitions. " + 103 "However, the HTML Specifications state that pages may provide just one " + 104 "definition. If you wish to proceed, retrieve the definitions manually " + 105 "using class TagNodeFind.all and Attributes.retrieve, as explained in " + 106 "the JavaDoc pages for this class." 107 ); 108 else 109 { 110 found = true; 111 ret = url; 112 } 113 114 return new URL(ret); 115 } 116 117 // ******************************************************************************************** 118 // Complete Vector-Resolve Methods - SRC-ATTRIBUTE 119 // ******************************************************************************************** 120 121 /** 122 * Convenience Method. 123 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 124 */ 125 public static Ret3<int[], int[], int[]> resolveAllSRC( 126 Vector<? super TagNode> html, URL sourcePage, SD quote, 127 boolean askForReturnArraysOrReturnNull 128 ) 129 { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 130 131 /** 132 * Convenience Method. 133 * <BR />Accepts: {@code DotPair}. 
134 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 135 */ 136 public static Ret3<int[], int[], int[]> resolveAllSRC( 137 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 138 boolean askForReturnArraysOrReturnNull 139 ) 140 { 141 return resolveAllSRC 142 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 143 } 144 145 /** 146 * This method shall resolve all partial {@code URL} addresses that are found within 147 * {@code TagNode} elements having {@code 'SRC=...'} attributes. Each instance of 148 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'} 149 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 150 * with a new {@code TagNode} with a fully resolved {@code URL}. 151 * 152 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 153 * 154 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP"> 155 * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC"> 156 * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC"> 157 * 158 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 159 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 160 * 161 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 162 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 163 * choice would work just fine, without exceptions. 164 * 165 * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is 166 * the original quotation marks found in the {@code TagNode's 'SRC'} attribute will be 167 * reused. Passing null to this parameter should almost always be easiest, safest. 168 * 169 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 170 * facilitate retrieving more information from this method - <I>if necessary</I>. 
When this 171 * parameter receives the following values: 172 * 173 * <BR /><BR /><UL CLASS="JDUL"> 174 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 175 * <B>{@code Returns:}</B> section of this method's documentation. 176 * </LI> 177 * <LI><B>FALSE:</B> This method shall return null. 178 * </LI> 179 * </UL> 180 * 181 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 182 * <B>FALSE</B>, this method shall return null. Otherwise, (if passed <B>TRUE</B>), then 183 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 184 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 185 * 186 * <BR /><BR /> 187 * Three arrays are returned as a result of this method's invocation. Keep in mind that 188 * though the information might be superfluous, rejecting these arrays away is easy. 189 * They are provided as a matter of convenience for cases where more details information is 190 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 191 * 192 * <BR /><BR /><OL CLASS="JDOL"> 193 * <LI> {@code Ret3.a (int[])} 194 * <BR /><BR /> 195 * The first {@code int[] array} shall contain a list of the index of every 196 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 197 * </I> a non-null HTML {@code 'SRC'} Attribute. 198 * <BR /><BR /> 199 * </LI> 200 * <LI> {@code Ret3.b (int[])} 201 * <BR /><BR /> 202 * The second {@code int[] array} will contain an index-list of the indices 203 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 204 * internal-resolve logic. 
205 * <BR /><BR /> 206 * </LI> 207 * <LI> {@code Ret3.c (int[])} 208 * <BR /><BR /> 209 * The third {@code int[] array} will contain an index-list of the indices 210 * which contained {@code TagNode's} whose {@code 'SRC=...'} attribute 211 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 212 * {@code QuotesException} to throw. 213 * </LI> 214 * </OL> 215 * 216 * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX"> 217 * 218 * @see #resolve(String, URL) 219 * @see TagNode#AV(String) 220 * @see TagNode#setAV(String, String, SD) 221 */ 222 public static Ret3<int[], int[], int[]> resolveAllSRC( 223 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 224 boolean askForReturnArraysOrReturnNull 225 ) 226 { 227 // Retrieve the Vector-location of any TagNode on the page that has 228 // a "SRC=..." attribute. These are almost always HTML <IMG> elements. 229 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 230 // The @SuppressWarnings is to overcome the cast of 'html' 231 232 @SuppressWarnings("unchecked") 233 int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src"); 234 235 // Java Stream's are convenient for keeping "Growing Lists" of return values. 236 // This builder shall keep a list of all URL's that failed to update - for any reason 237 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 238 239 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 240 ? IntStream.builder() 241 : null; 242 243 // This stream will keep a list of all URL's that were updated, and whose TagNode's 244 // were replaced inside the input HTML Vector 245 246 IntStream.Builder replaced = askForReturnArraysOrReturnNull 247 ? 
IntStream.builder() 248 : null; 249 250 for (int pos : hasSrcPosArr) 251 { 252 // Get the node at the index 253 TagNode tn = (TagNode) html.elementAt(pos); 254 255 // 1) Retrieve the SRC Attribute 256 // 2) if it is a partial-URL resolve it 257 // 3) Convert to a String 258 259 String oldURL = tn.AV("src"); 260 URL newURL = resolve(oldURL, sourcePage); 261 262 // Some URL's cannot be resolved, if so, just skip this TagNode. 263 // Log the index to the stream (if requested), and continue. 264 265 if (newURL == null) 266 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 267 268 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 269 // No logging needed here, the URL was *already* resolved... 270 271 if (oldURL.length() == newURL.toString().length()) continue; 272 273 // Replace the SRC Attribute in the TagNode. This builds a new instance of TagNode 274 // If there is an exception, log the index to the stream (if requested), and continue. 275 276 try 277 { tn = tn.setAV("src", newURL.toString(), quote); } 278 279 catch (QuotesException qex) 280 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 281 282 // Replace the index in the Vector containing the old TagNode with the new one. 283 html.setElementAt(tn , pos); 284 285 // The Vector-Index at this position had it's old TagNode removed and replaced with a 286 // new updated one. Log this to the stream-list so to allow the user to know. 287 288 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 289 } 290 291 return askForReturnArraysOrReturnNull 292 293 ? 
new Ret3<int[], int[], int[]> 294 (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 295 : null; 296 } 297 298 // ******************************************************************************************** 299 // Complete Vector-Resolve Methods - HREF-ATTRIBUTE 300 // ******************************************************************************************** 301 302 /** 303 * Convenience Method. 304 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 305 */ 306 public static Ret3<int[], int[], int[]> resolveAllHREF( 307 Vector<? super TagNode> html, URL sourcePage, SD quote, 308 boolean askForReturnArraysOrReturnNull 309 ) 310 { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 311 312 /** 313 * Convenience Method. 314 * <BR />Accepts: {@code DotPair}. 315 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 316 */ 317 public static Ret3<int[], int[], int[]> resolveAllHREF( 318 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 319 boolean askForReturnArraysOrReturnNull 320 ) 321 { 322 return resolveAllHREF 323 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 324 } 325 326 /** 327 * This method shall resolve all partial {@code URL} addresses that are found within 328 * {@code TagNode} elements having {@code 'HREF=...'} attributes. Each instance of 329 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'} 330 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 331 * with a new {@code TagNode} with a fully resolved {@code URL}. 
332 * 333 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 334 * 335 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVECSUP"> 336 * @param sPos <EMBED CLASS="external-html" DATA-FILE-ID="SPOSVEC"> 337 * @param ePos <EMBED CLASS="external-html" DATA-FILE-ID="EPOSVEC"> 338 * 339 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 340 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 341 * 342 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 343 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 344 * choice would work just fine, without exceptions. 345 * 346 * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is 347 * the original quotation marks found in the {@code TagNode's 'HREF'} attribute will be 348 * reused. Passing null to this parameter should almost always be easiest, safest. 349 * 350 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 351 * facilitate retrieving more information from this method - <I>if necessary</I>. When this 352 * parameter receives the following values: 353 * 354 * <BR /><BR /><UL CLASS="JDUL"> 355 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 356 * <B>{@code Returns:}</B> section of this method's documentation. 357 * </LI> 358 * <LI><B>FALSE:</B> This method shall return null. 359 * </LI> 360 * </UL> 361 * 362 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 363 * <B>FALSE</B>, this method shall return null. Otherwise, (if passed <B>TRUE</B>), then 364 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 365 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 366 * 367 * <BR /><BR /> 368 * Three arrays are returned as a result of this method's invocation. 
Keep in mind that 369 * though the information might be superfluous, rejecting these arrays away is easy. 370 * They are provided as a matter of convenience for cases where more details information is 371 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 372 * 373 * <BR /><BR /><OL CLASS="JDOL"> 374 * <LI> {@code Ret3.a (int[])} 375 * <BR /><BR /> 376 * The first {@code int[] array} shall contain a list of the index of every 377 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 378 * </I> a non-null HTML {@code 'HREF'} Attribute. 379 * <BR /><BR /> 380 * </LI> 381 * <LI> {@code Ret3.b (int[])} 382 * <BR /><BR /> 383 * The second {@code int[] array} will contain an index-list of the indices 384 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 385 * internal-resolve logic. 386 * <BR /><BR /> 387 * </LI> 388 * <LI> {@code Ret3.c (int[])} 389 * <BR /><BR /> 390 * The third {@code int[] array} will contain an index-list of the indices 391 * which contained {@code TagNode's} whose {@code 'HREF=...'} attribute 392 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 393 * {@code QuotesException} to throw. 394 * </LI> 395 * </OL> 396 * 397 * @throws IndexOutOfBoundsException <EMBED CLASS="external-html" DATA-FILE-ID="VIOOBEX"> 398 * 399 * @see #resolve(String, URL) 400 * @see TagNode#AV(String) 401 * @see TagNode#setAV(String, String, SD) 402 */ 403 public static Ret3<int[], int[], int[]> resolveAllHREF( 404 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 405 boolean askForReturnArraysOrReturnNull 406 ) 407 { 408 // Retrieve the Vector-location of any TagNode on the page that has 409 // a "HREF=..." attribute. These are almost always HTML <IMG> elements. 410 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 
411 // The @SuppressWarnings is to overcome the cast of 'html' 412 413 @SuppressWarnings("unchecked") 414 int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href"); 415 416 // Java Stream's are convenient for keeping "Growing Lists" of return values. 417 // This builder shall keep a list of all URL's that failed to update - for any reason 418 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 419 420 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 421 ? IntStream.builder() 422 : null; 423 424 // This stream will keep a list of all URL's that were updated, and whose TagNode's 425 // were replaced inside the input HTML Vector 426 427 IntStream.Builder replaced = askForReturnArraysOrReturnNull 428 ? IntStream.builder() 429 : null; 430 431 for (int pos : hasHRefPosArr) 432 { 433 // Get the node at the index 434 TagNode tn = (TagNode) html.elementAt(pos); 435 436 // 1) Retrieve the HREF Attribute 437 // 2) if it is a partial-URL resolve it 438 // 3) Convert to a String 439 440 String oldURL = tn.AV("HREF"); 441 URL newURL = resolve(oldURL, sourcePage); 442 443 // Some URL's cannot be resolved, if so, just skip this TagNode. 444 // Log the index to the stream (if requested), and continue. 445 446 if (newURL == null) 447 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 448 449 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 450 // No logging needed here, the URL was *already* resolved... 451 452 if (oldURL.length() == newURL.toString().length()) continue; 453 454 // Replace the HREF Attribute in the TagNode. This builds a new instance of TagNode 455 // If there is an exception, log the index to the stream (if requested), and continue. 
456 457 try 458 { tn = tn.setAV("href", newURL.toString(), quote); } 459 460 catch (QuotesException qex) 461 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 462 463 // Replace the index in the Vector containing the old TagNode with the new one. 464 html.setElementAt(tn , pos); 465 466 // The Vector-Index at this position had it's old TagNode removed and replaced with a 467 // new updated one. Log this to the stream-list so to allow the user to know. 468 469 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 470 } 471 472 return askForReturnArraysOrReturnNull 473 474 ? new Ret3<int[], int[], int[]> 475 (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 476 : null; 477 } 478 479 // ******************************************************************************************** 480 // Resolve, Not Keep Exceptions 481 // ******************************************************************************************** 482 483 484 /** 485 * Convenience Method. 486 * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}. 487 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 488 */ 489 public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage) 490 { 491 URL url = resolveHREF(tnWithHREF, sourcePage); 492 493 return (url == null) 494 ? null 495 : tnWithHREF.setAV("href", url.toString(), null); 496 } 497 498 499 /** 500 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 501 * (attribute). 502 * 503 * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF> 504 * 505 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 506 * (possibly-relative) {@code URL} will be resolved. 507 * 508 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 509 * directory. Null is returned if attempting to build the {@code URL} generated a 510 * {@code MalformedURLException}. 
511 * 512 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 513 * {@code MalformedURLException's}. 514 * 515 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 516 * not actually contain an {@code HREF} attribute, then this exception shall throw. 517 * 518 * @see #resolve(String, URL) 519 * @see TagNode#AV(String) 520 */ 521 public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage) 522 { 523 String href = tnWithHREF.AV("href"); 524 525 if (href == null) throw new HREFException( 526 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 527 "HREF attribute." 528 ); 529 530 return resolve(href, sourcePage); 531 } 532 533 534 /** 535 * Convenience Method. 536 * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 537 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 538 */ 539 public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage) 540 { 541 URL url = resolveSRC(tnWithSRC, sourcePage); 542 543 return (url == null) 544 ? null 545 : tnWithSRC.setAV("src", url.toString(), null); 546 } 547 548 549 /** 550 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 551 * (attribute). 552 * 553 * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC> 554 * 555 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 556 * (possibly-relative) {@code URL} will be resolved. 557 * 558 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 559 * directory. Null is returned if attempting to build the {@code URL} generated a 560 * {@code MalformedURLException}. 561 * 562 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 563 * {@code MalformedURLException's}. 
564 * 565 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 566 * actually contain a {@code SRC} attribute, then this exception shall throw. 567 * 568 * @see #resolve(String, URL) 569 * @see TagNode#AV(String) 570 */ 571 public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage) 572 { 573 String src = tnWithSRC.AV("src"); 574 575 if (src == null) throw new SRCException( 576 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 577 "SRC attribute." 578 ); 579 580 return resolve(src, sourcePage); 581 } 582 583 /** 584 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 585 * inner-tag (attribute). 586 * 587 * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF> 588 * 589 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 590 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 591 * 592 * @return A list of {@code URL's}, each of which have been completed/resolved with the 593 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 594 * result in a null value in the {@code Vector}. 595 * 596 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF> 597 * 598 * @see #resolve(String, URL) 599 * @see TagNode#AV(String) 600 */ 601 public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage) 602 { 603 Vector<URL> ret = new Vector<>(); 604 605 for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage)); 606 607 return ret; 608 } 609 610 611 /** 612 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 613 * inner-tag (attribute). 
614 * 615 * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC> 616 * 617 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 618 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 619 * 620 * @return A list of {@code URL's}, each of which have been completed/resolved with the 621 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 622 * result in a null value in the {@code Vector.} 623 * 624 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC> 625 * 626 * @see #resolve(String, URL) 627 * @see TagNode#AV(String) 628 */ 629 public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage) 630 { 631 Vector<URL> ret = new Vector<>(); 632 633 for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage)); 634 635 return ret; 636 } 637 638 639 /** 640 * This will use a "pointer array" - an array containing indexes into the downloaded page to 641 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - 642 * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}. 643 * 644 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 645 * 646 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> 647 * 648 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 649 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 650 * are usually returned from the {@code package 'NodeSearch'} "Find" methods. 651 * 652 * <DIV CLASS="EXAMPLE">{@code 653 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 654 * // integer-indices into the vectorized-html variable 'page' 655 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 656 * 657 * // Extract each HREF inner-tag, and construct a {@code URL}. 
Use the 'sourcePage' parameter 658 * // if the URL is only partially-resolved 659 * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage); 660 * }</DIV> 661 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 662 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 663 * {@code 'html'}, and then resolve any shortened {@code URL's}. 664 * 665 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 666 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 667 * 668 * @return A list of {@code URL's}, each of which have been completed/resolved with the 669 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 670 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 671 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 672 * this mistake shall generate {@code TagNodeExpectedException's}. 673 * 674 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF> 675 * 676 * @throws ArrayIndexOutOfBoundsException 677 * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX"> 678 * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX"> 679 * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX"> 680 * 681 * @see #resolve(String, URL) 682 * @see TagNode#AV(String) 683 */ 684 public static Vector<URL> resolveHREFs 685 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 686 { 687 // Return Vector 688 Vector<URL> ret = new Vector<>(); 689 690 for (int nodePos : nodePosArr) 691 { 692 HTMLNode n = html.elementAt(nodePos); 693 694 // Must be an HTML TagNode 695 if (! 
n.isTagNode()) throw new TagNodeExpectedException(nodePos); 696 697 TagNode tn = (TagNode) n; 698 699 // Must be an "Opening" HTML TagNode 700 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 701 702 // Resolve the 'HREF', save the URL 703 ret.addElement(resolve(tn.AV("href"), sourcePage)); 704 } 705 706 return ret; 707 } 708 709 710 /** 711 * This will use a "pointer array" - an array containing indexes into the downloaded page to 712 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - must 713 * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}. 714 * 715 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 716 * 717 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page) 718 * 719 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 720 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 721 * usually returned from the {@code package 'NodeSearch'} "Find" methods. 722 * 723 * <DIV CLASS="EXAMPLE">{@code 724 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 725 * // integer-indices into the vectorized-html variable 'page' 726 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 727 * 728 * // Extract each SRC inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 729 * // if the URL is only partially-resolved 730 * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage); 731 * }</DIV> 732 * 733 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 734 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 735 * {@code 'html'}, and then resolve any shorted image {@code URL's}. 
736 * 737 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 738 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 739 * 740 * @return A list of {@code URL's}, each of which have been completed/resolved with the 741 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 742 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 743 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 744 * this mistake shall generate {@code TagNodeExpectedException's}. 745 * 746 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC> 747 * 748 * @throws ArrayIndexOutOfBoundsException 749 * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX"> 750 * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX"> 751 * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX"> 752 * 753 * @see #resolve(String, URL) 754 * @see TagNode#AV(String) 755 */ 756 public static Vector<URL> resolveSRCs 757 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 758 { 759 // Return Vector 760 Vector<URL> ret = new Vector<>(); 761 762 for (int nodePos : nodePosArr) 763 { 764 HTMLNode n = html.elementAt(nodePos); 765 766 // Must be an HTML TagNode 767 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 768 769 TagNode tn = (TagNode) n; 770 771 // Must be an "Opening" HTML TagNode 772 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 773 774 // Resolve the "SRC", save the URL 775 ret.addElement(resolve(tn.AV("src"), sourcePage)); 776 } 777 778 return ret; 779 } 780 781 782 /** 783 * This will convert <I><B>a list of </B></I> simple java {@code String's} to a 784 * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the 785 * {@code 'sourcePage'} parameter. 
786 * 787 * @param src a list of strings - usually partially or totally completed Internet {@code URL's} 788 * 789 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 790 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 791 * 792 * @return A list of {@code URL's}, each of which have been completed/resolved with the 793 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 794 * null, then null is returned in the related {@code Vector} position. If any 795 * {@code TagNode} causes a {@code MalformedURLException}, then that position in the 796 * {@code Vector} will be null. 797 * 798 * @see #resolve(String, URL) 799 */ 800 public static Vector<URL> resolve(Vector<String> src, URL sourcePage) 801 { 802 Vector<URL> ret = new Vector<>(); 803 804 for (String s : src) ret.addElement(resolve(s, sourcePage)); 805 806 return ret; 807 } 808 809 /** 810 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 811 * information using the {@code 'sourcePage'} parameter. 812 * 813 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 814 * needs to be "completed." 815 * 816 * @param sourcePage This is the source page {@code URL} from which the String 817 * (possibly-relative) {@code URL} will be resolved. 818 * 819 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 820 * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also 821 * return null. If a {@code MalformedURLException} is generated, null will also be returned. 822 */ 823 public static URL resolve(String src, URL sourcePage) 824 { 825 if (sourcePage == null) throw new NullPointerException( 826 "Though you may provide null to the partial-URL to dereference parameter, null " + 827 "may not be passed to the Source-Page Parameter. 
The purpose of the 'resolve' " + 828 "operation is to resolve partial-URLs against a source-page (root) URL. " + 829 "Therefore this is not allowed." 830 ); 831 832 if (src == null) return null; 833 834 src = src.trim(); 835 836 if (src.length() == 0) return null; 837 838 String srcLC = src.toLowerCase(); 839 840 if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null; 841 842 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 843 844 try 845 { return new URL(src); } 846 847 catch (MalformedURLException e) { return null; } 848 849 if (src.startsWith("//") && (src.charAt(3) != '/')) 850 851 try 852 { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); } 853 854 catch (MalformedURLException e) { return null; } 855 856 if (src.startsWith("/")) 857 858 try 859 { 860 return new URL( 861 sourcePage.getProtocol().toLowerCase() + "://" + 862 sourcePage.getHost().toLowerCase() + 863 src 864 ); 865 } 866 867 catch (MalformedURLException e) { return null; } 868 869 if (src.startsWith("../")) 870 { 871 String sourcePageStr = sourcePage.toString(); 872 short nLevels = 0; 873 874 do { nLevels++; src = src.substring(3); } 875 while (src.startsWith("../")); 876 877 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 878 879 try { return new URL(directory + src); } 880 catch (Exception e) { return null; } 881 } 882 883 String root = 884 sourcePage.getProtocol().toLowerCase() + "://" + 885 sourcePage.getHost().toLowerCase(); 886 887 String path = sourcePage.getPath().trim(); 888 int pos = StringParse.findLastFrontSlashPos(path); 889 890 if (pos == -1) throw new StringIndexOutOfBoundsException( 891 "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " + 892 "front-slash character in it's path. Cannot proceed resolving relative-URL's " + 893 "without this." 
894 ); 895 896 path = path.substring(0, pos + 1); 897 898 try { return new URL(root + path + src); } 899 catch (MalformedURLException e) { return null; } 900 } 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 // ******************************************************************************************** 923 // Resolve, KE - Keep Exceptions 924 // ******************************************************************************************** 925 926 /** 927 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 928 * (attribute). 929 * 930 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 931 * 932 * @param tnWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_HREF> 933 * 934 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 935 * (possibly-relative) {@code URL} will be resolved. 936 * 937 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 938 * directory. If there were no {@code HREF} tag, then null is returned. If 939 * the {@code TagNode} causes a {@code MalformedURLException}, that is returned in 940 * {@code Ret2.b} 941 * 942 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 943 * {@code MalformedURLException's}. 944 * 945 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 946 * 947 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 948 * not actually contain an {@code HREF} attribute, then this exception shall throw. 949 * 950 * @see #resolve_KE(String, URL) 951 * @see TagNode#AV(String) 952 * @see Ret2 953 */ 954 public static Ret2<URL, MalformedURLException> resolveHREF_KE 955 (TagNode tnWithHREF, URL sourcePage) 956 { 957 String href = tnWithHREF.AV("href"); 958 959 if (href == null) throw new HREFException( 960 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 961 "HREF attribute." 
962 ); 963 964 return resolve_KE(href, sourcePage); 965 } 966 967 968 /** 969 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 970 * (attribute). 971 * 972 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 973 * 974 * @param tnWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TN_SRC> 975 * 976 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 977 * (possibly-relative) {@code URL} will be resolved. 978 * 979 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 980 * directory. If there were no {@code SRC} tag, then null is returned. If the 981 * {@code TagNode} causes a {@code MalformedURLException}, that is returned in {@code Ret2.b} 982 * 983 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 984 * {@code MalformedURLException's}. 985 * 986 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 987 * 988 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 989 * actually contain a {@code SRC} attribute, then this exception shall throw. 990 * 991 * @see #resolve_KE(String, URL) 992 * @see TagNode#AV(String) 993 * @see Ret2 994 */ 995 public static Ret2<URL, MalformedURLException> resolveSRC_KE 996 (TagNode tnWithSRC, URL sourcePage) 997 { 998 String src = tnWithSRC.AV("src"); 999 1000 if (src == null) throw new SRCException( 1001 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 1002 "SRC attribute." 1003 ); 1004 1005 return resolve_KE(src, sourcePage); 1006 } 1007 1008 1009 /** 1010 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 1011 * inner-tag (attribute). 
1012 * 1013 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1014 * 1015 * @param tnListWithHREF <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_HREF> 1016 * 1017 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1018 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1019 * 1020 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1021 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1022 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1023 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1024 * exception in {@code Ret2.b} 1025 * 1026 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF> 1027 * 1028 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1029 * 1030 * @see #resolve_KE(String, URL) 1031 * @see TagNode#AV(String) 1032 * @see Ret2 1033 */ 1034 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1035 (Iterable<TagNode> tnListWithHREF, URL sourcePage) 1036 { 1037 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1038 1039 for (TagNode tn : tnListWithHREF) ret.addElement(resolve_KE(tn.AV("href"), sourcePage)); 1040 1041 return ret; 1042 } 1043 1044 1045 /** 1046 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 1047 * inner-tag (attribute). 1048 * 1049 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1050 * 1051 * @param tnListWithSRC <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_TNLIST_SRC> 1052 * 1053 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1054 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1055 * 1056 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1057 * {@code 'sourcePage'} parameter. 
If there were any {@code TagNode} with no {@code SRC} tag, 1058 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1059 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1060 * exception in {@code Ret2.b} 1061 * 1062 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC> 1063 * 1064 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1065 * 1066 * @see #resolve_KE(String, URL) 1067 * @see TagNode#AV(String) 1068 * @see Ret2 1069 */ 1070 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1071 (Iterable<TagNode> tnListWithSRC, URL sourcePage) 1072 { 1073 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1074 1075 for (TagNode tn : tnListWithSRC) ret.addElement(resolve_KE(tn.AV("src"), sourcePage)); 1076 1077 return ret; 1078 } 1079 1080 1081 /** 1082 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1083 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1084 * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}. 1085 * 1086 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 1087 * 1088 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1089 * 1090 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page) 1091 * 1092 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1093 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 1094 * are usually return from the {@code package 'NodeSearch'} "Find" methods. 1095 * 1096 * <DIV CLASS="EXAMPLE">{@code 1097 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. 
The term 'pointer' refers to 1098 * // integer-indices into the vectorized-html variable 'page' 1099 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 1100 * 1101 * // Extract each HREF inner-tag, and construct a URL. Use the 'sourcePage' parameter if 1102 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, the 1103 * // method shall not crash, but save the exception instead. 1104 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1105 * Links.resolveHREFs_KE(page, picturePosArr, mySourcePage); 1106 * 1107 * // Print out any "failed" urls 1108 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1109 * if (r.b != null) 1110 * System.out.println("There was an exception: " + r.b.toString()); 1111 * }</DIV> 1112 * 1113 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1114 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1115 * {@code 'html'}., and then resolve any shortened {@code URL's}. 1116 * 1117 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1118 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1119 * 1120 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1121 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1122 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1123 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1124 * exception in {@code Ret2.b} 1125 * 1126 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_HREF> 1127 * 1128 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1129 * 1130 * @throws ArrayIndexOutOfBoundsException 1131 * 1132 * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX"> 1133 * 1134 * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX"> 1135 * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX"> 1136 * 1137 * @see #resolve_KE(String, URL) 1138 * @see TagNode#AV(String) 1139 * @see Ret2 1140 */ 1141 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1142 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1143 { 1144 // Return Vector 1145 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1146 1147 for (int nodePos : nodePosArr) 1148 { 1149 HTMLNode n = html.elementAt(nodePos); 1150 1151 // Must be an HTML TagNode 1152 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1153 1154 TagNode tn = (TagNode) n; 1155 1156 // Must be an "Opening" HTML TagNode 1157 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1158 1159 // Resolve the "HREF", keep the URL 1160 ret.addElement(resolve_KE(tn.AV("href"), sourcePage)); 1161 } 1162 1163 return ret; 1164 } 1165 1166 /** 1167 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1168 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1169 * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}. 
1170 * 1171 * <EMBED CLASS="external-html" DATA-FILE-ID=BASE_URL_NOTE> 1172 * 1173 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1174 * 1175 * @param html <EMBED CLASS="external-html" DATA-FILE-ID="HTMLVEC"> Any HTML page (or sub-page) 1176 * 1177 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1178 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 1179 * usually return from the {@code package 'NodeSearch'} "Find" methods. 1180 * 1181 * <DIV CLASS="EXAMPLE">{@code 1182 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 1183 * // integer-indices into the vectorized-html variable 'page' 1184 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 1185 * 1186 * // Extract each SRC inner-tag, and construct a URL. Use the 'sourcePage' parameter if 1187 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, 1188 * // the method shall not crash, but save the exception instead. 1189 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1190 * Links.resolveSRCs_KE(page, picturePosArr, mySourcePage); 1191 * 1192 * // Print out any "failed" urls 1193 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1194 * if (r.b != null) 1195 * System.out.println("There was an exception: " + r.b.toString()); 1196 * }</DIV> 1197 * 1198 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1199 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1200 * {@code 'html'}, and then resolve any shortened {@code URL's}. 1201 * 1202 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1203 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1204 * 1205 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1206 * {@code 'sourcePage'} parameter. 
If there were any {@code TagNode} with no {@code SRC} tag, 1207 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1208 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1209 * exception in {@code Ret2.b} 1210 * 1211 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_NO_SRC> 1212 * 1213 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1214 * 1215 * @throws ArrayIndexOutOfBoundsException 1216 * 1217 * <EMBED CLASS="external-html" DATA-FILE-ID="ATTRAIOOBEX"> 1218 * 1219 * @throws TagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="TNEEX"> 1220 * 1221 * @throws OpeningTagNodeExpectedException <EMBED CLASS="external-html" DATA-FILE-ID="OTNEEX"> 1222 * 1223 * @see #resolve_KE(String, URL) 1224 * @see TagNode#AV(String) 1225 * @see Ret2 1226 */ 1227 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1228 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1229 { 1230 // Return Vector 1231 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1232 1233 for (int nodePos : nodePosArr) 1234 { 1235 HTMLNode n = html.elementAt(nodePos); 1236 1237 // Must be an HTML TagNode 1238 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1239 1240 TagNode tn = (TagNode) n; 1241 1242 // Must be an "Opening" HTML TagNode 1243 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1244 1245 // Resolve "SRC" and keep URL's 1246 ret.addElement(resolve_KE(tn.AV("src"), sourcePage)); 1247 } 1248 1249 return ret; 1250 } 1251 1252 /** 1253 * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}. 
1254 * 1255 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1256 * 1257 * @param src a list of {@code String's} - usually partially or totally completed Internet 1258 * {@code URL's} 1259 * 1260 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 1261 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1262 * 1263 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1264 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 1265 * null, then null is returned in the related {@code Vector} position. If any {@code TagNode} 1266 * causes a {@code MalformedURLException}, then that position in the {@code Vector} will 1267 * contain the exception in {@code Ret2.b} 1268 * 1269 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1270 * 1271 * @see #resolve_KE(String, URL) 1272 * @see Ret2 1273 */ 1274 public static Vector<Ret2<URL, MalformedURLException>> resolve_KE 1275 (Vector<String> src, URL sourcePage) 1276 { 1277 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1278 1279 for (String s : src) ret.addElement(resolve_KE(s, sourcePage)); 1280 1281 return ret; 1282 } 1283 1284 /** 1285 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 1286 * information using the {@code 'sourcePage'} parameter. 1287 * 1288 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_KE> 1289 * 1290 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 1291 * needs to be "completed." 1292 * 1293 * @param sourcePage This is the source page {@code URL} from which the String (possibly 1294 * relative) {@code URL} will be resolved. 1295 * 1296 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 1297 * parameter. If parameter {@code 'src'} is null or zero-length, null will be returned. 
If a 1298 * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>} 1299 * result. 1300 * 1301 * <EMBED CLASS="external-html" DATA-FILE-ID=LINKS_RET2> 1302 * 1303 * @see Ret2 1304 */ 1305 public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage) 1306 { 1307 if (sourcePage == null) throw new NullPointerException( 1308 "Though you may provide null to the partial-URL to dereference parameter, null " + 1309 "may not be passed to the Source-Page Parameter. The purpose of the 'resolve' " + 1310 "operation is to resolve partial-URLs against a source-page (root) URL. " + 1311 "Therefore this is not allowed." 1312 ); 1313 1314 if (src == null) return null; 1315 1316 src = src.trim(); 1317 1318 if (src.length() == 0) return null; 1319 1320 String srcLC = src.toLowerCase(); 1321 1322 if (StrCmpr.startsWithXOR 1323 (srcLC, "tel:", "javascript:", "mailto:", "magnet:", "file:", "ftp:", "#")) 1324 1325 return new Ret2<URL, MalformedURLException> 1326 (null, new MalformedURLException( 1327 "InnerTag/Attribute begins with: " + src.substring(0, 1 + src.indexOf(":")) + 1328 ", so it is not a hyper-link." 1329 )); 1330 1331 1332 // Includes the first few characters of the URL - for reporting/convenience. 1333 // If this is an "image", the image-type & name will be included 1334 1335 if (StrCmpr.startsWithXOR(srcLC, "data:", "blob:")) 1336 1337 return new Ret2<URL, MalformedURLException>(null, new MalformedURLException( 1338 "InnerTag/Attribute begins with: " + 1339 ((src.length() > 25) ? src.substring(0, 25) : src) + 1340 ", not a URL." 
1341 )); 1342 1343 1344 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 1345 1346 try 1347 { return new Ret2<URL, MalformedURLException>(new URL(src), null); } 1348 1349 catch (MalformedURLException e) 1350 { return new Ret2<URL, MalformedURLException>(null, e); } 1351 1352 1353 if (src.startsWith("//") && (src.charAt(3) != '/')) 1354 1355 try 1356 { 1357 return new Ret2<URL, MalformedURLException> 1358 (new URL( sourcePage.getProtocol().toLowerCase() + ":" + src), null); 1359 } 1360 1361 catch (MalformedURLException e) 1362 { return new Ret2<URL, MalformedURLException>(null, e); } 1363 1364 1365 if (src.startsWith("/")) 1366 1367 try 1368 { 1369 return new Ret2<URL, MalformedURLException>(new URL( 1370 sourcePage.getProtocol().toLowerCase() + "://" + 1371 sourcePage.getHost().toLowerCase() + 1372 src), null 1373 ); 1374 } 1375 1376 catch (MalformedURLException e) 1377 { return new Ret2<URL, MalformedURLException>(null, e); } 1378 1379 1380 if (src.startsWith("../")) 1381 { 1382 String sourcePageStr = sourcePage.toString(); 1383 short nLevels = 0; 1384 1385 do 1386 { nLevels++; src = src.substring(3); } 1387 while (src.startsWith("../")); 1388 1389 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 1390 1391 try 1392 { return new Ret2<URL, MalformedURLException>(new URL(directory + src), null); } 1393 1394 catch (MalformedURLException e) 1395 { return new Ret2<URL, MalformedURLException>(null, e); } 1396 1397 catch (Exception e) 1398 { 1399 return new Ret2<URL, MalformedURLException> 1400 (null, 1401 new MalformedURLException(e.getClass().getCanonicalName() + 1402 ":" + e.getMessage()) 1403 ); 1404 } 1405 } 1406 1407 1408 String root = 1409 sourcePage.getProtocol().toLowerCase() + "://" + 1410 sourcePage.getHost().toLowerCase(); 1411 1412 String path = sourcePage.getPath().trim(); 1413 int pos = StringParse.findLastFrontSlashPos(path); 1414 1415 if (pos == -1) throw new StringIndexOutOfBoundsException( 1416 "The 
URL you have provided: " + sourcePage.toString() + 1417 " does not have a '/' front-slash character in it's path." + 1418 "Cannot proceed resolving relative-URL's without this." 1419 ); 1420 1421 path = path.substring(0, pos + 1); 1422 1423 try 1424 { return new Ret2<URL, MalformedURLException>(new URL(root + path + src), null); } 1425 1426 catch (MalformedURLException e) 1427 { return new Ret2<URL, MalformedURLException>(null, e); } 1428 } 1429}