001package Torello.HTML; 002 003import java.net.*; 004import java.util.*; 005import java.util.stream.IntStream; 006 007import Torello.Java.*; 008 009import Torello.HTML.NodeSearch.InnerTagFind; // Used for an @see reference 010import Torello.HTML.NodeSearch.TagNodeFind; // Used in getBaseURL 011import Torello.Java.Additional.Ret2; 012import Torello.Java.Additional.Ret3; 013 014/** 015 * Utilities for de-refrencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}. 016 * 017 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=LINKS> 018 * @see ReplaceNodes 019 * @see ReplaceFunction 020 * @see HTMLPage 021 * @see InnerTagFind 022 * @see Ret2 023 */ 024@Torello.JavaDoc.StaticFunctional 025public class Links 026{ 027 private Links() { } 028 029 /** 030 * List of documented "starter-strings" that are sometimes used in Anchor URL 031 * {@code 'HREF=...'} attributes. 032 * 033 * @see #NON_URL_HREFS 034 */ 035 protected static final String[] _NON_URL_HREFS = 036 { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" }; 037 038 /** 039 * This small method just returns the complete list of commonly found Anchor 040 * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.} This method 041 * actually returns a "clone" of an internally stored {@code String[]} Array. This is to 042 * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes 043 * is not changed, doctored or modified 044 * 045 * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'} 046 * 047 * @see #_NON_URL_HREFS 048 */ 049 public static String[] NON_URL_HREFS() 050 { return _NON_URL_HREFS.clone(); } 051 052 /** 053 * The methods in this class <I><B>will not automatically extract</I></B> any HTML 054 * {@code <BASE HREF=URL>} definitions that are found on this page. If the user wishes to 055 * dereference partial / relative {@code URL} definitions that exist on the input page, all the 056 * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this 057 * method should be utilized. 058 * 059 * @param page This may be any HTML page or partial page. If this page has a valid HTML 060 * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of 061 * {@code class URL}. 062 * 063 * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available 064 * within the input-page parameter {@code 'page'}. If the page provided does not contain a 065 * {@code BASE URL} definition, then null shall be returned. 066 * 067 * <BR /><BR /><B>NOTE:</B> The HTML Specification clearly states that only one {@code URL} 068 * may be defined using the HTML Element {@code <BASE>}. Clearly, due to the browser wars, 069 * unspecified / non-deterministic behavior is possible if multiple definitions are provided. 070 * For the purposes of this class, if such a situation arises, an exception is thrown. 071 * 072 * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of 073 * the element {@code <BASE HREF=URL>}, then this exception will throw. 074 * 075 * @throws MalformedURLException If the {@code <BASE HREF=URL>} found / identified within the 076 * input page, but that {@code URL} is invalid, then this exception shall throw. 077 * 078 * @see TagNodeFind 079 * @see Attributes#retrieve(Vector, int[], String) 080 */ 081 public static URL getBaseURL(Vector<? extends HTMLNode> page) 082 throws MalformedHTMLException, MalformedURLException 083 { 084 int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base"); 085 086 if (posArr.length == 0) return null; 087 088 // NOTE: The cast is all right because 'posArr' only points to TagNode's 089 // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode> 090 // Above, there will be nothing in the 'posArr' if either of those was passed. 091 092 @SuppressWarnings("unchecked") 093 String[] urls = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href"); 094 095 boolean found = false; 096 String ret = null; 097 098 for (String url : urls) 099 if ((url != null) && (url.length() > 0)) 100 if (found) 101 throw new MalformedHTMLException( 102 "The page you have provided has multiple <BASE HREF=URL> definitions. " + 103 "However, the HTML Specifications state that pages may provide just one " + 104 "definition. If you wish to proceed, retrieve the definitions manually " + 105 "using class TagNodeFind.all and Attributes.retrieve, as explained in " + 106 "the JavaDoc pages for this class." 107 ); 108 else 109 { 110 found = true; 111 ret = url; 112 } 113 114 return new URL(ret); 115 } 116 117 118 // ******************************************************************************************** 119 // ******************************************************************************************** 120 // Complete Vector-Resolve Methods - SRC-ATTRIBUTE 121 // ******************************************************************************************** 122 // ******************************************************************************************** 123 124 125 /** 126 * Convenience Method. 127 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 128 */ 129 public static Ret3<int[], int[], int[]> resolveAllSRC( 130 Vector<? super TagNode> html, URL sourcePage, SD quote, 131 boolean askForReturnArraysOrReturnNull 132 ) 133 { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 134 135 /** 136 * Convenience Method. 137 * <BR />Accepts: {@code DotPair}. 138 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 139 */ 140 public static Ret3<int[], int[], int[]> resolveAllSRC( 141 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 142 boolean askForReturnArraysOrReturnNull 143 ) 144 { 145 return resolveAllSRC 146 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 147 } 148 149 /** 150 * This method shall resolve all partial {@code URL} addresses that are found within 151 * {@code TagNode} elements having {@code 'SRC=...'} attributes. Each instance of 152 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'} 153 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 154 * with a new {@code TagNode} with a fully resolved {@code URL}. 155 * 156 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 157 * 158 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 159 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 160 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 161 * 162 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 163 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 164 * 165 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 166 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 167 * choice would work just fine, without exceptions. 168 * 169 * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is 170 * the original quotation marks found in the {@code TagNode's 'SRC'} attribute will be 171 * reused. Passing null to this parameter should almost always be easiest, safest. 172 * 173 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 174 * facilitate retrieving more information from this method - <I>if necessary</I>. When this 175 * parameter receives the following values: 176 * 177 * <BR /><BR /><UL CLASS=JDUL> 178 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 179 * <B>{@code Returns:}</B> section of this method's documentation. 180 * </LI> 181 * 182 * <LI><B>FALSE:</B> This method shall return null.</LI> 183 * </UL> 184 * 185 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 186 * {@code FALSE}, this method shall return null. Otherwise, (if passed {@code TRUE}), then 187 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 188 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 189 * 190 * <BR /><BR /> 191 * Three arrays are returned as a result of this method's invocation. Keep in mind that 192 * though the information might be superfluous, rejecting these arrays away is easy. 193 * They are provided as a matter of convenience for cases where more details information is 194 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 195 * 196 * <BR /><BR /><OL CLASS=JDOL> 197 * <LI> {@code Ret3.a (int[])} 198 * <BR /><BR /> 199 * The first {@code int[] array} shall contain a list of the index of every 200 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 201 * </I> a non-null HTML {@code 'SRC'} Attribute. 202 * <BR /><BR /> 203 * </LI> 204 * 205 * <LI> {@code Ret3.b (int[])} 206 * <BR /><BR /> 207 * The second {@code int[] array} will contain an index-list of the indices 208 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 209 * internal-resolve logic. 210 * <BR /><BR /> 211 * </LI> 212 * 213 * <LI> {@code Ret3.c (int[])} 214 * <BR /><BR /> 215 * The third {@code int[] array} will contain an index-list of the indices 216 * which contained {@code TagNode's} whose {@code 'SRC=...'} attribute 217 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 218 * {@code QuotesException} to throw. 219 * </LI> 220 * </OL> 221 * 222 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 223 * 224 * @see #resolve(String, URL) 225 * @see TagNode#AV(String) 226 * @see TagNode#setAV(String, String, SD) 227 */ 228 public static Ret3<int[], int[], int[]> resolveAllSRC( 229 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 230 boolean askForReturnArraysOrReturnNull 231 ) 232 { 233 // Retrieve the Vector-location of any TagNode on the page that has 234 // a "SRC=..." attribute. These are almost always HTML <IMG> elements. 235 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 236 // The @SuppressWarnings is to overcome the cast of 'html' 237 238 @SuppressWarnings("unchecked") 239 int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src"); 240 241 // Java Stream's are convenient for keeping "Growing Lists" of return values. 242 // This builder shall keep a list of all URL's that failed to update - for any reason 243 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 244 245 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 246 ? IntStream.builder() 247 : null; 248 249 // This stream will keep a list of all URL's that were updated, and whose TagNode's 250 // were replaced inside the input HTML Vector 251 252 IntStream.Builder replaced = askForReturnArraysOrReturnNull 253 ? IntStream.builder() 254 : null; 255 256 for (int pos : hasSrcPosArr) 257 { 258 // Get the node at the index 259 TagNode tn = (TagNode) html.elementAt(pos); 260 261 // 1) Retrieve the SRC Attribute 262 // 2) if it is a partial-URL resolve it 263 // 3) Convert to a String 264 265 String oldURL = tn.AV("src"); 266 URL newURL = resolve(oldURL, sourcePage); 267 268 // Some URL's cannot be resolved, if so, just skip this TagNode. 269 // Log the index to the stream (if requested), and continue. 270 271 if (newURL == null) 272 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 273 274 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 275 // No logging needed here, the URL was *already* resolved... 276 277 if (oldURL.length() == newURL.toString().length()) continue; 278 279 // Replace the SRC Attribute in the TagNode. This builds a new instance of TagNode 280 // If there is an exception, log the index to the stream (if requested), and continue. 281 282 try 283 { tn = tn.setAV("src", newURL.toString(), quote); } 284 285 catch (QuotesException qex) 286 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 287 288 // Replace the index in the Vector containing the old TagNode with the new one. 289 html.setElementAt(tn , pos); 290 291 // The Vector-Index at this position had it's old TagNode removed and replaced with a 292 // new updated one. Log this to the stream-list so to allow the user to know. 293 294 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 295 } 296 297 return askForReturnArraysOrReturnNull 298 299 ? new Ret3<int[], int[], int[]> 300 (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 301 : null; 302 } 303 304 305 // ******************************************************************************************** 306 // ******************************************************************************************** 307 // Complete Vector-Resolve Methods - HREF-ATTRIBUTE 308 // ******************************************************************************************** 309 // ******************************************************************************************** 310 311 312 /** 313 * Convenience Method. 314 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 315 */ 316 public static Ret3<int[], int[], int[]> resolveAllHREF( 317 Vector<? super TagNode> html, URL sourcePage, SD quote, 318 boolean askForReturnArraysOrReturnNull 319 ) 320 { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 321 322 /** 323 * Convenience Method. 324 * <BR />Accepts: {@code DotPair}. 325 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 326 */ 327 public static Ret3<int[], int[], int[]> resolveAllHREF( 328 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 329 boolean askForReturnArraysOrReturnNull 330 ) 331 { 332 return resolveAllHREF 333 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 334 } 335 336 /** 337 * This method shall resolve all partial {@code URL} addresses that are found within 338 * {@code TagNode} elements having {@code 'HREF=...'} attributes. Each instance of 339 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'} 340 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 341 * with a new {@code TagNode} with a fully resolved {@code URL}. 342 * 343 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 344 * 345 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 346 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 347 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 348 * 349 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 350 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 351 * 352 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 353 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 354 * choice would work just fine, without exceptions. 355 * 356 * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is 357 * the original quotation marks found in the {@code TagNode's 'HREF'} attribute will be 358 * reused. Passing null to this parameter should almost always be easiest, safest. 359 * 360 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 361 * facilitate retrieving more information from this method - <I>if necessary</I>. When this 362 * parameter receives the following values: 363 * 364 * <BR /><BR /><UL CLASS=JDUL> 365 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 366 * <B>{@code Returns:}</B> section of this method's documentation. 367 * </LI> 368 * 369 * <LI><B>FALSE:</B> This method shall return null. </LI> 370 * </UL> 371 * 372 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 373 * {@code FALSE}, this method shall return null. Otherwise, (if passed {@code TRUE}), then 374 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 375 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 376 * 377 * <BR /><BR /> 378 * Three arrays are returned as a result of this method's invocation. Keep in mind that 379 * though the information might be superfluous, rejecting these arrays away is easy. 380 * They are provided as a matter of convenience for cases where more details information is 381 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 382 * 383 * <BR /><BR /><OL CLASS=JDOL> 384 * <LI> {@code Ret3.a (int[])} 385 * <BR /><BR /> 386 * The first {@code int[] array} shall contain a list of the index of every 387 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 388 * </I> a non-null HTML {@code 'HREF'} Attribute. 389 * <BR /><BR /> 390 * </LI> 391 * 392 * <LI> {@code Ret3.b (int[])} 393 * <BR /><BR /> 394 * The second {@code int[] array} will contain an index-list of the indices 395 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 396 * internal-resolve logic. 397 * <BR /><BR /> 398 * </LI> 399 * 400 * <LI> {@code Ret3.c (int[])} 401 * <BR /><BR /> 402 * The third {@code int[] array} will contain an index-list of the indices 403 * which contained {@code TagNode's} whose {@code 'HREF=...'} attribute 404 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 405 * {@code QuotesException} to throw. 406 * </LI> 407 * </OL> 408 * 409 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 410 * 411 * @see #resolve(String, URL) 412 * @see TagNode#AV(String) 413 * @see TagNode#setAV(String, String, SD) 414 */ 415 public static Ret3<int[], int[], int[]> resolveAllHREF( 416 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 417 boolean askForReturnArraysOrReturnNull 418 ) 419 { 420 // Retrieve the Vector-location of any TagNode on the page that has 421 // a "HREF=..." attribute. These are almost always HTML <IMG> elements. 422 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 423 // The @SuppressWarnings is to overcome the cast of 'html' 424 425 @SuppressWarnings("unchecked") 426 int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href"); 427 428 // Java Stream's are convenient for keeping "Growing Lists" of return values. 429 // This builder shall keep a list of all URL's that failed to update - for any reason 430 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 431 432 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 433 ? IntStream.builder() 434 : null; 435 436 // This stream will keep a list of all URL's that were updated, and whose TagNode's 437 // were replaced inside the input HTML Vector 438 439 IntStream.Builder replaced = askForReturnArraysOrReturnNull 440 ? IntStream.builder() 441 : null; 442 443 for (int pos : hasHRefPosArr) 444 { 445 // Get the node at the index 446 TagNode tn = (TagNode) html.elementAt(pos); 447 448 // 1) Retrieve the HREF Attribute 449 // 2) if it is a partial-URL resolve it 450 // 3) Convert to a String 451 452 String oldURL = tn.AV("HREF"); 453 URL newURL = resolve(oldURL, sourcePage); 454 455 // Some URL's cannot be resolved, if so, just skip this TagNode. 456 // Log the index to the stream (if requested), and continue. 457 458 if (newURL == null) 459 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 460 461 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 462 // No logging needed here, the URL was *already* resolved... 463 464 if (oldURL.length() == newURL.toString().length()) continue; 465 466 // Replace the HREF Attribute in the TagNode. This builds a new instance of TagNode 467 // If there is an exception, log the index to the stream (if requested), and continue. 468 469 try 470 { tn = tn.setAV("href", newURL.toString(), quote); } 471 472 catch (QuotesException qex) 473 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 474 475 // Replace the index in the Vector containing the old TagNode with the new one. 476 html.setElementAt(tn , pos); 477 478 // The Vector-Index at this position had it's old TagNode removed and replaced with a 479 // new updated one. Log this to the stream-list so to allow the user to know. 480 481 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 482 } 483 484 return askForReturnArraysOrReturnNull 485 486 ? new Ret3<int[], int[], int[]> 487 (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 488 : null; 489 } 490 491 492 // ******************************************************************************************** 493 // ******************************************************************************************** 494 // Resolve, Not Keep Exceptions 495 // ******************************************************************************************** 496 // ******************************************************************************************** 497 498 499 /** 500 * Convenience Method. 501 * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}. 502 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 503 */ 504 public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage) 505 { 506 URL url = resolveHREF(tnWithHREF, sourcePage); 507 508 return (url == null) 509 ? null 510 : tnWithHREF.setAV("href", url.toString(), null); 511 } 512 513 514 /** 515 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 516 * (attribute). 517 * 518 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 519 * 520 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 521 * (possibly-relative) {@code URL} will be resolved. 522 * 523 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 524 * directory. Null is returned if attempting to build the {@code URL} generated a 525 * {@code MalformedURLException}. 526 * 527 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 528 * {@code MalformedURLException's}. 529 * 530 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 531 * not actually contain an {@code HREF} attribute, then this exception shall throw. 532 * 533 * @see #resolve(String, URL) 534 * @see TagNode#AV(String) 535 */ 536 public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage) 537 { 538 String href = tnWithHREF.AV("href"); 539 540 if (href == null) throw new HREFException( 541 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 542 "HREF attribute." 543 ); 544 545 return resolve(href, sourcePage); 546 } 547 548 549 /** 550 * Convenience Method. 551 * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 552 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 553 */ 554 public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage) 555 { 556 URL url = resolveSRC(tnWithSRC, sourcePage); 557 558 return (url == null) 559 ? null 560 : tnWithSRC.setAV("src", url.toString(), null); 561 } 562 563 564 /** 565 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 566 * (attribute). 567 * 568 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 569 * 570 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 571 * (possibly-relative) {@code URL} will be resolved. 572 * 573 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 574 * directory. Null is returned if attempting to build the {@code URL} generated a 575 * {@code MalformedURLException}. 576 * 577 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 578 * {@code MalformedURLException's}. 579 * 580 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 581 * actually contain a {@code SRC} attribute, then this exception shall throw. 582 * 583 * @see #resolve(String, URL) 584 * @see TagNode#AV(String) 585 */ 586 public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage) 587 { 588 String src = tnWithSRC.AV("src"); 589 590 if (src == null) throw new SRCException( 591 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 592 "SRC attribute." 593 ); 594 595 return resolve(src, sourcePage); 596 } 597 598 /** 599 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 600 * inner-tag (attribute). 601 * 602 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 603 * 604 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 605 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 606 * 607 * @return A list of {@code URL's}, each of which have been completed/resolved with the 608 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 609 * result in a null value in the {@code Vector}. 610 * 611 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 612 * 613 * @see #resolve(String, URL) 614 * @see TagNode#AV(String) 615 */ 616 public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage) 617 { 618 Vector<URL> ret = new Vector<>(); 619 620 for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage)); 621 622 return ret; 623 } 624 625 626 /** 627 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 628 * inner-tag (attribute). 629 * 630 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 631 * 632 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 633 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 634 * 635 * @return A list of {@code URL's}, each of which have been completed/resolved with the 636 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 637 * result in a null value in the {@code Vector.} 638 * 639 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 640 * 641 * @see #resolve(String, URL) 642 * @see TagNode#AV(String) 643 */ 644 public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage) 645 { 646 Vector<URL> ret = new Vector<>(); 647 648 for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage)); 649 650 return ret; 651 } 652 653 654 /** 655 * This will use a "pointer array" - an array containing indexes into the downloaded page to 656 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - 657 * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}. 658 * 659 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 660 * 661 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 662 * 663 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 664 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 665 * are usually returned from the {@code package 'NodeSearch'} "Find" methods. 666 * 667 * <DIV CLASS="EXAMPLE">{@code 668 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 669 * // integer-indices into the vectorized-html variable 'page' 670 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 671 * 672 * // Extract each HREF inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 673 * // if the URL is only partially-resolved 674 * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage); 675 * }</DIV> 676 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 677 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 678 * {@code 'html'}, and then resolve any shortened {@code URL's}. 679 * 680 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 681 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 682 * 683 * @return A list of {@code URL's}, each of which have been completed/resolved with the 684 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 685 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 686 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 687 * this mistake shall generate {@code TagNodeExpectedException's}. 688 * 689 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 690 * 691 * @throws ArrayIndexOutOfBoundsException 692 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 693 * @throws OpeningTagNodeExpectedException 694 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 695 * 696 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 697 * 698 * @see #resolve(String, URL) 699 * @see TagNode#AV(String) 700 */ 701 public static Vector<URL> resolveHREFs 702 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 703 { 704 // Return Vector 705 Vector<URL> ret = new Vector<>(); 706 707 for (int nodePos : nodePosArr) 708 { 709 HTMLNode n = html.elementAt(nodePos); 710 711 // Must be an HTML TagNode 712 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 713 714 TagNode tn = (TagNode) n; 715 716 // Must be an "Opening" HTML TagNode 717 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 718 719 // Resolve the 'HREF', save the URL 720 ret.addElement(resolve(tn.AV("href"), sourcePage)); 721 } 722 723 return ret; 724 } 725 726 727 /** 728 * This will use a "pointer array" - an array containing indexes into the downloaded page to 729 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - must 730 * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}. 731 * 732 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 733 * 734 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 735 * 736 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 737 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 738 * usually returned from the {@code package 'NodeSearch'} "Find" methods. 739 * 740 * <DIV CLASS="EXAMPLE">{@code 741 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 742 * // integer-indices into the vectorized-html variable 'page' 743 * 744 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 745 * 746 * // Extract each SRC inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 747 * // if the URL is only partially-resolved 748 * 749 * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage); 750 * }</DIV> 751 * 752 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 753 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 754 * {@code 'html'}, and then resolve any shorted image {@code URL's}. 755 * 756 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 757 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 758 * 759 * @return A list of {@code URL's}, each of which have been completed/resolved with the 760 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 761 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 762 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 763 * this mistake shall generate {@code TagNodeExpectedException's}. 764 * 765 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 766 * 767 * @throws ArrayIndexOutOfBoundsException 768 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 769 * @throws OpeningTagNodeExpectedException 770 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 771 * 772 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 773 * 774 * @see #resolve(String, URL) 775 * @see TagNode#AV(String) 776 */ 777 public static Vector<URL> resolveSRCs 778 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 779 { 780 // Return Vector 781 Vector<URL> ret = new Vector<>(); 782 783 for (int nodePos : nodePosArr) 784 { 785 HTMLNode n = html.elementAt(nodePos); 786 787 // Must be an HTML TagNode 788 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 789 790 TagNode tn = (TagNode) n; 791 792 // Must be an "Opening" HTML TagNode 793 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 794 795 // Resolve the "SRC", save the URL 796 ret.addElement(resolve(tn.AV("src"), sourcePage)); 797 } 798 799 return ret; 800 } 801 802 803 /** 804 * This will convert <I><B>a list of </B></I> simple java {@code String's} to a 805 * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the 806 * {@code 'sourcePage'} parameter. 807 * 808 * @param src a list of strings - usually partially or totally completed Internet {@code URL's} 809 * 810 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 811 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 812 * 813 * @return A list of {@code URL's}, each of which have been completed/resolved with the 814 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 815 * null, then null is returned in the related {@code Vector} position. If any 816 * {@code TagNode} causes a {@code MalformedURLException}, then that position in the 817 * {@code Vector} will be null. 818 * 819 * @see #resolve(String, URL) 820 */ 821 public static Vector<URL> resolve(Vector<String> src, URL sourcePage) 822 { 823 Vector<URL> ret = new Vector<>(); 824 825 for (String s : src) ret.addElement(resolve(s, sourcePage)); 826 827 return ret; 828 } 829 830 /** 831 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 832 * information using the {@code 'sourcePage'} parameter. 833 * 834 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 835 * needs to be "completed." 836 * 837 * @param sourcePage This is the source page {@code URL} from which the String 838 * (possibly-relative) {@code URL} will be resolved. 839 * 840 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 841 * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also 842 * return null. If a {@code MalformedURLException} is generated, null will also be returned. 843 */ 844 public static URL resolve(String src, URL sourcePage) 845 { 846 if (sourcePage == null) throw new NullPointerException( 847 "Though you may provide null to the partial-URL to dereference parameter, null " + 848 "may not be passed to the Source-Page Parameter. The purpose of the 'resolve' " + 849 "operation is to resolve partial-URLs against a source-page (root) URL. " + 850 "Therefore this is not allowed." 851 ); 852 853 if (src == null) return null; 854 855 src = src.trim(); 856 857 if (src.length() == 0) return null; 858 859 String srcLC = src.toLowerCase(); 860 861 if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null; 862 863 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 864 865 try 866 { return new URL(src); } 867 868 catch (MalformedURLException e) { return null; } 869 870 if (src.startsWith("//") && (src.charAt(3) != '/')) 871 872 try 873 { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); } 874 875 catch (MalformedURLException e) { return null; } 876 877 if (src.startsWith("/")) 878 879 try 880 { 881 return new URL( 882 sourcePage.getProtocol().toLowerCase() + "://" + 883 sourcePage.getHost().toLowerCase() + 884 src 885 ); 886 } 887 888 catch (MalformedURLException e) { return null; } 889 890 if (src.startsWith("../")) 891 { 892 String sourcePageStr = sourcePage.toString(); 893 short nLevels = 0; 894 895 do { nLevels++; src = src.substring(3); } 896 while (src.startsWith("../")); 897 898 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 899 900 try { return new URL(directory + src); } 901 catch (Exception e) { return null; } 902 } 903 904 String root = 905 sourcePage.getProtocol().toLowerCase() + "://" + 906 sourcePage.getHost().toLowerCase(); 907 908 String path = sourcePage.getPath().trim(); 909 int pos = StringParse.findLastFrontSlashPos(path); 910 911 if (pos == -1) throw new StringIndexOutOfBoundsException( 912 "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " + 913 "front-slash character in it's path. Cannot proceed resolving relative-URL's " + 914 "without this." 915 ); 916 917 path = path.substring(0, pos + 1); 918 919 try { return new URL(root + path + src); } 920 catch (MalformedURLException e) { return null; } 921 } 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 // ******************************************************************************************** 944 // ******************************************************************************************** 945 // Resolve, KE - Keep Exceptions 946 // ******************************************************************************************** 947 // ******************************************************************************************** 948 949 950 /** 951 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 952 * (attribute). 953 * 954 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 955 * 956 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 957 * 958 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 959 * (possibly-relative) {@code URL} will be resolved. 960 * 961 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 962 * directory. If there were no {@code HREF} tag, then null is returned. If 963 * the {@code TagNode} causes a {@code MalformedURLException}, that is returned in 964 * {@code Ret2.b} 965 * 966 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 967 * {@code MalformedURLException's}. 968 * 969 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 970 * 971 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 972 * not actually contain an {@code HREF} attribute, then this exception shall throw. 973 * 974 * @see #resolve_KE(String, URL) 975 * @see TagNode#AV(String) 976 * @see Ret2 977 */ 978 public static Ret2<URL, MalformedURLException> resolveHREF_KE 979 (TagNode tnWithHREF, URL sourcePage) 980 { 981 String href = tnWithHREF.AV("href"); 982 983 if (href == null) throw new HREFException( 984 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 985 "HREF attribute." 986 ); 987 988 return resolve_KE(href, sourcePage); 989 } 990 991 992 /** 993 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 994 * (attribute). 995 * 996 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 997 * 998 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 999 * 1000 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1001 * (possibly-relative) {@code URL} will be resolved. 1002 * 1003 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 1004 * directory. If there were no {@code SRC} tag, then null is returned. If the 1005 * {@code TagNode} causes a {@code MalformedURLException}, that is returned in {@code Ret2.b} 1006 * 1007 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 1008 * {@code MalformedURLException's}. 1009 * 1010 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1011 * 1012 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 1013 * actually contain a {@code SRC} attribute, then this exception shall throw. 1014 * 1015 * @see #resolve_KE(String, URL) 1016 * @see TagNode#AV(String) 1017 * @see Ret2 1018 */ 1019 public static Ret2<URL, MalformedURLException> resolveSRC_KE 1020 (TagNode tnWithSRC, URL sourcePage) 1021 { 1022 String src = tnWithSRC.AV("src"); 1023 1024 if (src == null) throw new SRCException( 1025 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 1026 "SRC attribute." 1027 ); 1028 1029 return resolve_KE(src, sourcePage); 1030 } 1031 1032 1033 /** 1034 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 1035 * inner-tag (attribute). 1036 * 1037 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1038 * 1039 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 1040 * 1041 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1042 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1043 * 1044 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1045 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1046 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1047 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1048 * exception in {@code Ret2.b} 1049 * 1050 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1051 * 1052 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1053 * 1054 * @see #resolve_KE(String, URL) 1055 * @see TagNode#AV(String) 1056 * @see Ret2 1057 */ 1058 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1059 (Iterable<TagNode> tnListWithHREF, URL sourcePage) 1060 { 1061 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1062 1063 for (TagNode tn : tnListWithHREF) ret.addElement(resolve_KE(tn.AV("href"), sourcePage)); 1064 1065 return ret; 1066 } 1067 1068 1069 /** 1070 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 1071 * inner-tag (attribute). 1072 * 1073 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1074 * 1075 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 1076 * 1077 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1078 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1079 * 1080 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1081 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1082 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1083 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1084 * exception in {@code Ret2.b} 1085 * 1086 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1087 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1088 * 1089 * @see #resolve_KE(String, URL) 1090 * @see TagNode#AV(String) 1091 * @see Ret2 1092 */ 1093 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1094 (Iterable<TagNode> tnListWithSRC, URL sourcePage) 1095 { 1096 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1097 1098 for (TagNode tn : tnListWithSRC) ret.addElement(resolve_KE(tn.AV("src"), sourcePage)); 1099 1100 return ret; 1101 } 1102 1103 1104 /** 1105 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1106 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1107 * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}. 1108 * 1109 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 1110 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1111 * 1112 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 1113 * 1114 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1115 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 1116 * are usually return from the {@code package 'NodeSearch'} "Find" methods. 1117 * 1118 * <DIV CLASS="EXAMPLE">{@code 1119 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 1120 * // integer-indices into the vectorized-html variable 'page' 1121 * 1122 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 1123 * 1124 * // Extract each HREF inner-tag, and construct a URL. Use the 'sourcePage' parameter if 1125 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, the 1126 * // method shall not crash, but save the exception instead. 1127 * 1128 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1129 * Links.resolveHREFs_KE(page, picturePosArr, mySourcePage); 1130 * 1131 * // Print out any "failed" urls 1132 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1133 * if (r.b != null) 1134 * System.out.println("There was an exception: " + r.b.toString()); 1135 * }</DIV> 1136 * 1137 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1138 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1139 * {@code 'html'}., and then resolve any shortened {@code URL's}. 1140 * 1141 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1142 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1143 * 1144 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1145 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1146 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1147 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1148 * exception in {@code Ret2.b} 1149 * 1150 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1151 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1152 * 1153 * @throws ArrayIndexOutOfBoundsException 1154 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1155 * @throws OpeningTagNodeExpectedException 1156 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1157 * 1158 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1159 * 1160 * @see #resolve_KE(String, URL) 1161 * @see TagNode#AV(String) 1162 * @see Ret2 1163 */ 1164 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1165 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1166 { 1167 // Return Vector 1168 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1169 1170 for (int nodePos : nodePosArr) 1171 { 1172 HTMLNode n = html.elementAt(nodePos); 1173 1174 // Must be an HTML TagNode 1175 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1176 1177 TagNode tn = (TagNode) n; 1178 1179 // Must be an "Opening" HTML TagNode 1180 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1181 1182 // Resolve the "HREF", keep the URL 1183 ret.addElement(resolve_KE(tn.AV("href"), sourcePage)); 1184 } 1185 1186 return ret; 1187 } 1188 1189 /** 1190 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1191 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1192 * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}. 1193 * 1194 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 1195 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1196 * 1197 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 1198 * 1199 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1200 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 1201 * usually return from the {@code package 'NodeSearch'} "Find" methods. 1202 * 1203 * <DIV CLASS="EXAMPLE">{@code 1204 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 1205 * // integer-indices into the vectorized-html variable 'page' 1206 * 1207 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 1208 * 1209 * // Extract each SRC inner-tag, and construct a URL. Use the 'sourcePage' parameter if 1210 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, 1211 * // the method shall not crash, but save the exception instead. 1212 * 1213 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1214 * Links.resolveSRCs_KE(page, picturePosArr, mySourcePage); 1215 * 1216 * // Print out any "failed" urls 1217 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1218 * if (r.b != null) 1219 * System.out.println("There was an exception: " + r.b.toString()); 1220 * }</DIV> 1221 * 1222 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1223 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1224 * {@code 'html'}, and then resolve any shortened {@code URL's}. 1225 * 1226 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1227 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1228 * 1229 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1230 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1231 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1232 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1233 * exception in {@code Ret2.b} 1234 * 1235 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1236 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1237 * 1238 * @throws ArrayIndexOutOfBoundsException 1239 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1240 * @throws OpeningTagNodeExpectedException 1241 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1242 * 1243 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1244 * 1245 * @see #resolve_KE(String, URL) 1246 * @see TagNode#AV(String) 1247 * @see Ret2 1248 */ 1249 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1250 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1251 { 1252 // Return Vector 1253 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1254 1255 for (int nodePos : nodePosArr) 1256 { 1257 HTMLNode n = html.elementAt(nodePos); 1258 1259 // Must be an HTML TagNode 1260 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1261 1262 TagNode tn = (TagNode) n; 1263 1264 // Must be an "Opening" HTML TagNode 1265 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1266 1267 // Resolve "SRC" and keep URL's 1268 ret.addElement(resolve_KE(tn.AV("src"), sourcePage)); 1269 } 1270 1271 return ret; 1272 } 1273 1274 /** 1275 * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}. 1276 * 1277 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1278 * 1279 * @param src a list of {@code String's} - usually partially or totally completed Internet 1280 * {@code URL's} 1281 * 1282 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 1283 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1284 * 1285 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1286 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 1287 * null, then null is returned in the related {@code Vector} position. If any {@code TagNode} 1288 * causes a {@code MalformedURLException}, then that position in the {@code Vector} will 1289 * contain the exception in {@code Ret2.b} 1290 * 1291 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1292 * 1293 * @see #resolve_KE(String, URL) 1294 * @see Ret2 1295 */ 1296 public static Vector<Ret2<URL, MalformedURLException>> resolve_KE 1297 (Vector<String> src, URL sourcePage) 1298 { 1299 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1300 1301 for (String s : src) ret.addElement(resolve_KE(s, sourcePage)); 1302 1303 return ret; 1304 } 1305 1306 /** 1307 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 1308 * information using the {@code 'sourcePage'} parameter. 1309 * 1310 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1311 * 1312 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 1313 * needs to be "completed." 1314 * 1315 * @param sourcePage This is the source page {@code URL} from which the String (possibly 1316 * relative) {@code URL} will be resolved. 1317 * 1318 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 1319 * parameter. If parameter {@code 'src'} is null or zero-length, null will be returned. If a 1320 * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>} 1321 * result. 1322 * 1323 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1324 * 1325 * @see Ret2 1326 */ 1327 public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage) 1328 { 1329 if (sourcePage == null) throw new NullPointerException( 1330 "Though you may provide null to the partial-URL to dereference parameter, null " + 1331 "may not be passed to the Source-Page Parameter. The purpose of the 'resolve' " + 1332 "operation is to resolve partial-URLs against a source-page (root) URL. " + 1333 "Therefore this is not allowed." 1334 ); 1335 1336 if (src == null) return null; 1337 1338 src = src.trim(); 1339 1340 if (src.length() == 0) return null; 1341 1342 String srcLC = src.toLowerCase(); 1343 1344 if (StrCmpr.startsWithXOR 1345 (srcLC, "tel:", "javascript:", "mailto:", "magnet:", "file:", "ftp:", "#")) 1346 1347 return new Ret2<URL, MalformedURLException> 1348 (null, new MalformedURLException( 1349 "InnerTag/Attribute begins with: " + src.substring(0, 1 + src.indexOf(":")) + 1350 ", so it is not a hyper-link." 1351 )); 1352 1353 1354 // Includes the first few characters of the URL - for reporting/convenience. 1355 // If this is an "image", the image-type & name will be included 1356 1357 if (StrCmpr.startsWithXOR(srcLC, "data:", "blob:")) 1358 1359 return new Ret2<URL, MalformedURLException>(null, new MalformedURLException( 1360 "InnerTag/Attribute begins with: " + 1361 ((src.length() > 25) ? src.substring(0, 25) : src) + 1362 ", not a URL." 1363 )); 1364 1365 1366 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 1367 1368 try 1369 { return new Ret2<URL, MalformedURLException>(new URL(src), null); } 1370 1371 catch (MalformedURLException e) 1372 { return new Ret2<URL, MalformedURLException>(null, e); } 1373 1374 1375 if (src.startsWith("//") && (src.charAt(3) != '/')) 1376 1377 try 1378 { 1379 return new Ret2<URL, MalformedURLException> 1380 (new URL( sourcePage.getProtocol().toLowerCase() + ":" + src), null); 1381 } 1382 1383 catch (MalformedURLException e) 1384 { return new Ret2<URL, MalformedURLException>(null, e); } 1385 1386 1387 if (src.startsWith("/")) 1388 1389 try 1390 { 1391 return new Ret2<URL, MalformedURLException>(new URL( 1392 sourcePage.getProtocol().toLowerCase() + "://" + 1393 sourcePage.getHost().toLowerCase() + 1394 src), null 1395 ); 1396 } 1397 1398 catch (MalformedURLException e) 1399 { return new Ret2<URL, MalformedURLException>(null, e); } 1400 1401 1402 if (src.startsWith("../")) 1403 { 1404 String sourcePageStr = sourcePage.toString(); 1405 short nLevels = 0; 1406 1407 do 1408 { nLevels++; src = src.substring(3); } 1409 while (src.startsWith("../")); 1410 1411 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 1412 1413 try 1414 { return new Ret2<URL, MalformedURLException>(new URL(directory + src), null); } 1415 1416 catch (MalformedURLException e) 1417 { return new Ret2<URL, MalformedURLException>(null, e); } 1418 1419 catch (Exception e) 1420 { 1421 return new Ret2<URL, MalformedURLException> 1422 (null, 1423 new MalformedURLException(e.getClass().getCanonicalName() + 1424 ":" + e.getMessage()) 1425 ); 1426 } 1427 } 1428 1429 1430 String root = 1431 sourcePage.getProtocol().toLowerCase() + "://" + 1432 sourcePage.getHost().toLowerCase(); 1433 1434 String path = sourcePage.getPath().trim(); 1435 int pos = StringParse.findLastFrontSlashPos(path); 1436 1437 if (pos == -1) throw new StringIndexOutOfBoundsException( 1438 "The URL you have provided: " + sourcePage.toString() + 1439 " does not have a '/' front-slash character in it's path." + 1440 "Cannot proceed resolving relative-URL's without this." 1441 ); 1442 1443 path = path.substring(0, pos + 1); 1444 1445 try 1446 { return new Ret2<URL, MalformedURLException>(new URL(root + path + src), null); } 1447 1448 catch (MalformedURLException e) 1449 { return new Ret2<URL, MalformedURLException>(null, e); } 1450 } 1451}