001 002package Torello.HTML.Tools.Images; 003 004import Torello.Java.*; 005 006// Needed for a JavaDoc Comment {@link ...} 007import Torello.HTML.TagNode; 008 009import java.net.URL; 010import java.io.Serializable; 011import java.util.Arrays; 012 013/** 014 * After downloading all of the user's requested images, the class {@link ImageScraper} returns an 015 * instance of this class. 016 * 017 * <BR /><BR />When a download request has completed, this class will be instantiated and returned. 018 * Care has been taken to ensure this class does not freeze nor fail when a particular image fails 019 * to download. This level of control is customizable, so if the programmer would prefer download 020 * execution to halt immediately upon exception, there is are several settings for this in the 021 * class {@link Request}. 022 * 023 * <BR /><BR />The link below contains the output of invoking the {@link #toString} method on a 024 * {@code 'Results'} instance after downloading all of the HTML {@code <IMG SRC=...>} tags that 025 * were scraped from the Web-Server {@code news.yahoo.com}. 026 * 027 * <BR /><BR /><B><CODE><A HREF='doc-files/results.txt'>results.txt</A></CODE></B> 028 * 029 * <BR /><BR /><B CLASS=JDDescLabel>Initializations:</B> 030 * 031 * <BR />Many of the values in the arrays for class {@code 'Results'} (this class) may contain 032 * null-values, or a {@code '-1'}. This happens if an exceptions is thrown while downloading or 033 * saving any one particular image, which prevents the process from running to completion. 034 * 035 * <BR /><BR />To view the exact initialization-value for elements of any of these array, simply 036 * click on the <B><CODE><A HREF='hilite-files/Results.java.html'>HiLited Source-Code</A></CODE> 037 * </B> link, and scroll down to the <B>Package-Private</B> <CODE>Results</CODE> class constructor. 038 * 039 * <BR /><BR /><B CLASS=JDDescLabel>The 'skipped' Array:</B> 040 * 041 * <BR />There is an easy way to check which {@code URL's} have failed or were skipped (due 042 * to user-request), and which {@code URL's} were successfully obtained. One of several arrays 043 * that are {@code public} in this class is the {@link #skipped} {@code boolean[]}-array. 044 * 045 * <BR /><BR /> Whenever a particular {@code URL}-index corresponds to a {@code skipped}-index that 046 * contains a {@code FALSE} boolean-value, it indicates that that particular {@code URL} ultimately 047 * was not properly saved or re-transmitted. 048 * 049 * @see ImageScraper 050 */ 051@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS") 052public class Results implements Serializable, Cloneable 053{ 054 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 055 public static final long serialVersionUID = 1; 056 057 058 // ******************************************************************************************** 059 // ******************************************************************************************** 060 // Public Array Fields: User may inspect these fields when an instance of 'Results' is returned 061 // ******************************************************************************************** 062 // ******************************************************************************************** 063 064 065 /** 066 * This will contain a complete list of the {@code URL's} that were retrieved (or generated- 067 * <I>if partially-resolved 'relative' {@code URL's} were provided</I>) from the 068 * {@link Request}-instances {@code static}-builder. Every image downloaded (or attempted for 069 * download) will have its {@code URL} saved here, in this array. 070 * 071 * <BR /><BR /><B CLASS=JDDescLabel>Null's in the {@code urls} Array</B> 072 * 073 * <BR />An array-element of the {@code urls}-array will contain null under the following 074 * two circumstances: 075 * 076 * <BR /><BR /><UL CLASS=JDUL> 077 * 078 * <LI> No image-{@code URL} was provided, becasue the picture was a Base-64 Encoded Image, and 079 * instead was a {@code String} that had been retrieved from a {@link TagNode}'s 080 * {@code SRC}-Attribute. 081 * <BR /><BR /> 082 * </LI> 083 * 084 * <LI> The user provided a {@code String} to the {@link Request} Class builder, but that 085 * {@code String} caused a {@code MalformedURLException}, and no {@code URL}-instance 086 * was ever built. (Note that in this scenario, the {@link #exceptions} array would be 087 * storing the {@code URL}-Exception that was thrown). 088 * <BR /><BR /> 089 * </LI> 090 * 091 * </UL> 092 * 093 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 094 */ 095 public final URL[] urls; 096 097 /** 098 * When constructing an {@link ImageScraper}'s {@link Request} object, one of the options for 099 * building the instance is to pass a list of HTML {@link TagNode} instances containing HTML 100 * {@code '<IMG SRC=...>'} tags. 101 * 102 * <BR /><BR />HTML {@code TagNode} elements will sometimes / occasionally have a variant of an 103 * image source known as the <B STYLE='color: red;'>Base-64 Encoded Image</B>. These are 104 * images where the actual pictures fully stored & encoded inside the {@code SRC}-Attribute 105 * of the HTML {@link TagNode}'s {@code SRC}-Attribute. 106 * 107 * <BR /><BR />Base-64 Images are just pictures that have been converted into actual character 108 * data, in the form of a simple {@code String}, and saved into the {@code <IMG>} tag's 109 * {@code SRC}-URL - <I>instead of an actual HTTP {@code URL} being saved there</I>. Note that 110 * this practice is generally used for much smaller pictures, thumbnails or logo signs (images 111 * that wouldn't use up a lot of data). 112 * 113 * <BR /><BR />A full explanation of HTML's {@code Base-64} Image-Encoding is beyone the scope 114 * of this Java-Doc Comment. 115 * 116 * <BR /><BR />If any image was "converted" from a B-64 Image-Encoding (rather than downloaded 117 * from a {@code URL}), then the boolean for the image's index will be {@code TRUE} rather than 118 * {@code FALSE}. The default value for all elements of this array is {@code FALSE}. 119 * 120 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 121 */ 122 public final boolean[] b64EncodedImg; 123 124 /** 125 * An images {@code Results}-data in this particular paralell-array will be {@code TRUE} under 126 * any of the following circumstances: 127 * 128 * <BR /><BR /><UL CLASS=JDUL> 129 * 130 * <LI> If the user provided a {@link Request#skipURL}-Predicate, and that predicate rejected 131 * the image (telling the downloader not to download the picture). 132 * <BR /><BR /> 133 * </LI> 134 * 135 * <LI> If the user provided a {@link Request#keeperPredicate}, and that predicate, after 136 * image-downloade complettion, rejected the image (telling the downloader not save the 137 * picture to disk the picture). 138 * <BR /><BR /> 139 * </LI> 140 * 141 * <LI> If there were any exceptions thrown while downloading the image that forced the 142 * downloader-logic to abandon the image (and either throw the exception, or 143 * skip-and-move-on to the next image). 144 * <BR /><BR /> 145 * </LI> 146 * 147 * <LI> If the original {@code Iterable} that was provided to the {@link Request}-instance 148 * had entries that had caused {@code exceptions} to be thrown while building the 149 * {@link Request}-instance. 150 * </LI> 151 * 152 * </UL> 153 * 154 * <BR /><B STYLE='color: red;'><I>Under any / all other circumstances, if an image was 155 * successfully downloaded and written to disk, then the corresponding element in this array 156 * will contain {@code FALSE}!</I></B> 157 * 158 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 159 */ 160 public final boolean[] skipped; 161 162 /** 163 * The names of the files that were retrieved and/or stored will be in this array. 164 * If this image were skipped or an exception occurred, the array position for that 165 * {@code URL} would contain 'null'. 166 * 167 * <BR /><BR />It is important to note that if an element of this array contains a valid, 168 * non-null, file-name - <I><B>it does not guarantee that the image was properly saved</B></I>. 169 * The value stored in the corresponding (parallel) {@link #skipped}-array index is the only 170 * way to ascertain whether an image was ultimately written to disk (or transmitted to a 171 * User-Provided {@link Request#imageReceiver}). 172 * 173 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 174 */ 175 public final String[] fileNames; 176 177 /** 178 * The location of the file-name saved directory, if an image did not successfully save to 179 * the file system, or if an {@code imageReceiver} were used, then the array-location would 180 * contain {@code 'null'}. 181 * 182 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 183 */ 184 public final String[] saveDirectories; 185 186 /** 187 * The image type of the files that were retrieved will be stored in this array. 188 * 189 * <EMBED CLASS='external-html' DATA-DEFVAL=null DATA-SPEC="image-format" 190 * DATA-FILE-ID=REQ_SKIPPED_NOTE> 191 * 192 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 193 */ 194 public final IF[] imageFormats; 195 196 /** 197 * If any stage of the image download, conversion or disk-write fails, then this array will 198 * store a record the exception that was thrown. 199 * 200 * <BR /><BR />If the download succeeded, then the {@code 'exceptions'}-array element at that 201 * index would contain 'null.' Any {@code exceptions}-array index that contains a non-null 202 * {@code Exception} will be an index for which the {@link #skipped}-array contains a 203 * {@code TRUE}-value stored at the same location. 204 * 205 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 206 */ 207 public final Exception[] exceptions; 208 209 /** 210 * This will contain a list of long-integers, each of which will have the file-size of the 211 * downloaded image. 212 * 213 * <EMBED CLASS='external-html' DATA-DEFVAL="-1" DATA-SPEC=size DATA-FILE-ID=REQ_SKIPPED_NOTE> 214 * 215 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 216 */ 217 public final long[] sizes; 218 219 /** 220 * This will contain a list of integers, each of which shall have the image-widths of the 221 * downloaded images. 222 * 223 * <EMBED CLASS='external-html' DATA-DEFVAL="-1" DATA-SPEC=width DATA-FILE-ID=REQ_SKIPPED_NOTE> 224 * 225 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 226 */ 227 public final int[] widths; 228 229 /** 230 * This shall contain a list of integers, each of which shall have the image-heights of 231 * the downloaded images. 232 * 233 * <EMBED CLASS='external-html' DATA-DEFVAL="-1" DATA-SPEC=height 234 * DATA-FILE-ID=REQ_SKIPPED_NOTE> 235 * 236 * <EMBED CLASS='external-html' DATA-FILE-ID=REQ_PARALLEL_NOTE> 237 */ 238 public final int[] heights; 239 240 241 // ******************************************************************************************** 242 // ******************************************************************************************** 243 // Some Package-Private Fields, Used here and by class ImageScraper 244 // ******************************************************************************************** 245 // ******************************************************************************************** 246 247 248 // next result received array position. 249 int pos = 0; 250 251 // number of successfully saved images. 252 int successCounter = 0; 253 254 255 // ******************************************************************************************** 256 // ******************************************************************************************** 257 // Package-Private Constructor 258 // ******************************************************************************************** 259 // ******************************************************************************************** 260 261 262 Results(int size) 263 { 264 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 265 // Create each of these arrays 266 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 267 268 urls = new URL[size]; 269 b64EncodedImg = new boolean[size]; 270 skipped = new boolean[size]; 271 fileNames = new String[size]; 272 saveDirectories = new String[size]; 273 imageFormats = new IF[size]; 274 exceptions = new Exception[size]; 275 sizes = new long[size]; 276 widths = new int[size]; 277 heights = new int[size]; 278 279 280 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 281 // Initialize each element of the above arrays 282 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 283 284 for (int i=0; i < size; i++) 285 { 286 urls[i] = null; 287 b64EncodedImg[i] = false; 288 skipped[i] = false; 289 fileNames[i] = null; 290 saveDirectories[i] = null; 291 imageFormats[i] = null; 292 exceptions[i] = null; 293 sizes[i] = -1; 294 widths[i] = -1; 295 heights[i] = -1; 296 } 297 } 298 299 300 // ******************************************************************************************** 301 // ******************************************************************************************** 302 // No Image Downloaded 303 // ******************************************************************************************** 304 // ******************************************************************************************** 305 306 307 // Request static-builder generated an "Exception URL" 308 // Called From: ImageScraper.loopBody(RECORD) 309 310 void tagNodeSRCError(Exception e) 311 { 312 skipped[pos] = true; 313 exceptions[pos] = e; 314 315 pos++; 316 } 317 318 // User-Provided "Predicate<URL> skipURL" 319 // Called From: ImageScraper.downloadImage(RECORD) 320 321 void skippedURL(URL url) 322 { 323 urls[pos] = url; 324 skipped[pos] = true; 325 326 pos++; 327 } 328 329 // User-Provided "boolean skipBase64EncodedImages" 330 // Called From: ImageScraper.convertB64Image(RECORD) 331 332 void skipB64() 333 { 334 b64EncodedImg[pos] = true; 335 skipped[pos] = true; 336 337 pos++; 338 } 339 340 // Called from many places. This method is the biggest of the Results-Reporters 341 // * Exception-thrown 342 // * 'ImageInfo' instance hasn't been constructed yet 343 344 void exceptionFail(URL url, Exception e) 345 { 346 urls[pos] = url; 347 b64EncodedImg[pos] = (url == null); 348 skipped[pos] = true; 349 exceptions[pos] = e; 350 351 pos++; 352 } 353 354 355 // ******************************************************************************************** 356 // ******************************************************************************************** 357 // "ImageInfo" instance available now: Image Successfully Downloaded and Converted to Array. 358 // ******************************************************************************************** 359 // ******************************************************************************************** 360 361 362 // There are 3 different User-Provided Lambda-Target's that might throw exceptions 363 // This is called from within "ImageScraper.RECORD.userLambdaEx(...)" 364 365 void userLambdaException(ImageInfo imageInfo, Exception e) 366 { 367 skipped[pos] = true; 368 exceptions[pos] = e; 369 370 copyImageInfo(imageInfo); 371 } 372 373 // The User Keep/Reject Predicate rejected this image 374 // Called From: ImageScraper.handleImageByteArray(RECORD) 375 376 void predicateReject(ImageInfo imageInfo) 377 { 378 skipped[pos] = true; 379 copyImageInfo(imageInfo); 380 } 381 382 // Image was written to disk somewhere, or accepted by the Request.imageReceiver 383 // Called From: ImageScraper.writeOrTransmit(RECORD) 384 385 void success(ImageInfo imageInfo, String targetDirectory) 386 { 387 // Directory where the image was saved, if called by "ImageReceiver", this will be null 388 saveDirectories[pos] = targetDirectory; 389 390 copyImageInfo(imageInfo); 391 392 // Only time this is ever incremented 393 successCounter++; 394 } 395 396 397 // ******************************************************************************************** 398 // ******************************************************************************************** 399 // SMALL HELPER 400 // ******************************************************************************************** 401 // ******************************************************************************************** 402 403 404 // Private Method, used in all 5 previous methods directly above here 405 private void copyImageInfo(ImageInfo imageInfo) 406 { 407 urls[pos] = imageInfo.url; 408 b64EncodedImg[pos] = imageInfo.isB64EncodedImage; 409 imageFormats[pos] = imageInfo.actualExtension; 410 sizes[pos] = imageInfo.imgByteArr.length; 411 widths[pos] = imageInfo.width; 412 heights[pos] = imageInfo.height; 413 fileNames[pos] = imageInfo.fileName() + '.' + imageInfo.actualExtension; 414 415 pos++; 416 } 417 418 419 // ******************************************************************************************** 420 // ******************************************************************************************** 421 // interface java.lang.Cloneable 422 // ******************************************************************************************** 423 // ******************************************************************************************** 424 425 426 /** 427 * Generates a <B STYLE='color: red;'>Deep Copy</B> of {@code 'this'} instance. This means 428 * all internal arrays are also cloned / copied 429 * 430 * @return A duplicate instance of this class, with all arrays having been copied. 431 */ 432 public Results clone() 433 { return new Results(this); } 434 435 // Private Constructor, used only for the 'clone()' method 436 private Results(Results r) 437 { 438 this.urls = r.urls.clone(); 439 this.b64EncodedImg = r.b64EncodedImg.clone(); 440 this.skipped = r.skipped.clone(); 441 this.fileNames = r.fileNames.clone(); 442 this.saveDirectories = r.saveDirectories.clone(); 443 this.imageFormats = r.imageFormats.clone(); 444 this.exceptions = r.exceptions.clone(); 445 this.sizes = r.sizes.clone(); 446 this.widths = r.widths.clone(); 447 this.heights = r.heights.clone(); 448 449 this.pos = r.pos; 450 this.successCounter = r.successCounter; 451 } 452 453 454 // ******************************************************************************************** 455 // ******************************************************************************************** 456 // java.lang.Object 457 // ******************************************************************************************** 458 // ******************************************************************************************** 459 460 461 /** 462 * Checks whether {@code 'this'} instance is equal to another instance of class 463 * {@code Results}. This method performs a <B STYLE='color: red;'>Deep Equals</B> comparison 464 * using the {@code equals(...)} method suite found in class' {@code java.util.Arrays}. 465 * 466 * @param other This may be any Java Object, but only an instance class {@code 'Results'} has 467 * any chance of being marked as <B STYLE='color: red;'>equal</B> to this instance. 468 * 469 * @return {@code TRUE} if and only if {@code 'o'} has a type that's assignable to 470 * {@code Results} - and if each of the internal arrays in this instance are equal to the 471 * arrays in parameter {@code 'o'}. 472 */ 473 public boolean equals(Object other) 474 { 475 if (other == null) return false; 476 477 if (! Results.class.isAssignableFrom(other.getClass())) return false; 478 479 Results r = (Results) other; 480 481 482 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 483 // NOTE: These arrays cannot ever be null, that is an "Unreachable Situation" 484 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 485 486 return 487 Arrays.equals(this.urls, r.urls) 488 && Arrays.equals(this.b64EncodedImg, r.b64EncodedImg) 489 && Arrays.equals(this.skipped, r.skipped) 490 && Arrays.equals(this.fileNames, r.fileNames) 491 && Arrays.equals(this.saveDirectories, r.saveDirectories) 492 && Arrays.equals(this.imageFormats, r.imageFormats) 493 && Arrays.equals(this.exceptions, r.exceptions) 494 && Arrays.equals(this.sizes, r.sizes) 495 && Arrays.equals(this.widths, r.widths) 496 && Arrays.equals(this.heights, r.heights); 497 } 498 499 // Used in the above toString() method 500 private static final String COMMA = ", "; 501 502 /** 503 * Returns a {@code java.lang.String} representation of {@code 'this'} instance 504 * 505 * @return A Java {@code String} containing the data inside this class. 506 */ 507 public String toString() 508 { 509 StringBuilder sb = new StringBuilder(); 510 511 512 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 513 // NOTE: These arrays, themselves can never be null - BUT THEIR CONTENTS ARE OFTEN NULL 514 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 515 516 for (int i=0; i < urls.length; i++) 517 { 518 if (b64EncodedImg[i]) sb.append("Base 64 Encoded Image\n"); 519 520 else sb.append( 521 "URL: " + ((urls[i] == null) 522 ? "null" 523 : StrPrint.abbrev(urls[i].toString(), 50, true, " ... ", 100)) + 524 '\n' 525 ); 526 527 boolean comma = false; 528 529 if (skipped[i] == true) 530 { 531 sb.append(" SKIPPED"); 532 comma = true; 533 } 534 535 if (imageFormats[i] != null) 536 { 537 sb.append(comma ? COMMA : " "); 538 sb.append("Format: " + imageFormats[i]); 539 comma = true; 540 } 541 542 if (sizes[i] > 0) 543 { 544 sb.append(comma ? COMMA : " "); 545 sb.append("Size: " + StringParse.commas(sizes[i])); 546 comma = true; 547 } 548 549 if (widths[i] > 0) 550 { 551 sb.append(comma ? COMMA : " "); 552 sb.append("W: " + StringParse.commas(widths[i])); 553 comma = true; 554 } 555 556 if (heights[i] > 0) 557 { 558 sb.append(comma ? COMMA : " "); 559 sb.append("H: " + StringParse.commas(heights[i])); 560 comma = true; 561 } 562 563 if (comma) sb.append('\n'); 564 565 comma = false; 566 567 if (fileNames[i] != null) 568 { 569 sb.append(" FileName: [" + fileNames[i] + ']'); 570 comma = true; 571 } 572 573 if (saveDirectories[i] != null) 574 { 575 sb.append(comma ? COMMA : " "); 576 sb.append("Dir: [" + StrPrint.abbrev(saveDirectories[i], 30, true, null, 60) + ']'); 577 comma = true; 578 } 579 580 if (comma) sb.append('\n'); 581 582 if (exceptions[i] != null) 583 sb.append(" Thrown: " + exceptions[i].getClass().getName() + '\n'); 584 585 if (i < (urls.length - 1)) sb.append('\n'); 586 } 587 588 return sb.toString(); 589 } 590 591 /** 592 * Java's hash-code requirement. The code is computed by summing the first 10 {@link #sizes} 593 * array elements. 594 * 595 * @return A hash-code that may be used when storing this node in a java sorted-collection. 596 */ 597 public int hashCode() 598 { 599 int sum = 0; 600 601 for (int i=0; (i < 10) && (i < sizes.length); i++) sum += sizes[i]; 602 603 return sum; 604 } 605}