package Torello.HTML.Tools.Images;

import Torello.Java.StringParse;
import Torello.Java.WritableDirectoryException;

import java.io.*;

import javax.imageio.ImageIO;
import java.net.URL;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.util.Vector;

/**
 * A simple class for scraping & downloading images using a URL, or list of URL's.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
public class ImageScrape
{
    // No public constructor
    private ImageScrape() { }

    /** {@code String}-Array having the list of file-formats */
    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };

    /**
     * This will extract the file-extension from an image {@code URL.}  Not all images on the
     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
     * the case that the {@code 'urlStr'} is a pointer to a non-image-file, null will be returned.
     *
     * @param urlStr Is the {@code url} of the image.  May be null, in which case null is
     * returned.
     *
     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
     * file-extension will be returned, otherwise {@code null} will be returned.  The comparison
     * is case-insensitive, and the value returned is the lower-case entry from
     * {@link #imageExts}.
     */
    public static String getImageTypeFromURL(String urlStr)
    {
        if (urlStr == null) return null;

        String ext = StringParse.fromExtension(urlStr, false);

        if (ext == null) return null;

        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale (under a
        // Turkish locale, for instance, "GIF" would not lower-case to "gif").
        ext = ext.toLowerCase(java.util.Locale.ROOT);

        for (String known : imageExts) if (known.equals(ext)) return known;

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
     *
     * <BR /><BR />The {@code 'outputFileStr'} is split at its last {@code File.separator}:
     * everything up to and including the separator is passed as the output directory, and the
     * remainder is passed as the file-name.  If {@code 'outputFileStr'} ends with the separator
     * (it names only a directory), the file-name {@code "IMAGE"} is used.
     */
    public static String downloadImageGuessType(String urlStr, String outputFileStr)
        throws IOException
    {
        // We need to check whether the file-name that was passed is just a filename; or if it
        // has a directory component in its name.

        int sep = outputFileStr.lastIndexOf(File.separator) + 1;

        // No separator at all: a plain file-name, saved relative to the current directory
        if (sep == 0)
            return downloadImageGuessType(urlStr, outputFileStr, "");

        // Separator is the last character: only a directory was given
        if (sep == outputFileStr.length())
            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);

        return downloadImageGuessType
            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
    }

    /**
     * This will download an image, and try to guess if it is one of the following types:
     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
     * extension as a filename, then that format will be tried first when saving to a file.  If
     * saving with that format fails, each of the remaining formats listed in {@link #imageExts}
     * is tried in turn.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve all images found on a random Yahoo! News web-page article
     * URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
     * // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
     * page = TagNodeGetInclusive.first(page, "article");
     *
     * // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
     * // For the news-article used in this example, the first image was an icon thumbnail.
     * // The second image contained the "Main Article Photo"
     * TagNode firstPic = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
     * String  urlStr   = Links.resolveSRC(firstPic, url).toString();
     *
     * // Run this method.  A file named 'img.jpg' is saved.
     * System.out.println("Image URL to Download:" + urlStr);
     * ImageScrape.downloadImageGuessType(urlStr, "img");
     * }</DIV>
     *
     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
     * does contain an image extension in the {@code 'String'}, then this method will attempt to
     * save the image using the appropriate file-extension first, and fall back to the other
     * known formats if that attempt fails.
     *
     * @param outputFileStr This is the target or destination name for the output image file.
     *
     * <BR /><BR /><B>NOTE:</B> This file is not intended to have an extension.  The extension will
     * be generated by the code in this method, and it will match whatever image-file-encoding
     * was successfully used to save the file.  If this is a {@code '.png'}, for instance, but
     * it did not save until {@code '.bmp'} was used (mis-labeled), this output file will be
     * saved as {@code 'outputFileStr'} + {@code '.bmp'}.
     *
     * <BR /><BR /><B STYLE='color: red;'>URL vs. File Names:</B> This parameter
     * {@code 'outputFileStr'} <B><I>may NOT be null</I></B>.  It is important to realize, here,
     * that file-names and {@code URL's} <I>do not obey the same naming conventions.</I>  Because
     * it is often seen on the internet image-{@code URL's} that have a plethora of file-system
     * 'irreverent' characters in their name, this method simply cannot pick out the file-name of
     * an image from its {@code URL}.
     *
     * <BR /><BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
     * the file-name of the image being downloaded is included in the image's {@code URL}).
     * However, because many of the modern content-providers on the internet use many layers of
     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
     * the image file-name is "too difficult" to discern from its {@code URL}.
     *
     * @param outputDirectory This is just "prepended" to the file-save name.  This
     * {@code 'String'} is not included in the returned filename.  <B>Specifically</B> The returned
     * file name <I>only includes</I> the file-name and the file-name-extension.  It does not
     * include the whole "canonical" or "absolute" directory-path name for this image.  A null
     * value is treated as the empty-string.
     *
     * @return It will return the name of the file as a result - including the extension type
     * with which the image was successfully written.
     *
     * <BR /><BR /><B>NOTE:</B> {@code 'null'} will be returned if the image failed to save at all.
     *
     * <BR /><BR /><B>It is important to return the filename, since the extension identifies in
     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
     *
     * @throws IllegalArgumentException If {@code 'outputFileStr'} contains the file-separator
     * character.
     *
     * @throws WritableDirectoryException If the provided output directory does not exist, or is
     * not writable, this exception shall throw.  Java will attempt to write a small,
     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
     *
     * @throws javax.imageio.IIOException If the content found at {@code 'urlStr'} could not be
     * decoded as an image.
     *
     * @throws IOException If reading from the {@code URL} fails.
     *
     * @see #imageExts
     */
    public static String downloadImageGuessType
        (String urlStr, String outputFileStr, String outputDirectory)
        throws IOException
    {
        // If the "file name" has directory components... it is just "better" to flag this as
        // an exception

        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
            "any directory-names attached.  If directory names need to be attached to ensure " +
            "that the file is ultimately saved to the proper location in the file-system, " +
            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
            "character."
        );

        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null (rather than throwing) when no registered reader can decode
        // the stream.  Report that here, instead of letting ImageIO.write fail on a null image.
        if (image == null) throw new javax.imageio.IIOException
            ("The content at the following URL could not be decoded as an image: " + urlStr);

        String ext = getImageTypeFromURL(urlStr);

        // First choice: the format named by the URL's own file-extension.
        if (ext != null)
        {
            String fName = outputFileStr + '.' + ext;

            if (tryWrite(image, ext, new File(outputDirectory + fName))) return fName;
        }

        // NOTE: If saving the file using the named image-extension fails, try the others.
        for (String candidate : imageExts)
        {
            // This format was already attempted above
            if (candidate.equals(ext)) continue;

            String fName = outputFileStr + '.' + candidate;

            // The output directory must be prepended here too, so that a fall-back save ends
            // up in the same location as a first-choice save.
            if (tryWrite(image, candidate, new File(outputDirectory + fName))) return fName;
        }

        System.out.println
            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");

        return null;
    }

    // Attempts to write 'image' to 'target' using ImageIO format-name 'format'.  Returns TRUE
    // only if ImageIO reports that a writer was found and the write completed.  When the write
    // throws an IIOException, the (possibly partial) target file is deleted and FALSE returned.
    private static boolean tryWrite(BufferedImage image, String format, File target)
        throws IOException
    {
        try
            { return ImageIO.write(image, format, target); }

        catch (javax.imageio.IIOException e)
        {
            target.delete();
            return false;
        }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes(rootURL, urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes("", urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes
        (Iterable<String> urls, String outputDirectory)
        throws IOException
    { return downloadImagesGuessTypes("", urls, outputDirectory); }

    /**
     * This will download an entire {@code Vector<String>} of {@code URL's}, and save the
     * output fileNames which were used to save these images.  It will use the
     * {@code StringParse.zeroPad(int)} method to generate filenames - starting with
     * {@code 001.jpg} - or whatever extension was correct.  It will use the <B><I>guessed
     * file-name extension</I></B> that is appropriate for this image.
     *
     * <BR /><BR /><B>NOTE:</B> As the images are downloaded, the fileName is printed via
     * {@code System.out.print()}
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
     * URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
     * int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *
     * // Since there are many "relative" or "partial" URL's, make sure to resolve them
     * // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
     * // Vector<URL>, but that ImageScraper.downloadImagesGuessTypes requires a
     * // Vector<String>, so make sure to convert the output url's to strings.
     *
     * Vector<String> urls = new Vector<String>(imgPosArr.length);
     *
     * Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
     *
     * // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
     * // working directory.
     *
     * ImageScrape.downloadImagesGuessTypes(urls);
     * }</DIV>
     *
     * @param urls is a {@code Vector} of {@code String's} that are to contain image pointers
     *
     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
     * is pre-pended to each of the {@code String's} in the {@code 'urls' Vector}.  This parameter
     * may be null or the empty string ({@code ""}) (and if it is, it will be ignored)
     *
     * @param outputDirectory The files that are downloaded are saved to this directory.  A null
     * value is treated as the empty-string.
     *
     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
     * files, in the same order as {@code 'urls'}.  An entry is {@code null} when the
     * corresponding image could not be saved in any of the known formats.
     *
     * @throws WritableDirectoryException If the provided output directory does not exist, or is
     * not writable, this exception shall throw.  Java will attempt to write a small, temporary
     * file to the directory-name provided.  It will be deleted immediately afterwards.
     *
     * @throws IOException If downloading any one of the images fails.  Images that were already
     * saved before the failure are left in place.
     *
     * @see StringParse#zeroPad(int)
     * @see #downloadImageGuessType(String, String, String)
     */
    public static Vector<String> downloadImagesGuessTypes
        (String rootURL, Iterable<String> urls, String outputDirectory)
        throws IOException
    {
        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        if (rootURL == null) rootURL = "";

        Vector<String>  ret     = new Vector<String>();
        int             count   = 0;

        for (String url : urls)
        {
            String fileName = downloadImageGuessType
                (rootURL + url, StringParse.zeroPad(++count), outputDirectory);

            // A null file-name means the save failed in every format, and a failure message
            // has already been printed.  Only successful saves are echoed here.
            if (fileName != null)
                System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));

            // Stored even when null, so that the result stays aligned with 'urls'
            ret.addElement(fileName);
        }

        return ret;
    }


    /**
     * This downloads an image to a file named {@code 'outputFileStr'}.  A valid image-extension
     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.  The
     * {@code 'extensionStr'} should be {@code String's} such as: {@code 'jpg'} or {@code 'png'}.
     * A single leading {@code '.'} (as in {@code '.jpg'}) is accepted, and is removed before the
     * value is handed to {@code ImageIO.write(...)}.
     *
     * <BR /><BR /><B>NOTE:</B> The image is saved to exactly {@code 'outputFileStr'}; the
     * extension is <B>not</B> appended to the file-name by this method.
     *
     * @param urlStr The {@code URL} of the image to download
     * @param outputFileStr The name of the file to which the image is saved
     * @param extensionStr The image-format with which the image is to be encoded.
     *
     * @throws javax.imageio.IIOException if the content at {@code 'urlStr'} cannot be decoded as
     * an image, or if this file type / {@code 'extensionStr'} are incorrect
     *
     * @throws IOException if reading from the {@code URL} or writing to the file fails
     */
    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
        throws IOException
    {
        // ImageIO format-names do not include the dot ("jpg", not ".jpg")
        String format = ((extensionStr != null) && extensionStr.startsWith("."))
            ? extensionStr.substring(1)
            : extensionStr;

        BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read signals "not a decodable image" with null, rather than an exception
        if (image == null) throw new javax.imageio.IIOException
            ("The content at the following URL could not be decoded as an image: " + urlStr);

        // ImageIO.write signals "no writer for this format / image-type" with false
        if (! ImageIO.write(image, format, new File(outputFileStr)))
            throw new javax.imageio.IIOException(
                "No image-writer was able to save the image from URL: " + urlStr +
                "\nusing format: " + extensionStr
            );
    }

    /**
     * This method will read from a text-file, which must have a list of image-{@code URL's} from
     * the internet - and download them, one by one, to the current working directory.  Messages
     * will be printed as each file is downloaded via {@code System.out.print()}
     *
     * @param f A file pointer to a text-file that contains a list of {@code String's}.  Each
     * line is intended to be a {@code URL} to an image on the internet.  Leading and trailing
     * white-space is removed from each line, and blank lines are skipped.
     *
     * @return a {@code Vector} containing the file-names of these images.
     *
     * @throws FileNotFoundException If {@code 'f'} cannot be opened for reading
     * @throws IOException If reading the text-file, or downloading one of the images, fails
     *
     * @see #downloadImagesGuessTypes(Iterable)
     */
    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
    {
        Vector<String> pics = new Vector<String>();

        // try-with-resources: the reader is closed even when readLine() throws
        try (BufferedReader br = new BufferedReader(new FileReader(f)))
        {
            String s;

            while ((s = br.readLine()) != null)
            {
                s = s.trim();

                // A blank line can never be a valid URL
                if (s.length() > 0) pics.addElement(s);
            }
        }

        return downloadImagesGuessTypes(pics);
    }
}