package Torello.HTML.Tools.Images;

import Torello.Java.StringParse;
import Torello.Java.WritableDirectoryException;

import java.io.*;
import javax.imageio.IIOException;
import javax.imageio.ImageIO;
import java.net.URL;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.util.Locale;
import java.util.Objects;
import java.util.Vector;

/**
 * A simple class for scraping &amp; downloading images using a URL, or list of URL's.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
public class ImageScrape
{
    // No public constructor
    private ImageScrape() { }

    /** {@code String}-Array having the list of file-formats */
    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };

    /**
     * This will extract the file-extension from an image {@code URL.}  Not all images on the
     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
     * the case that the {@code 'urlStr'} is a pointer to a non-image-file, null will be returned.
     *
     * @param urlStr Is the {@code url} of the image.  May be {@code null}, in which case
     * {@code null} is returned.
     *
     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
     * file-extension will be returned (in lower-case), otherwise {@code null} will be returned.
     */
    public static String getImageTypeFromURL(String urlStr)
    {
        if (urlStr == null) return null;

        String ext = StringParse.fromExtension(urlStr, false);

        if (ext == null) return null;

        // Locale.ROOT: the default-locale variant can mis-map letters (e.g. 'I' under a Turkish
        // locale), which would make "GIF" fail to match "gif".
        ext = ext.toLowerCase(Locale.ROOT);

        for (String known : imageExts)
            if (known.equals(ext))
                return known;

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
     *
     * <BR /><BR />The {@code 'outputFileStr'} is split at its last {@code File.separator}:
     * everything up to and including the separator is passed as the output directory, and the
     * remainder is passed as the file-name.  If there is no separator, the directory is the
     * empty-string.  If {@code 'outputFileStr'} ends with the separator (it names only a
     * directory), the file-name {@code "IMAGE"} is used.
     *
     * @param urlStr Is the {@code url} of the image.
     * @param outputFileStr Target file-name (without extension), optionally preceded by a
     * directory path.  May not be null.
     * @return The value returned by the three-argument method.
     */
    public static String downloadImageGuessType(String urlStr, String outputFileStr)
        throws IOException
    {
        // We need to check whether the file-name that was passed is just a filename; or if it
        // has a directory component in its name.
        int sep = outputFileStr.lastIndexOf(File.separator) + 1;

        if (sep == 0)
            return downloadImageGuessType(urlStr, outputFileStr, "");

        if (sep == outputFileStr.length())
            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);

        return downloadImageGuessType
            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
    }

    /**
     * This will download an image, and try to guess if it is one of the following types:
     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
     * extension as a filename, then that format is the first one used to save to a file.  If
     * that attempt fails, each of the remaining formats listed in {@link #imageExts} is tried,
     * in array order, until one of them succeeds.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve all images found on a random Yahoo! News web-page article
     * URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
     * // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
     * page = TagNodeGetInclusive.first(page, "article");
     *
     * // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
     * // For the news-article used in this example, the first image was an icon thumbnail.
     * // The second image contained the "Main Article Photo"
     * TagNode firstPic = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
     * String urlStr = Links.resolveSRC(firstPic, url).toString();
     *
     * // Run this method.  A file named 'img.jpg' is saved.
     * System.out.println("Image URL to Download:" + urlStr);
     * ImageScrape.downloadImageGuessType(urlStr, "img");
     * }</DIV>
     *
     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
     * does contain an image extension in the {@code 'String'}, then this method will attempt to
     * save the image using that file-extension first, and fall back to the other known formats
     * if that attempt fails.
     *
     * @param outputFileStr This is the target or destination name for the output image file.
     *
     * <BR /><BR /><B>NOTE:</B> This file is not intended to have an extension.  The extension will
     * be generated by the code in this method, and it will match whatever image-file-encoding
     * was successfully used to save the file.  If this is a {@code '.png'}, for instance, but
     * it did not save until {@code '.bmp'} was used (mis-labeled), this output file will be
     * saved as {@code 'outputFileStr'} + {@code '.bmp'}.
     *
     * <BR /><BR /><B STYLE='color: red;'>URL vs. File Names:</B> This parameter
     * {@code 'outputFileStr'} <B><I>may NOT be null</I></B>.  It is important to realize, here,
     * that file-names and {@code URL's} <I>do not obey the same naming conventions.</I>  Because
     * it is often seen on the internet image-{@code URL's} that have a plethora of file-system
     * 'irreverent' characters in their name, this method simply cannot pick out the file-name of
     * an image from its {@code URL}.
     *
     * <BR /><BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
     * the file-name of the image being downloaded is included in the image's {@code URL}).
     * However, because many of the modern content-providers on the internet use many layers of
     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
     * the image file-name is "too difficult" to discern from its {@code URL}.
     *
     * @param outputDirectory This is just "prepended" to the file-save name.  A {@code null}
     * value is treated as the empty-string.  This {@code 'String'} is not included in the
     * returned filename.  <B>Specifically</B> The returned file name <I>only includes</I> the
     * file-name and the file-name-extension.  It does not include the whole "canonical" or
     * "absolute" directory-path name for this image.
     *
     * @return The name of the file that was saved - including the extension of the image-format
     * that was successfully written, but <I>not</I> including {@code 'outputDirectory'}.
     *
     * <BR /><BR /><B>NOTE:</B> {@code 'null'} will be returned if the image failed to save in
     * every one of the formats listed in {@link #imageExts}.  A message is printed to
     * {@code System.out} in that case.
     *
     * <BR /><BR /><B>It is important to return the filename, since the extension identifies in
     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
     *
     * @throws NullPointerException If {@code 'outputFileStr'} is null.
     *
     * @throws IllegalArgumentException If {@code 'outputFileStr'} contains the
     * {@code File.separator} character.
     *
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small,
     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
     *
     * @throws javax.imageio.IIOException If the data at {@code 'urlStr'} could not be decoded as
     * an image by any registered {@code ImageReader}.
     *
     * @throws IOException If the download fails, or if writing fails for a reason other than an
     * image-format problem.
     *
     * @see #imageExts
     */
    public static String downloadImageGuessType
        (String urlStr, String outputFileStr, String outputDirectory)
        throws IOException
    {
        Objects.requireNonNull(outputFileStr, "Parameter 'outputFileStr' may not be null");

        // If the "file name" has directory components... it is just "better" to flag this as
        // an exception
        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
            "any directory-names attached.  If directory names need to be attached to ensure " +
            "that the file is ultimately saved to the proper location in the file-system, " +
            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
            "character."
        );

        outputDirectory = prepareOutputDirectory(outputDirectory);

        BufferedImage image = readImage(urlStr);

        // First choice: the format named by the URL itself (if any).
        String urlExt = getImageTypeFromURL(urlStr);

        if (urlExt != null)
        {
            String saved = tryWrite(image, urlExt, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        // NOTE: If saving the file using the named image-extension fails, try the others.
        for (String ext : imageExts)
        {
            // This format was already attempted above, there is no point in repeating it.
            if (ext.equals(urlExt)) continue;

            String saved = tryWrite(image, ext, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        System.out.println
            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");

        return null;
    }

    /**
     * Normalizes and validates an output-directory {@code String}: {@code null} becomes the
     * empty-string, the directory is checked with {@code WritableDirectoryException.check}, and
     * a non-empty directory is guaranteed to end with {@code File.separator}.
     */
    private static String prepareOutputDirectory(String outputDirectory) throws IOException
    {
        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.
        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        return outputDirectory;
    }

    /**
     * Downloads and decodes the image at {@code 'urlStr'}, converting the {@code null} that
     * {@code ImageIO.read} returns for un-decodable data into a descriptive exception.
     */
    private static BufferedImage readImage(String urlStr) throws IOException
    {
        BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null (rather than throwing) when no registered ImageReader
        // claims to be able to decode the stream.
        if (image == null) throw new IIOException
            ("No registered ImageReader was able to decode the data at URL: " + urlStr);

        return image;
    }

    /**
     * Attempts to save {@code 'image'} as {@code outputDirectory + outputFileStr + '.' + ext}.
     * Returns the file-name (without the directory) on success, or {@code null} if this format
     * could not be used to write the image.
     */
    private static String tryWrite
        (BufferedImage image, String ext, String outputFileStr, String outputDirectory)
        throws IOException
    {
        String  fName   = outputFileStr + '.' + ext;
        File    f       = new File(outputDirectory + fName);

        try
        {
            // ImageIO.write reports "no suitable writer" by returning false, not by throwing.
            if (ImageIO.write(image, ext, f)) return fName;
        }
        catch (IIOException e)
        {
            // The writer gave up part-way through: remove whatever was left behind.
            f.delete();
            return null;
        }

        // Nothing was encoded.  Only remove the file if it is an empty leftover, so that an
        // unrelated, pre-existing file having this name is not destroyed.
        if (f.length() == 0L) f.delete();

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes(rootURL, urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes("", urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes
        (Iterable<String> urls, String outputDirectory)
        throws IOException
    { return downloadImagesGuessTypes("", urls, outputDirectory); }

    /**
     * This will download an entire {@code Vector<String>} of {@code URL's}, and save the
     * output fileNames which were used to save these images.  It will use the
     * {@code StringParse.zeroPad(int)} method to generate filenames - starting with
     * {@code 001.jpg} - or whatever extension was correct.  It will use the <B><I>guessed
     * file-name extension</I></B> that is appropriate for this image.
     *
     * <BR /><BR /><B>NOTE:</B> As the images are downloaded, the fileName is printed via
     * {@code System.out.print()}
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
     * URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
     * int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *
     * // Since there are many "relative" or "partial" URL's, make sure to resolve them
     * // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
     * // Vector<URL>, but that ImageScraper.downloadImagesGuessTypes requires a
     * // Vector<String>, so make sure to convert the output url's to strings.
     *
     * Vector<String> urls = new Vector<String>(imgPosArr.length);
     *
     * Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
     *
     * // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
     * // working directory.
     *
     * ImageScrape.downloadImagesGuessTypes(urls);
     * }</DIV>
     *
     * @param urls is a {@code Vector} of {@code String's} that are to contain image pointers
     *
     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
     * is pre-pended to each of the {@code String's} in the {@code 'urls' Vector}.  This parameter
     * may be null or the empty string ({@code ""}) (and if it is, it will be ignored)
     *
     * @param outputDirectory The files that are downloaded are saved to this directory.  A
     * {@code null} value is treated as the empty-string.
     *
     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
     * files, in the same order as {@code 'urls'}.  If an image could not be saved in any of the
     * known formats, the element at its position is {@code null}.
     *
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small, temporary
     * file to the directory-name provided.  It will be deleted immediately afterwards.
     *
     * @throws IOException If any single download fails with an I/O error, or the data it
     * retrieves cannot be decoded as an image.  Images saved before the failure are kept.
     *
     * @see StringParse#zeroPad(int)
     * @see #downloadImageGuessType(String, String, String)
     */
    public static Vector<String> downloadImagesGuessTypes
        (String rootURL, Iterable<String> urls, String outputDirectory)
        throws IOException
    {
        outputDirectory = prepareOutputDirectory(outputDirectory);

        if (rootURL == null) rootURL = "";

        Vector<String>  ret     = new Vector<String>();
        int             count   = 0;

        for (String url : urls)
        {
            String fileName = downloadImageGuessType
                (rootURL + url, StringParse.zeroPad(++count), outputDirectory);

            // A null file-name means "could not be saved".  The failure message was already
            // printed by the callee; a null placeholder keeps 'ret' aligned with 'urls'.
            if (fileName != null)
                System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));

            ret.addElement(fileName);
        }

        return ret;
    }

    /**
     * This downloads an image to a file named {@code 'outputFileStr'}.  A valid image-extension
     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.  The
     * {@code 'extensionStr'} should be a {@code String} such as: {@code 'jpg'} or {@code 'png'}.
     * A single leading period ({@code '.jpg'}, {@code '.png'}) is accepted and ignored.
     *
     * @param urlStr The {@code URL} of the image to download
     * @param outputFileStr The file-name to which the image is saved.  It is used exactly as
     * provided - no extension is appended by this method.
     * @param extensionStr The image-format in which the image is to be saved.
     * @throws javax.imageio.IIOException if the downloaded data cannot be decoded as an image,
     * or if this file type / {@code 'extensionStr'} cannot be used to write the image.
     * @throws IOException if any other I/O error occurs while downloading or writing.
     */
    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
        throws IOException
    {
        // ImageIO format-names carry no period: "jpg", not ".jpg"
        String format = ((extensionStr != null) && extensionStr.startsWith("."))
            ? extensionStr.substring(1)
            : extensionStr;

        File            f       = new File(outputFileStr);
        BufferedImage   image   = readImage(urlStr);

        // A 'false' return means nothing was written; surface that rather than hide it.
        if (! ImageIO.write(image, format, f)) throw new IIOException(
            "No registered ImageWriter could save the image at URL: " + urlStr +
            "\nusing the format: " + extensionStr
        );
    }

    /**
     * This method will read from a text-file, which must have a list of image-{@code URL's} from
     * the internet - and download them, one by one, to the current working directory.  Messages
     * will be printed as each file is downloaded via {@code System.out.print()}
     *
     * @param f A file pointer to a text-file that contains a list of {@code String's}.  Each
     * line is intended to be a {@code URL} to an image on the internet.  Blank lines are
     * skipped.
     *
     * @return a {@code Vector} containing the file-names of these images.
     *
     * @throws FileNotFoundException If {@code 'f'} cannot be opened for reading.
     * @throws IOException If reading the text-file, or downloading any image, fails.
     */
    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
    {
        Vector<String> pics = new Vector<String>();

        // try-with-resources: the reader is closed even if reading throws.
        try (BufferedReader br = new BufferedReader(new FileReader(f)))
        {
            String s;

            while ((s = br.readLine()) != null)
            {
                s = s.trim();

                // A blank line is not a URL, skip it rather than fail on it.
                if (s.length() > 0) pics.addElement(s);
            }
        }

        return downloadImagesGuessTypes(pics);
    }
}