Source code

001package Torello.HTML.Tools.Images;
002
003import Torello.Java.StringParse;
004import Torello.Java.WritableDirectoryException;
005
006import java.io.*;
007
008import javax.imageio.ImageIO;
009import java.net.URL;
010import java.awt.Image;
011import java.awt.image.BufferedImage;
012import java.util.Vector;
013
/**
 * A simple class for scraping &amp; downloading images using a URL, or a list of URLs.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
019public class ImageScrape
020{
021    // No public constructor
022    private ImageScrape() { }
023
024    /** {@code String}-Array having the list of file-formats */
025    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };
026
027    /**
028     * This will extract the file-extension from an image {@code URL.}  Not all images on the
029     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
030     * the case that the {@code 'urStr'} is a pointer to a non-image-file, null will be returned.
031     * 
032     * @param urlStr Is the {@code url} of the image. 
033     * 
034     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
035     * file-extension will be returned, otherwise {@code null} will be returned.
036     */
037    public static String getImageTypeFromURL(String urlStr)
038    {
039        if (urlStr == null) return null;
040
041        String ext = StringParse.fromExtension(urlStr, false);
042
043        if (ext == null) return null;
044
045        ext = ext.toLowerCase();
046
047        for (int i=0; i < imageExts.length; i++) if (imageExts[i].equals(ext)) return imageExts[i];
048
049        return null;
050    }
051
052    /**
053     * Convenience Method.
054     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
055     */
056    public static String downloadImageGuessType(String urlStr, String outputFileStr)
057        throws IOException
058    { 
059        // We need to check whether the file-name that was passed is just a filename; or if it
060        // has a directory component in its name.
061
062        int sep = outputFileStr.lastIndexOf(File.separator) + 1;
063
064        if (sep == 0)
065            return downloadImageGuessType(urlStr, outputFileStr, "");
066
067        else if (sep == outputFileStr.length())
068            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);
069
070        else return downloadImageGuessType
071            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
072    }
073
074    /**
075     * This will download an image, and try to guess if it is one of the following types:
076     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
077     * extension as a filename, then that format will be used to save to a file.  If that fails,
078     * an exception of type {@code javax.imageio.IIOException} is thrown.
079     *
080     * <DIV CLASS="EXAMPLE">{@code
081     *  // Retrieve all images found on a random Yahoo! News web-page article
082     *  URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
083     *  
084     *  // Parse & Scrape the Web-Page, store it in a local html-vector
085     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
086     *  
087     *  // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
088     *  // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
089     *  page = TagNodeGetInclusive.first(page, "article");
090     *  
091     *  // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
092     *  // For the news-article used in this example, the first image was an icon thumbnail.
093     *  // The second image contained the "Main Article Photo"
094     *  TagNode firstPic    = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
095     *  String  urlStr      = Links.resolveSRC(firstPic, url).toString();
096     *  
097     *  // Run this method.  A file named 'img.jpg' is saved.
098     *  System.out.println("Image URL to Download:" + urlStr);
099     *  ImageScrape.downloadImageGuessType(urlStr, "img");
100     * }</DIV>
101     *
102     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
103     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
104     * does contain an image extension in the {@code 'String'}, then this method will attempt to
105     * save the image using the appropriate file-extension, and throw an {@code 'IIOException'} if
106     * it fails.
107     * 
108     * @param outputFileStr This is the target or destination name for the output image file.
109     *
110     * <BR /><BR /><B>NOTE:</B> This file is not intended to have an extension.  The extension will
111     * be generated by the code in this method, and it will match whatever image-file-encoding
112     * was successfully used to download the file.  If this is a  {@code '.png'}, for instance, but
113     * it did not download until {@code '.bmp'} was used (mis-labeled), this output file will be
114     * saved as {@code 'outputFileStr'} + {@code '.bmp'}.
115     * 
116     * <BR /><BR /><B STYLE='color: red;'>URL vs. File Names:</B> This parameter
117     * {@code 'outputFileStr'} <B><I>may NOT be null</I></B>.  It is important to realize, here,
118     * that file-names and {@code URL's} <I>do not obey the same naming conventions.</I>  Because
119     * it is often seen on the internet image-{@code URL's} that have a plethora of file-system
120     * 'irreverent' characters in their name, this method simply cannot pick out the file-name of
121     * an image from its {@code URL}.
122     * 
123     * <BR /><BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
124     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
125     * the file-name of the image being downloaded is included in the image's {@code URL}).  
126     * However, because many of the modern content-providers on the internet use many layers of
127     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
128     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
129     * the image file-name is "too difficult" to discern from it's {@code URL}.
130     *
131     * @param outputDirectory This is just "prepended" to the file-save name.  This
132     * {@code 'String'} is not included in the returned filename.  <B>Specifically</B> The returned
133     * file name <I>only includes</I> the file-name and the file-name-extension.  It does not
134     * include the whole "canonical" or "absolute" directory-path name for this image.
135     *
136     * @return It will return the name of the file as a result - including the extension type
137     * which did not throw a {@code javax.imageio.IIOException.}  This exception is thrown
138     * whenever an image, of - for instance {@code '.png'} format tries to save as a
139     * {@code '.jpg'}, or any other incorrect image-format.
140     *
141     * <BR /><BR /><B>NOTE:</B> {@code 'null'} will be returned if the image failed to save at all.
142     *
143     * <BR /><BR /><B>ALSO:</B> If the passed {@code 'urlStr'} does not save properly,
144     * {@code javax.imageio.IIOException} will also be thrown.
145     *
146     * <BR /><BR /><B>It is important to return the filename, since the extension identifies in
147     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
148     *
149     * @throws WritableDirectoryException If the provided output directory must exist and be
150     * writable, or else this exception shall throw.  Java will attempt to write a small,
151     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
152     * 
153     * @see #imageExts
154     */
155    public static String downloadImageGuessType
156        (String urlStr, String outputFileStr, String outputDirectory)
157        throws IOException
158    {
159        // If the "file name" has directory components...  it is just "better" to flag this as
160        // an exception
161
162        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
163            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
164            "any directory-names attached.  If directory names need to be attached to ensure " +
165            "that the file is ultimately saved to the proper location in the file-system, " +
166            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
167            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
168            "character."
169        );
170
171        if (outputDirectory == null) outputDirectory = "";
172
173        // Make sure the directory exists on the file-system, and that it is writable.
174        WritableDirectoryException.check(outputDirectory);
175
176        // Unless writing the "current directory" - make sure the directory name ends with the
177        // Operating System file-separator character.
178
179        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
180            outputDirectory = outputDirectory + File.separator;
181
182        BufferedImage   image   = ImageIO.read(new URL(urlStr));
183        String          ext     = getImageTypeFromURL(urlStr);
184        File            f       = null;
185
186        if (ext != null) 
187
188            try
189            {
190                String fName = outputFileStr + '.' + ext;
191                f = new File(outputDirectory + fName);
192
193                ImageIO.write(image, ext, f);
194
195                return fName;
196            }
197
198            // NOTE: If saving the file using the named image-extension fails, try the other.
199            catch (javax.imageio.IIOException e) { f.delete(); }
200
201        for (int i=0; i < imageExts.length; i++)
202
203            try
204            {
205                f = new File(outputFileStr + '.' + imageExts[i]);
206
207                ImageIO.write(image, imageExts[i], f);
208
209                return outputFileStr + '.' + imageExts[i];
210            }
211
212            catch (javax.imageio.IIOException e) { f.delete(); continue; }
213
214        System.out.println
215            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");
216
217        return null;
218    }
219
220    /**
221     * Convenience Method.
222     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
223     */
224    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
225        throws IOException
226    { return downloadImagesGuessTypes(rootURL, urls, ""); }
227
228    /**
229     * Convenience Method.
230     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
231     */
232    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
233        throws IOException
234    { return downloadImagesGuessTypes("", urls, ""); }
235
236    /**
237     * Convenience Method.
238     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
239     */
240    public static Vector<String> downloadImagesGuessTypes
241        (Iterable<String> urls, String outputDirectory)
242        throws IOException
243    { return downloadImagesGuessTypes("", urls, outputDirectory); }
244
245    /**
246     * This will download an entire {@code Vector<String>} of {@code URL's}, and save the
247     * output fileNames which were used to save these images.  It will use a the 
248     * {@code StringParse.zeroPad(int)} method to generate filenames - starting with 
249     * {@code 001.jpg} - or whatever extension was correct.  It will use the <B><I>guessed 
250     * file-name extension</I></B> that is appropriate for this image.
251     *
252     * <BR /><BR /><B>NOTE:</B> As the images are downloaded, the fileName is printed via 
253     * {@code System.out.println()}
254     *
255     * <DIV CLASS="EXAMPLE">{@code
256     *  // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
257     *  URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
258     *  
259     *  // Parse & Scrape the Web-Page, store it in a local html-vector
260     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
261     *  
262     *  // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
263     *  int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
264     *  
265     *  // Since there are many "relative" or "partial" URL's, make sure to resolve them
266     *  // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
267     *  // Vector<URL>, but that ImageScraper.downloadImagesGuessTypes requires a 
268     *  // Vector<String>, so make sure to convert the output url's to strings.
269     * 
270     *  Vector<String> urls = new Vector<String>(imgPosArr.length);
271     * 
272     *  Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
273     *  
274     *  // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
275     *  // working directory.
276     * 
277     *  ImageScrape.downloadImagesGuessTypes(urls);
278     * }</DIV>
279     *
280     * @param urls is a {@code Vector} of {@code String's} that are to contain image pointers
281     * 
282     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
283     * is pre-pended to each of the {@code String's} in the {@code 'urls' Vector}.  This parameter
284     * may contain the empty string ({@code ""}) (and if it is, it will be ignored)
285     * 
286     * @param outputDirectory The files that are downloaded are saved to this directory.
287     * 
288     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
289     * files.
290     * 
291     * @throws WritableDirectoryException If the provided output directory must exist and be
292     * writable, or else this exception shall throw.  Java will attempt to write a small, temporary
293     * file to the directory-name provided.  It will be deleted immediately afterwards.
294     * 
295     * @see StringParse#zeroPad(int)
296     * @see #downloadImageGuessType(String, String)
297     */
298    public static Vector<String> downloadImagesGuessTypes
299        (String rootURL, Iterable<String> urls, String outputDirectory)
300        throws IOException
301    {
302        if (outputDirectory == null) outputDirectory = "";
303
304        // Make sure the directory exists on the file-system, and that it is writable.
305        WritableDirectoryException.check(outputDirectory);
306
307        // Unless writing the "current directory" - make sure the directory name ends with the
308        // Operating System file-separator character.
309
310        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
311            outputDirectory = outputDirectory + File.separator;
312
313        if (rootURL == null) rootURL = "";
314
315        Vector<String>  ret     = new Vector<String>();
316        int             count   = 0;
317
318        for (String url : urls)
319        {
320            String fileName = downloadImageGuessType
321                (rootURL + url, StringParse.zeroPad(++count), outputDirectory);
322
323            System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));
324
325            ret.addElement(fileName);
326        }
327
328        return ret;
329    }
330
331
332    /**
333     * This downloads an image to a a file named {@code 'outputFileStr'}.  A valid image-extension
334     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.  The
335     * {@code 'extensionStr'} should be {@code String's} such as: {@code '.jpg'} or {@code '.png'}
336     * 
337     * @param urlStr The {@code URL} of the image which generated the exception
338     * @param outputFileStr The intended file-name root to which the image is supposed to save
339     * @param extensionStr The intended file-name extension to which this image was to be saved.
340     * @throws java.imageio.IIOException - if this file type / {@code 'extensionStr'} are incorrect
341    */
342    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
343        throws IOException
344    {
345        File            f       = new File(outputFileStr);
346        BufferedImage   image   = ImageIO.read(new URL(urlStr));
347
348        ImageIO.write(image, extensionStr, f);
349    }
350
351    /**
352     * This method will read from a text-file, which must have a list of image-{@code URL's} from
353     * the internet - and download them, one by one, to a directory.  Messages will be printed as
354     * each file is downloaded via {@code System.out.print()}
355     * 
356     * @param f A file pointer to a text-file that contains a list of {@code String's}.  Each 
357     * {@code String} is intended to be a {@code URL} to an image on the internet.
358     * 
359     * @return a {@code Vector} containing the file-names of these images.
360     */
361    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
362    {
363        BufferedReader  br      = new BufferedReader(new FileReader(f));
364        Vector<String>  pics    = new Vector<String>();
365        String          s;
366
367        while ((s = br.readLine()) != null) pics.addElement(s);
368
369        return downloadImagesGuessTypes(pics);
370    }
371}