1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
package Torello.HTML.Tools.Images;

import Torello.Java.StringParse;
import Torello.Java.WritableDirectoryException;

import java.io.*;

import javax.imageio.ImageIO;
import java.net.URL;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.util.Vector;

/**
 * A simple class for scraping & downloading images using a URL, or list of URL's.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
public class ImageScrape
{
    // No public constructor
    private ImageScrape() { }

    /**
     * {@code String}-Array having the list of file-formats.  These are the informal format-names
     * (without a leading {@code '.'}) that are handed to {@code ImageIO.write(...)}, and they are
     * tried in this order when the format of an image must be guessed.
     */
    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };

    /**
     * This will extract the file-extension from an image {@code URL.}  Not all images on the
     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
     * the case that the {@code 'urlStr'} is a pointer to a non-image-file, null will be returned.
     * 
     * @param urlStr Is the {@code url} of the image.  May be {@code null}, in which case
     * {@code null} is returned.
     * 
     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
     * file-extension will be returned (in lower-case), otherwise {@code null} will be returned.
     * 
     * @see #imageExts
     */
    public static String getImageTypeFromURL(String urlStr)
    {
        if (urlStr == null) return null;

        // NOTE(review): 'fromExtension' is presumed to return the extension WITHOUT the leading
        // dot when its second argument is 'false' - confirm against StringParse.
        String ext = StringParse.fromExtension(urlStr, false);

        if (ext == null) return null;

        // Locale.ROOT keeps the case-conversion independent of the JVM's default locale.  With a
        // Turkish default locale, "GIF".toLowerCase() produces a dot-less 'i' and never matches.
        ext = ext.toLowerCase(java.util.Locale.ROOT);

        for (String known : imageExts) if (known.equals(ext)) return known;

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
     * 
     * <BR /><BR />The {@code 'outputFileStr'} passed here is split at its last
     * {@code File.separator} into a directory part and a file-name part.  If there is no
     * separator, the current directory ({@code ""}) is used.  If the {@code String} <I>ends</I>
     * with the separator (meaning only a directory was provided), the file-name
     * {@code "IMAGE"} is used.
     * 
     * @param urlStr Is the {@code url} of the image.
     * @param outputFileStr Target file-name (without extension), optionally preceded by a
     * directory.  May not be null.
     * @return The saved file-name (without directory), or {@code null} if saving failed.
     * @throws IOException If the image cannot be retrieved or decoded.
     */
    public static String downloadImageGuessType(String urlStr, String outputFileStr)
        throws IOException
    { 
        // We need to check whether the file-name that was passed is just a filename; or if it
        // has a directory component in its name.

        int sep = outputFileStr.lastIndexOf(File.separator) + 1;

        if (sep == 0)
            return downloadImageGuessType(urlStr, outputFileStr, "");

        else if (sep == outputFileStr.length())
            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);

        else return downloadImageGuessType
            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
    }

    /**
     * This will download an image, and try to guess if it is one of the following types:
     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
     * extension as a filename, then that format is the first one used to save to a file.  If
     * that format cannot encode the image, each of the remaining formats in {@link #imageExts}
     * is tried, in order, until one succeeds.
     *
     * <DIV CLASS="EXAMPLE">{@code
     *  // Retrieve all images found on a random Yahoo! News web-page article
     *  URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
     *  
     *  // Parse & Scrape the Web-Page, store it in a local html-vector
     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *  
     *  // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
     *  // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
     *  page = TagNodeGetInclusive.first(page, "article");
     *  
     *  // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
     *  // For the news-article used in this example, the first image was an icon thumbnail.
     *  // The second image contained the "Main Article Photo"
     *  TagNode firstPic    = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
     *  String  urlStr      = Links.resolveSRC(firstPic, url).toString();
     *  
     *  // Run this method.  A file named 'img.jpg' is saved.
     *  System.out.println("Image URL to Download:" + urlStr);
     *  ImageScrape.downloadImageGuessType(urlStr, "img");
     * }</DIV>
     *
     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
     * does contain an image extension in the {@code 'String'}, then this method will first
     * attempt to save the image using that file-extension, before falling back to the other
     * formats listed in {@link #imageExts}.
     * 
     * @param outputFileStr This is the target or destination name for the output image file.
     * It must be a simple file-name; it may not contain the {@code File.separator} character.
     *
     * <BR /><BR /><B>NOTE:</B> This file is not intended to have an extension.  The extension will
     * be generated by the code in this method, and it will match whatever image-file-encoding
     * was successfully used to save the file.  If this is a  {@code '.png'}, for instance, but
     * it did not save until {@code '.bmp'} was used (mis-labeled), this output file will be
     * saved as {@code 'outputFileStr'} + {@code '.bmp'}.
     * 
     * <BR /><BR /><B STYLE='color: red;'>URL vs. File Names:</B> This parameter
     * {@code 'outputFileStr'} <B><I>may NOT be null</I></B>.  It is important to realize, here,
     * that file-names and {@code URL's} <I>do not obey the same naming conventions.</I>  Because
     * image-{@code URL's} on the internet often contain a plethora of characters that are not
     * valid in file-system names, this method simply cannot pick out the file-name of an image
     * from its {@code URL}.
     * 
     * <BR /><BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
     * the file-name of the image being downloaded is included in the image's {@code URL}).  
     * However, because many of the modern content-providers on the internet use many layers of
     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
     * the image file-name is "too difficult" to discern from its {@code URL}.
     *
     * @param outputDirectory This is just "prepended" to the file-save name.  {@code null} is
     * treated as the empty-{@code String} (the current directory).  This {@code 'String'} is not
     * included in the returned filename.  <B>Specifically</B> The returned file name <I>only
     * includes</I> the file-name and the file-name-extension.  It does not include the whole
     * "canonical" or "absolute" directory-path name for this image.
     *
     * @return The name of the file that was written, including the extension of the first format
     * that was able to encode the image.
     *
     * <BR /><BR /><B>NOTE:</B> {@code 'null'} will be returned if the image could not be saved
     * using any of the formats in {@link #imageExts}.  A note is printed to {@code System.out}
     * when this happens.
     *
     * <BR /><BR /><B>It is important to return the filename, since the extension identifies in
     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
     *
     * @throws IllegalArgumentException If {@code 'outputFileStr'} contains the
     * {@code File.separator} character.
     * 
     * @throws IOException If the {@code URL} is malformed or cannot be read.  Specifically, a
     * {@code javax.imageio.IIOException} is thrown if the content at {@code 'urlStr'} cannot be
     * decoded as an image by any registered {@code ImageReader}.
     * 
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small,
     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
     * 
     * @see #imageExts
     */
    public static String downloadImageGuessType
        (String urlStr, String outputFileStr, String outputDirectory)
        throws IOException
    {
        // If the "file name" has directory components...  it is just "better" to flag this as
        // an exception

        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
            "any directory-names attached.  If directory names need to be attached to ensure " +
            "that the file is ultimately saved to the proper location in the file-system, " +
            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
            "character."
        );

        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.
        outputDirectory = withTrailingSeparator(outputDirectory);

        BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null (rather than throwing) when no registered ImageReader
        // recognizes the content.  Report that clearly, instead of letting ImageIO.write fail
        // later with an unhelpful "image == null!" message.
        if (image == null) throw new javax.imageio.IIOException
            ("No registered ImageReader was able to decode the content found at: " + urlStr);

        // First choice: the format that is named by the URL itself (if any).
        String ext = getImageTypeFromURL(urlStr);

        if (ext != null)
        {
            String fName = outputFileStr + '.' + ext;

            if (writeImage(image, ext, new File(outputDirectory + fName))) return fName;
        }

        // NOTE: If saving the file using the named image-extension fails, try the others.
        for (String candidate : imageExts)
        {
            // This format was already attempted above, and the result would be the same.
            if (candidate.equals(ext)) continue;

            String fName = outputFileStr + '.' + candidate;

            // The output directory must be included here too, or else the fall-back file would
            // be written to the current working directory.
            if (writeImage(image, candidate, new File(outputDirectory + fName))) return fName;
        }

        System.out.println
            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");

        return null;
    }

    // Appends File.separator to a non-empty directory-name that does not already end with it.
    private static String withTrailingSeparator(String directory)
    {
        if ((directory.length() > 0) && (! directory.endsWith(File.separator)))
            return directory + File.separator;

        return directory;
    }

    // Attempts to encode 'image' as 'format' into 'target'.  Returns TRUE only if a file was
    // actually written.
    private static boolean writeImage(BufferedImage image, String format, File target)
        throws IOException
    {
        try
        {
            // ImageIO.write returns 'false' - without throwing, and without touching the
            // file-system - when no ImageWriter supports this image / format combination.
            return ImageIO.write(image, format, target);
        }

        catch (javax.imageio.IIOException e)
        {
            // The encoder failed part-way through.  Remove whatever partial file it left behind.
            target.delete();

            return false;
        }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes(rootURL, urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes("", urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes
        (Iterable<String> urls, String outputDirectory)
        throws IOException
    { return downloadImagesGuessTypes("", urls, outputDirectory); }

    /**
     * This will download an entire list of image-{@code URL's}, and save the output fileNames
     * which were used to save these images.  It will use the
     * {@code StringParse.zeroPad(int)} method to generate filenames from a counter that starts
     * at {@code 1}.  It will use the <B><I>guessed file-name extension</I></B> that is
     * appropriate for each image.
     *
     * <BR /><BR /><B>NOTE:</B> As the images are downloaded, the fileName is printed via 
     * {@code System.out.print()}
     *
     * <DIV CLASS="EXAMPLE">{@code
     *  // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
     *  URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
     *  
     *  // Parse & Scrape the Web-Page, store it in a local html-vector
     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *  
     *  // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
     *  int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *  
     *  // Since there are many "relative" or "partial" URL's, make sure to resolve them
     *  // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
     *  // Vector<URL>, but that ImageScraper.downloadImagesGuessTypes requires a 
     *  // Vector<String>, so make sure to convert the output url's to strings.
     * 
     *  Vector<String> urls = new Vector<String>(imgPosArr.length);
     * 
     *  Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
     *  
     *  // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
     *  // working directory.
     * 
     *  ImageScrape.downloadImagesGuessTypes(urls);
     * }</DIV>
     *
     * @param urls is an {@code Iterable} of {@code String's} that are to contain image pointers
     * 
     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
     * is pre-pended to each of the {@code String's} in {@code 'urls'}.  This parameter
     * may be {@code null} or the empty string ({@code ""}) (and if it is, it will be ignored)
     * 
     * @param outputDirectory The files that are downloaded are saved to this directory.
     * {@code null} is treated as the empty-{@code String} (the current directory).
     * 
     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
     * files, in the same order as {@code 'urls'}.  An element is {@code null} when the
     * corresponding image was retrieved, but could not be saved in any supported format.
     * 
     * @throws IOException If any one of the images cannot be retrieved or decoded.  The
     * remaining {@code URL's} are not processed in that case.
     * 
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small, temporary
     * file to the directory-name provided.  It will be deleted immediately afterwards.
     * 
     * @see StringParse#zeroPad(int)
     * @see #downloadImageGuessType(String, String, String)
     */
    public static Vector<String> downloadImagesGuessTypes
        (String rootURL, Iterable<String> urls, String outputDirectory)
        throws IOException
    {
        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.  This is
        // done up-front so that a bad directory is reported before anything is downloaded.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.
        outputDirectory = withTrailingSeparator(outputDirectory);

        if (rootURL == null) rootURL = "";

        Vector<String>  ret     = new Vector<String>();
        int             count   = 0;

        for (String url : urls)
        {
            String fileName = downloadImageGuessType
                (rootURL + url, StringParse.zeroPad(++count), outputDirectory);

            // A null file-name means that the image could not be saved.  A failure note has
            // already been printed for it, and calling fileName.length() would throw.
            if (fileName != null)
                System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));

            // The (possibly null) result is always stored, so that the positions in the
            // returned Vector line up with the positions of the input URL's.
            ret.addElement(fileName);
        }

        return ret;
    }


    /**
     * This downloads an image to a file named {@code 'outputFileStr'}.  A valid image-format
     * name needs to be provided for the java {@code ImageIO.write(...)} method to work properly.
     * 
     * @param urlStr The {@code URL} of the image to download
     * 
     * @param outputFileStr The complete name of the file to which the image is saved.  It is
     * used exactly as provided; {@code 'extensionStr'} is <B>not</B> appended to it.
     * 
     * @param extensionStr The image-format in which to encode the image.  This should be a
     * {@code String} such as {@code "jpg"} or {@code "png"}.  A single leading {@code '.'} (as
     * in {@code ".jpg"}) is accepted, and is removed before the name is passed to
     * {@code ImageIO.write(...)}.
     * 
     * @throws javax.imageio.IIOException if the content at {@code 'urlStr'} cannot be decoded as
     * an image, or if this file type / {@code 'extensionStr'} cannot be used to encode it.
     * 
     * @throws IOException if the {@code URL} is malformed, or if reading or writing fails.
     */
    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
        throws IOException
    {
        File            f       = new File(outputFileStr);
        BufferedImage   image   = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null when no registered ImageReader recognizes the content.
        if (image == null) throw new javax.imageio.IIOException
            ("No registered ImageReader was able to decode the content found at: " + urlStr);

        // ImageIO format-names do not include a dot.  Passing ".jpg" would find no writer.
        String format = ((extensionStr != null) && extensionStr.startsWith("."))
            ? extensionStr.substring(1)
            : extensionStr;

        // ImageIO.write returns 'false', and writes nothing, when no ImageWriter is available
        // for the requested format.  That must not go unnoticed by the caller.
        if (! ImageIO.write(image, format, f)) throw new javax.imageio.IIOException(
            "No ImageWriter is available that can save this image using format '" + format +
            "'.  Image URL: " + urlStr
        );
    }

    /**
     * This method will read from a text-file, which must have a list of image-{@code URL's} from
     * the internet - and download them, one by one, to the current directory.  Messages will be
     * printed as each file is downloaded via {@code System.out.print()}
     * 
     * @param f A file pointer to a text-file that contains a list of {@code String's}, one per
     * line.  Each {@code String} is intended to be a {@code URL} to an image on the internet.
     * Leading and trailing white-space is removed from each line, and blank lines are skipped.
     * 
     * @return a {@code Vector} containing the file-names of these images.
     * 
     * @throws FileNotFoundException If {@code 'f'} cannot be opened for reading.
     * @throws IOException If the list cannot be read, or an image cannot be retrieved.
     * 
     * @see #downloadImagesGuessTypes(Iterable)
     */
    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
    {
        Vector<String> pics = new Vector<String>();

        // try-with-resources guarantees that the reader is closed, even if reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader(f)))
        {
            String s;

            while ((s = br.readLine()) != null)
            {
                s = s.trim();

                // A blank line is not a URL.  Passing it along would abort the entire download.
                if (s.length() > 0) pics.addElement(s);
            }
        }

        return downloadImagesGuessTypes(pics);
    }
}