// ImageScrape.java.html

package Torello.HTML.Tools.Images;

import Torello.Java.StringParse;
import Torello.Java.WritableDirectoryException;

import java.io.*;

import javax.imageio.ImageIO;
import java.net.URL;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.util.Vector;

/**
 * A simple class for scraping &amp; downloading images using a URL, or list of URL's.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
public class ImageScrape
{
    // No public constructor - this class only exposes static methods.
    private ImageScrape() { }

    /**
     * {@code String}-Array having the list of file-formats.  These are tried, in this order,
     * whenever the image-format has to be guessed.
     */
    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };

    /**
     * This will extract the file-extension from an image {@code URL.}  Not all images on the
     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
     * the case that the {@code 'urlStr'} is a pointer to a non-image-file, null will be returned.
     * 
     * <BR /><BR />The comparison against {@link #imageExts} is case-insensitive, and does not
     * depend on the default {@code Locale} of the Java Virtual Machine.
     * 
     * @param urlStr Is the {@code url} of the image.  This parameter may be null, and if it is,
     * null is returned.
     * 
     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
     * file-extension will be returned, otherwise {@code null} will be returned.
     * 
     * @see #imageExts
     */
    public static String getImageTypeFromURL(String urlStr)
    {
        if (urlStr == null) return null;

        // NOTE(review): the 'false' argument presumably asks for the extension without its
        // leading '.' - confirm against StringParse.fromExtension.
        String ext = StringParse.fromExtension(urlStr, false);

        if (ext == null) return null;

        // Locale.ROOT keeps the lower-casing stable in every default locale.  With a Turkish
        // default locale, for instance, "GIF".toLowerCase() would not produce "gif".
        ext = ext.toLowerCase(java.util.Locale.ROOT);

        for (String known : imageExts) if (known.equals(ext)) return known;

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
     * 
     * <BR /><BR />The parameter {@code 'outputFileStr'} is split at its last
     * {@code File.separator} into a directory-part and a file-name-part:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>No separator: the whole {@code String} is the file-name, and the directory is
     *     {@code ""}.</LI>
     * <LI>Separator is the last character: the whole {@code String} is the directory, and the
     *     file-name {@code "IMAGE"} is used.</LI>
     * <LI>Otherwise: everything up to (and including) the last separator is the directory, and
     *     the remainder is the file-name.</LI>
     * </UL>
     * 
     * @param urlStr Is the {@code url} of the image.
     * @param outputFileStr The target file-name (without extension), which may include a
     * leading directory.  May not be null.
     * @return The name of the saved file (without any directory), or null if saving failed.
     * @throws IOException If the image cannot be downloaded, decoded or written.
     */
    public static String downloadImageGuessType(String urlStr, String outputFileStr)
        throws IOException
    { 
        // We need to check whether the file-name that was passed is just a filename; or if it
        // has a directory component in its name.

        int sep = outputFileStr.lastIndexOf(File.separator) + 1;

        if (sep == 0)
            return downloadImageGuessType(urlStr, outputFileStr, "");

        else if (sep == outputFileStr.length())
            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);

        else return downloadImageGuessType
            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
    }

    /**
     * This will download an image, and try to guess if it is one of the following types:
     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
     * extension as a filename, then that format is the first one used to save to a file.  If
     * that fails, each of the formats listed in {@link #imageExts} is tried, in order.
     *
     * <DIV CLASS="EXAMPLE">{@code
     *  // Retrieve all images found on a random Yahoo! News web-page article
     *  URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
     *  
     *  // Parse & Scrape the Web-Page, store it in a local html-vector
     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *  
     *  // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
     *  // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
     *  page = TagNodeGetInclusive.first(page, "article");
     *  
     *  // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
     *  // For the news-article used in this example, the first image was an icon thumbnail.
     *  // The second image contained the "Main Article Photo"
     *  TagNode firstPic    = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
     *  String  urlStr      = Links.resolveSRC(firstPic, url).toString();
     *  
     *  // Run this method.  A file named 'img.jpg' is saved.
     *  System.out.println("Image URL to Download:" + urlStr);
     *  ImageScrape.downloadImageGuessType(urlStr, "img");
     * }</DIV>
     *
     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
     * does contain an image extension in the {@code 'String'}, then this method will first
     * attempt to save the image using that file-extension, before trying the others.
     * 
     * @param outputFileStr This is the target or destination name for the output image file.
     *
     * <BR /><BR /><B>NOTE:</B> This file is not intended to have an extension.  The extension will
     * be generated by the code in this method, and it will match whatever image-file-encoding
     * was successfully used to save the file.  If this is a  {@code '.png'}, for instance, but
     * it did not save until {@code '.bmp'} was used (mis-labeled), this output file will be
     * saved as {@code 'outputFileStr'} + {@code '.bmp'}.
     * 
     * <BR /><BR /><B STYLE='color: red;'>URL vs. File Names:</B> This parameter
     * {@code 'outputFileStr'} <B><I>may NOT be null</I></B>.  It is important to realize, here,
     * that file-names and {@code URL's} <I>do not obey the same naming conventions.</I>  Because
     * it is often seen on the internet image-{@code URL's} that have a plethora of file-system
     * 'irreverent' characters in their name, this method simply cannot pick out the file-name of
     * an image from its {@code URL}.
     * 
     * <BR /><BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
     * the file-name of the image being downloaded is included in the image's {@code URL}).  
     * However, because many of the modern content-providers on the internet use many layers of
     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
     * the image file-name is "too difficult" to discern from its {@code URL}.
     *
     * @param outputDirectory This is just "prepended" to the file-save name.  This
     * {@code 'String'} is not included in the returned filename.  <B>Specifically</B> The returned
     * file name <I>only includes</I> the file-name and the file-name-extension.  It does not
     * include the whole "canonical" or "absolute" directory-path name for this image.  A null
     * value is treated as the empty-{@code String}.
     *
     * @return It will return the name of the file as a result - including the extension type
     * with which the image was actually written.  A format is skipped whenever
     * {@code ImageIO.write} throws {@code javax.imageio.IIOException}, or reports (by returning
     * {@code false}) that no writer is able to encode the image in that format.
     *
     * <BR /><BR /><B>NOTE:</B> {@code 'null'} will be returned if the image failed to save at all.
     *
     * <BR /><BR /><B>It is important to return the filename, since the extension identifies in
     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
     *
     * @throws IllegalArgumentException If {@code 'outputFileStr'} contains the file-separator
     * character.
     * 
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small,
     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
     * 
     * @throws IOException If the {@code URL} is malformed or cannot be read.  Specifically, a
     * {@code javax.imageio.IIOException} is thrown if the downloaded content cannot be decoded
     * as an image.
     * 
     * @see #imageExts
     */
    public static String downloadImageGuessType
        (String urlStr, String outputFileStr, String outputDirectory)
        throws IOException
    {
        // If the "file name" has directory components...  it is just "better" to flag this as
        // an exception

        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
            "any directory-names attached.  If directory names need to be attached to ensure " +
            "that the file is ultimately saved to the proper location in the file-system, " +
            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
            "character."
        );

        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null (it does not throw) when no registered ImageReader is able
        // to decode the content.  Report that clearly, instead of letting ImageIO.write fail
        // later on with an unrelated-looking IllegalArgumentException.

        if (image == null) throw new javax.imageio.IIOException
            ("The content at URL [" + urlStr + "] could not be decoded as an image.");

        String ext = getImageTypeFromURL(urlStr);

        // First choice: the image-format named by the URL itself.
        if (ext != null)
        {
            String saved = tryWrite(image, ext, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        // The URL had no usable extension, or it was mis-labeled: try every known format.
        for (String candidate : imageExts)
        {
            // This format has already failed, directly above.
            if (candidate.equals(ext)) continue;

            String saved = tryWrite(image, candidate, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        System.out.println
            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");

        return null;
    }

    /**
     * Attempts to write {@code 'image'} into {@code 'outputDirectory'}, using a single
     * image-format.
     * 
     * @param image The already decoded image.
     * @param ext The image-format, which is also used as the file-name extension.
     * @param outputFileStr The file-name, without directory and without extension.
     * @param outputDirectory Either the empty-{@code String}, or a directory-name that ends with
     * the file-separator.
     * @return The file-name (name + {@code '.'} + extension, no directory) if the image was
     * written, or null if this format could not be used.
     * @throws IOException Any {@code IOException} thrown by {@code ImageIO.write}, other than
     * {@code javax.imageio.IIOException}.
     */
    private static String tryWrite
        (BufferedImage image, String ext, String outputFileStr, String outputDirectory)
        throws IOException
    {
        String  fName   = outputFileStr + '.' + ext;
        File    f       = new File(outputDirectory + fName);

        try
        {
            // ImageIO.write returns false, without throwing, when no ImageWriter accepts this
            // image for the requested format.  The target file is left alone in that case, so
            // it must not be deleted here.

            return ImageIO.write(image, ext, f) ? fName : null;
        }

        catch (javax.imageio.IIOException e)
        {
            // The write began and then failed: remove the (possibly partial) output file, and
            // let the caller try the next image-format.

            f.delete();
            return null;
        }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes(rootURL, urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes("", urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes
        (Iterable<String> urls, String outputDirectory)
        throws IOException
    { return downloadImagesGuessTypes("", urls, outputDirectory); }

    /**
     * This will download an entire list of {@code URL's}, and save the output fileNames which
     * were used to save these images.  It will use the {@code StringParse.zeroPad(int)} method
     * to generate filenames - starting with {@code 001.jpg} - or whatever extension was correct.
     * It will use the <B><I>guessed file-name extension</I></B> that is appropriate for this
     * image.
     *
     * <BR /><BR /><B>NOTE:</B> As the images are downloaded, the fileName is printed via 
     * {@code System.out.print()}
     *
     * <DIV CLASS="EXAMPLE">{@code
     *  // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
     *  URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
     *  
     *  // Parse & Scrape the Web-Page, store it in a local html-vector
     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *  
     *  // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
     *  int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *  
     *  // Since there are many "relative" or "partial" URL's, make sure to resolve them
     *  // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
     *  // Vector<URL>, but that ImageScrape.downloadImagesGuessTypes requires an
     *  // Iterable<String>, so make sure to convert the output url's to strings.
     * 
     *  Vector<String> urls = new Vector<String>(imgPosArr.length);
     * 
     *  Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
     *  
     *  // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
     *  // working directory.
     * 
     *  ImageScrape.downloadImagesGuessTypes(urls);
     * }</DIV>
     *
     * @param urls is an {@code Iterable} of {@code String's} that are to contain image pointers
     * 
     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
     * is pre-pended to each of the {@code String's} in {@code 'urls'}.  This parameter may be
     * null or the empty string ({@code ""}) (and if it is, it will be ignored)
     * 
     * @param outputDirectory The files that are downloaded are saved to this directory.  A null
     * value is treated as the empty-{@code String}.
     * 
     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
     * files.  There is exactly one element for each element of {@code 'urls'}, in the same
     * order.  An element is null if that particular image could not be saved in any of the
     * known image-formats.
     * 
     * @throws WritableDirectoryException The provided output directory must exist and be
     * writable, or else this exception shall throw.  Java will attempt to write a small, temporary
     * file to the directory-name provided.  It will be deleted immediately afterwards.
     * 
     * @throws IOException If any one of the images cannot be downloaded or decoded.  The
     * images which follow it are not downloaded.
     * 
     * @see StringParse#zeroPad(int)
     * @see #downloadImageGuessType(String, String, String)
     */
    public static Vector<String> downloadImagesGuessTypes
        (String rootURL, Iterable<String> urls, String outputDirectory)
        throws IOException
    {
        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.  Checking
        // here means that a bad directory is reported before anything is downloaded.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        if (rootURL == null) rootURL = "";

        Vector<String>  ret     = new Vector<String>();
        int             count   = 0;

        for (String url : urls)
        {
            String fileName = downloadImageGuessType
                (rootURL + url, StringParse.zeroPad(++count), outputDirectory);

            // A null file-name means the image could not be saved in any format.  The
            // single-image method has already printed a failure note; there is nothing to
            // print here (and 'fileName.length()' would throw NullPointerException).

            if (fileName != null)
                System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));

            // The null is kept, so that the returned Vector stays aligned with 'urls'
            ret.addElement(fileName);
        }

        return ret;
    }


    /**
     * This downloads an image to a file named {@code 'outputFileStr'}.  A valid image-format
     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.  The
     * {@code 'extensionStr'} should be a {@code String} such as: {@code "jpg"} or {@code "png"}.
     * One leading {@code '.'} (as in {@code ".jpg"}) is accepted, and is removed before the
     * format-name is passed to {@code ImageIO.write}.
     * 
     * @param urlStr The {@code URL} of the image to download.
     * @param outputFileStr The complete name of the file to which the image is written.  It is
     * used exactly as given; {@code 'extensionStr'} is <B>not</B> appended to it.
     * @param extensionStr The image-format in which the image is to be saved.
     * @throws javax.imageio.IIOException If the downloaded content cannot be decoded as an
     * image, or if the image cannot be written using the format {@code 'extensionStr'}.
     * @throws IOException If the {@code URL} is malformed or cannot be read.
     */
    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
        throws IOException
    {
        // The format-names known to ImageIO ("jpg", "png", ...) have no leading dot.  A null
        // is passed through unchanged, and is rejected by ImageIO.write itself.

        String format = ((extensionStr != null) && extensionStr.startsWith("."))
            ? extensionStr.substring(1)
            : extensionStr;

        File            f       = new File(outputFileStr);
        BufferedImage   image   = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null when no registered ImageReader can decode the content.
        if (image == null) throw new javax.imageio.IIOException
            ("The content at URL [" + urlStr + "] could not be decoded as an image.");

        // ImageIO.write returns false, without throwing, when no ImageWriter accepts the image
        // for the requested format.  Do not let that pass for a successful save.

        if (! ImageIO.write(image, format, f)) throw new javax.imageio.IIOException(
            "No ImageIO writer was able to save the image at URL [" + urlStr + "] using the " +
            "image-format [" + format + "]."
        );
    }

    /**
     * This method will read from a text-file, which must have a list of image-{@code URL's} from
     * the internet - and download them, one by one, to the current working directory.  Messages
     * will be printed as each file is downloaded via {@code System.out.print()}
     * 
     * <BR /><BR />There must be one {@code URL} on each line of the text-file.  Leading and
     * trailing white-space is removed from each line, and blank lines are skipped.
     * 
     * @param f A file pointer to a text-file that contains a list of {@code String's}.  Each 
     * {@code String} is intended to be a {@code URL} to an image on the internet.
     * 
     * @return a {@code Vector} containing the file-names of these images.
     * 
     * @throws FileNotFoundException If the text-file {@code 'f'} cannot be opened.
     * @throws IOException If the text-file cannot be read, or if one of the images cannot be
     * downloaded.
     * 
     * @see #downloadImagesGuessTypes(Iterable)
     */
    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
    {
        Vector<String> pics = new Vector<String>();

        // try-with-resources: the reader is closed even if reading the file fails part-way.
        try (BufferedReader br = new BufferedReader(new FileReader(f)))
        {
            String s;

            while ((s = br.readLine()) != null)
            {
                s = s.trim();

                // A blank line (a trailing new-line is common) is not a URL.
                if (s.length() > 0) pics.addElement(s);
            }
        }

        return downloadImagesGuessTypes(pics);
    }
}