package Torello.HTML.Tools.Images;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.IntFunction;

/**
 * An <CODE>Iterator</CODE> that is intended to be used for retrieving the image-URL's from
 * the page.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PBSURLI>
 */
public class URLIterator implements Iterator<URL>
{
    // Inclusive first and last page numbers, and the page number most recently
    // returned by next().  'cur' starts at (start - 1) so the first call to
    // next() yields page 'start'.
    private int start, end, cur;

    // Maps a page number to the URL for that page; supplied by the caller.
    private IntFunction<URL> getter;

    /**
     * Perhaps as more of these "wonderful" photo-bomb sites are published, more versions
     * of this iterator shall occur. Right now, the easiest way to deal with iterating through
     * the forty or fifty pages of photos, is to indicate the start and end number of the pages,
     * <I><B>and require the user/programmer to provide a lambda function</B></I> "making" the
     * URL out of a cur-position number.
     *
     * @param start This is the integer that is the "first" page of the site.
     *
     * <DIV CLASS="HTML">{@code
     * <!-- This URL has a lot of "Cute Little Bears" being saved in Siberia
     *      The way you can scrape all 39 photos quickly is to iterator through
     *      each of the PHP calls via the value passed to "page=" -->
     * <A HREF='https://www.jerusalemonline.com/view/bear-cubs-jol/?page=1'>
     * }</DIV>
     *
     * @param end This is the integer that contains the last page of the photo-site collection.
     * In the particular case of the "Bears who lost their momma in Siberia" - the last page
     * that is currently available is page number 39.
     *
     * @param urlGetter Any programmer that is familiar with Java Lambda Functions, should
     * know this is just Java's version of a "Function Pointer" from C and C++. This function
     * pointer must be a function that takes as input an integer (which is a page number), and
     * returns as output a URL. This will be called once for each page on the site.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Generally, one might think this should be a single-line lambda expression. Though
     * // single line function pointers are quite common, because calling the constructor to a
     * // URL can generate a MalformedURLException, and because these exceptions are not
     * // sub-classes of RuntimeException, this short lambda has to include a try-catch. Here,
     * // the checked exception is simply converted to NullPointerException - which is
     * // unchecked. The reality is that if proper values are entered for start and end, no
     * // exceptions will occur.
     * URLIterator iter = new URLIterator(1, 39, (int curPage) ->
     * {
     *     try
     *         { return new URL(urlStr + curPage); }
     *     catch (MalformedURLException e)
     *         { throw new NullPointerException("Malformed URL Exception" + e.toString()); }
     * });
     * }</DIV>
     */
    public URLIterator(int start, int end, IntFunction<URL> urlGetter)
    {
        this.getter = urlGetter;
        this.start  = start;
        this.end    = end;

        // One before 'start', so the first next() invocation returns page 'start'.
        this.cur    = start - 1;
    }

    /**
     * Just checks if there are more elements available.
     * @return {@code TRUE} if there are more pages to check, and {@code FALSE} otherwise.
     */
    public boolean hasNext()
    { return cur < end; }

    /**
     * Meeting the requirements of an instance of Java's standard iterator instance.
     * @return This shall return the "next" URL element from the Photo Site.
     * @throws NoSuchElementException If the iteration has already passed the last page
     * number ({@code end}) that was provided to the constructor.
     */
    public URL next()
    {
        cur++;

        if (cur > end) throw new NoSuchElementException(
            "The current iteration counter is: " + cur +
            " but unfortunately, the max-page-number you passed to the constructor is: " + end
        );

        return getter.apply(cur);
    }

    /**
     * Convenience factory for the most common photo-site layout: each page's URL is just
     * the base URL-{@code String} with the page number appended directly to the end.
     *
     * @param baseURLStr The prefix shared by every page's URL.  The page number is
     * concatenated onto this {@code String} to form each page's complete URL.
     * @param startPageNum The first page number, inclusive.  Most often 1 or, possibly, 0.
     * @param lastPageNum The last page number, inclusive.
     * @return An iterator producing one {@code URL} per page number, in ascending order.
     * @throws NullPointerException If {@code baseURLStr} is null.
     * @throws IllegalArgumentException If either page number is negative, or if
     * {@code startPageNum > lastPageNum}.
     * @throws MalformedURLException If the first page's URL does not parse as a valid URL.
     */
    public static URLIterator usual(String baseURLStr, int startPageNum, int lastPageNum)
        throws MalformedURLException
    {
        CHECK_EXCEPTIONS(baseURLStr, startPageNum, lastPageNum);

        return new URLIterator(startPageNum, lastPageNum, (int curPage) ->
        {
            try
                { return new URL(baseURLStr + curPage); }
            catch (MalformedURLException e)
                { throw new NullPointerException("Malformed URL Exception" + e.toString()); }

            // CHEAP-TRICK: Compile-Time Exception to Runtime Exception...  However, the
            // base-URL has already been tested, and therefore this exception NEEDS to be
            // suppressed...  NOTE: This exception should *NEVER* throw...
        });
    }

    /**
     * Convenience factory for photo-sites whose page URL has the page number in the
     * <I>middle</I>: each page's URL is {@code url + pageNumber + appendParamStr}.
     *
     * @param url The portion of every page's URL that precedes the page number.
     * @param appendParamStr The portion of every page's URL that follows the page number.
     * @param startPageNum The first page number, inclusive.  Most often 1 or, possibly, 0.
     * @param lastPageNum The last page number, inclusive.
     * @return An iterator producing one {@code URL} per page number, in ascending order.
     * @throws NullPointerException If {@code url} is null.
     * @throws IllegalArgumentException If either page number is negative, or if
     * {@code startPageNum > lastPageNum}.
     * @throws MalformedURLException If a probe URL (built with page number 1) does not
     * parse as a valid URL.
     */
    public static URLIterator usual
        (String url, String appendParamStr, int startPageNum, int lastPageNum)
        throws MalformedURLException
    {
        // BUG-FIX: null-check BEFORE concatenating.  Previously a null 'url' was folded
        // into the probe-string as the literal text "null", which mis-reported the error
        // as a MalformedURLException ("no protocol: null...") rather than the intended
        // NullPointerException.
        if (url == null) throw new NullPointerException("A null value was passed as a url.");

        // Probe with page number 1 to fail-fast on an unparsable URL pattern.
        CHECK_EXCEPTIONS(url + 1 + appendParamStr, startPageNum, lastPageNum);

        return new URLIterator(startPageNum, lastPageNum, (int curPage) ->
        {
            try
                { return new URL(url + curPage + appendParamStr); }
            catch (MalformedURLException e)
                { throw new NullPointerException("Malformed URL Exception" + e.toString()); }

            // CHEAP-TRICK: Compile-Time Exception to Runtime Exception...  However, the
            // base-URL has already been tested, and therefore this exception NEEDS to be
            // suppressed...  NOTE: This exception should *NEVER* throw...
        });
    }

    /**
     * Validates the parameters passed to the {@code usual(...)} factory methods before
     * any iterator is constructed.
     *
     * @param url A complete, representative page-URL as a {@code String}.
     * @param startPageNum The first page number, inclusive.
     * @param lastPageNum The last page number, inclusive.
     * @throws NullPointerException If {@code url} is null.
     * @throws IllegalArgumentException If either page number is negative, or if
     * {@code startPageNum > lastPageNum}.
     * @throws MalformedURLException If {@code url} does not parse as a valid URL.
     */
    public static void CHECK_EXCEPTIONS(String url, int startPageNum, int lastPageNum)
        throws MalformedURLException
    {
        // FAIL-FAST: Check user input before the iterator starts iterating.

        // BUG-FIX: the null reference is now rejected FIRST, so a null url always
        // surfaces as NullPointerException rather than an unrelated failure.
        if (url == null) throw new NullPointerException
            ("A null value was passed as a url.");

        if (startPageNum < 0) throw new IllegalArgumentException(
            "The value passed to the starting-page-number parameter [" + startPageNum + "], " +
            "was negative.  Most often it is 1 or, possibly, 0."
        );

        if (lastPageNum < 0) throw new IllegalArgumentException(
            "The value passed to the ending-page-number parameter [" + lastPageNum +
            "], was negative."
        );

        // BUG-FIX: this message previously labelled startPageNum as "the ending-page-number
        // parameter"; it is, of course, the starting-page-number.
        if (startPageNum > lastPageNum) throw new IllegalArgumentException(
            "The value passed to the starting-page-number parameter [" + startPageNum +
            "], was greater " +
            "than the value passed to ending-page-number parameter [" + lastPageNum + "]."
        );

        // FAIL-FAST: This should be a valid URL as a String.  This invocation will throw the
        // MalformedURLException if it is not.
        new URL(url);
    }
}