001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006import Torello.HTML.HTMLPage.Parser;
007
008import java.util.concurrent.*;
009import java.util.concurrent.locks.*;
010
011import Torello.HTML.Tools.JavaDoc.JDHeaderBackgroundImg;
012import Torello.HTML.Tools.JavaDoc.StaticFunctional;
013import Torello.HTML.Tools.JavaDoc.Excuse;
014
015/**
016 * A carbon-copy of class {@link HTMLPage}, augmented with a mechanism for setting <B>a timeout</B> 
017 * so that when scraping web-pages and {@code URL's} from servers that might have a tendency to hang,
018 * freeze, or delay - the Java Virtual Machine can skip and move-on when that timeout expires. 
019 * 
020 * <BR /><BR />
021 * <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MWT>
022 * <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE>
023 * 
024 * @see Scrape#getHTML(BufferedReader, int, int)
025 * @see Scrape#getHTML(BufferedReader, String, String)
026 * @see HTMLPage
027 */
028@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
029@JDHeaderBackgroundImg
030public class HTMLPageMWT
031{
032    private HTMLPageMWT() { }
033
034    /**
035     * If needing to "swap a proprietary parser" comes up, this is possible.
036     * It just needs to accept the same parameters as the current parser, and produce a 
037     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
038     * parser has been tested and happens to be generating different results, it can be easily
039     * 'swapped out' for the one used now.
040     * @see HTMLPage.Parser
041     * @see HTMLPage.Parser#parse(CharSequence, boolean, String, String, String)
042     */
043    public static Parser parser = Torello.HTML.parse.ParserRE::parsePageTokens;
044
045    // ***************************************************************************************************
046    // These 6 functions presume that the HTML source is from a URL
047    // ***************************************************************************************************
048    
049    /**
050     * Parses and Vectorizes HTML from a URL source.
051     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
052     * time-limit is exceeded.
053     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
054     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
055     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
056     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
057     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
058     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
059     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
060     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
061     */
062    public static Vector<HTMLNode> getPageTokens(
063        long timeout, TimeUnit unit,
064        URL url, boolean eliminateHTMLTags
065    )
066        throws IOException, InterruptedException
067    {
068        return getPageTokens(timeout, unit, url, eliminateHTMLTags, null, null, null, null, null);
069    }
070    
071    /**
072     * Parses and Vectorizes HTML from a URL source.
073     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
074     * time-limit is exceeded.
075     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
076     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
077     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
078     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
079     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
080     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
081     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
082     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
083     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
084     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
085     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
086     */ 
087    public static Vector<HTMLNode> getPageTokens(
088        long timeout, TimeUnit unit,
089        URL url, boolean eliminateHTMLTags,
090        String startTag, String endTag
091    )
092        throws IOException, InterruptedException
093    {
094        return getPageTokens
095            (timeout, unit, url, eliminateHTMLTags, startTag, endTag, null, null, null);
096    }
097    
098    /**
099     * Parses and Vectorizes HTML from a URL source.
100     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
101     * time-limit is exceeded.
102     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
103     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
104     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
105     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
106     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
107     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
108     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
109     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
110     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
111     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
112     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
113     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
114     */
115    public static Vector<HTMLNode> getPageTokens(
116        long timeout, TimeUnit unit,
117        URL url, boolean eliminateHTMLTags,
118        int startLineNum, int endLineNum
119    )
120        throws IOException, InterruptedException
121    {
122        return getPageTokens
123            (timeout, unit, url, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
124    }
125
126    /**
127     * Parses and Vectorizes HTML from a URL source.
128     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
129     * time-limit is exceeded.
130     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
131     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
132     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
133     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
134     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
135     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
136     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
137     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
138     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
139     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
140     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
141     */
142    public static Vector<HTMLNode> getPageTokens(
143        long timeout, TimeUnit unit,
144        URL url, boolean eliminateHTMLTags,
145        String rawHTMLFile, String matchesFile, String justTextFile
146    )
147        throws IOException, InterruptedException
148    {
149        return getPageTokens(
150            timeout, unit, url, eliminateHTMLTags, null, null,
151            rawHTMLFile, matchesFile, justTextFile
152        );
153    }
154
155    // ***************************************************************************************************
156    // The next 6 functions that follow presume that the input is in the form of a Java.util.BufferedReader
157    // ***************************************************************************************************
158
159    /**
160     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
161     * Spawns a <I>monitor-thread</I> that stops the download if a 
162     * certain, user-specified, time-limit is exceeded.
163     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
164     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
165     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
166     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
167     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
168     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
169     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
170     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
171     */
172    public static Vector<HTMLNode> getPageTokens(
173        long timeout, TimeUnit unit,
174        BufferedReader br, boolean eliminateHTMLTags
175    )
176        throws IOException, InterruptedException
177    {
178        return getPageTokens
179            (timeout, unit, br, eliminateHTMLTags, null, null, null, null, null);
180    }
181
182    /**
183     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
184     * Spawns a <I>monitor-thread</I> that stops the download if a 
185     * certain, user-specified, time-limit is exceeded.
186     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
187     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
188     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
189     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
190     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
191     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
192     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
193     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
194     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
195     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
196     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
197     */ 
198    public static Vector<HTMLNode> getPageTokens(
199        long timeout, TimeUnit unit,
200        BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag
201    )
202        throws IOException, InterruptedException
203    {
204        return getPageTokens
205            (timeout, unit, br, eliminateHTMLTags, startTag, endTag, null, null, null);
206    }
207
208    /**
209     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
210     * Spawns a <I>monitor-thread</I> that stops the download if a 
211     * certain, user-specified, time-limit is exceeded.
212     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
213     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
214     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
215     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
216     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
217     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
218     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
219     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
220     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
221     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
222     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
223     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
224     */
225    public static Vector<HTMLNode> getPageTokens(
226        long timeout, TimeUnit unit,
227        BufferedReader br, boolean eliminateHTMLTags,
228        int startLineNum, int endLineNum
229    )
230        throws IOException, InterruptedException
231    {
232        return getPageTokens
233            (timeout, unit, br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
234    }
235
236    /**
237     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
238     * Spawns a <I>monitor-thread</I> that stops the download if a 
239     * certain, user-specified, time-limit is exceeded.
240     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
241     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
242     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
243     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
244     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
245     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
246     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
247     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
248     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
249     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
250     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
251     */
252    public static Vector<HTMLNode> getPageTokens(
253        long timeout, TimeUnit unit,
254        BufferedReader br, boolean eliminateHTMLTags,
255        String rawHTMLFile, String matchesFile, String justTextFile
256    )
257        throws IOException, InterruptedException
258    { 
259        return getPageTokens
260            (timeout, unit, br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
261    }
262
263    // ***************************************************************************************************
264    // * Receives a "pre-instantiated" BufferedReader for the HTML Source parameter
265    // ***************************************************************************************************
266
267    private static final ExecutorService    executor    = Executors.newCachedThreadPool();
268    private static final Lock               lock        = new ReentrantLock();
269
270    /**
271     * If this class has been used to make "multi-threaded" calls that use a Time-Out wait-period,
272     * you might see your Java-Program hang for a few seconds when you would expect it to exit back to your O.S. normally.
273     *
274     * <BR /><BR /><B>Max Wait Time</B> operates by building a "Timeout &amp; Monitor" thread, and therefore when a program you
275     * have written yourself reaches the end of its code, <I><B>if you have performed any Internet-Downloads using
276     * {@code class HTMLPageMWT}</B></I>, then your program <I>might not exit immediately,</I> but rather sit at the
277     * command-prompt for anywhere between 10 and 30 seconds before this Timeout-Thread, created in class HTMLPageMWT, dies.
278     *
279     * <BR /><BR /><B><SPAN STYLE="color: red">MULTI-THREADED:</B></SPAN> You may immediately terminate any additional
280     * threads that were started using this method.
281     */
282    public static void shutdownMWTThreads() { executor.shutdownNow(); }
283
284    /**
285     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
286     * Spawns a <I>monitor-thread</I> that stops the download if a 
287     * certain, user-specified, time-limit is exceeded.
288     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
289     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
290     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
291     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
292     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
293     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
294     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
295     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
296     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
297     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
298     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
299     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
300     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
301     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
302     */
303    public static Vector<HTMLNode> getPageTokens(
304        long timeout, TimeUnit unit,
305        BufferedReader br, boolean eliminateHTMLTags,
306        String startTag, String endTag,
307        String rawHTMLFile, String matchesFile, String justTextFile
308    )
309        throws IOException, InterruptedException
310    {
311        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
312        {
313            public Vector<HTMLNode> call() throws Exception
314            {
315                return parser.parse(
316                    Scrape.getHTML(br, startTag, endTag),
317                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
318                );
319            }
320        };
321
322        lock.lock();
323        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
324        lock.unlock();
325
326        try
327            { return future.get(timeout, unit); }
328        catch (TimeoutException e)
329            { return null; }
330        catch (ExecutionException e)
331        {
332            Throwable originalException = e.getCause();
333            if (originalException == null) throw new RejectedExecutionException(
334                "An Execution Exception was thrown, but it did provide a cause throwable " +
335                "(e.getCause() returned null).  See this exception's getCause() method to " +
336                "view the ExecutionException that has occurred.",
337                e
338            );
339
340            if (originalException instanceof IOException)
341                throw (IOException) originalException;
342
343            if (originalException instanceof RuntimeException)
344                throw (RuntimeException) originalException;
345
346            throw new RejectedExecutionException(
347                "An Execution Exception occurred, but it was neither a RuntimeException, " +
348                "nor IOException.  See this exception's getCause() method to view the " +
349                "underlying error that has occurred.", originalException
350            );
351        }
352    }
353
354    /**
355     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
356     * Spawns a <I>monitor-thread</I> that stops the download if a 
357     * certain, user-specified, time-limit is exceeded.
358     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
359     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
360     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
361     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
362     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
363     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
364     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
365     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
366     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
367     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
368     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
369     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
370     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
371     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
372     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
373     */
374    public static Vector<HTMLNode> getPageTokens(
375        long timeout, TimeUnit unit,
376        BufferedReader br, boolean eliminateHTMLTags,
377        int startLineNum, int endLineNum,
378        String rawHTMLFile, String matchesFile, String justTextFile
379    )
380        throws IOException, InterruptedException
381    {
382        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
383        {
384            public Vector<HTMLNode> call() throws Exception
385            {
386                return parser.parse(
387                    Scrape.getHTML(br, startLineNum, endLineNum),
388                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
389                );
390            }
391        };
392
393        lock.lock();
394        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
395        lock.unlock();
396
397        try
398            { return future.get(timeout, unit); }
399        catch (TimeoutException e)
400            { return null; }
401        catch (ExecutionException e)
402        {
403            Throwable originalException = e.getCause();
404
405            if (originalException == null) throw new RejectedExecutionException(
406                "An Execution Exception was thrown, but it did provide a cause throwable " +
407                "(e.getCause() returned null).  See this exception's getCause() method to " +
408                "view the ExecutionException has that occurred.",
409                e
410            );
411
412            if (originalException instanceof IOException)
413                throw (IOException) originalException;
414
415            if (originalException instanceof RuntimeException)
416                throw (RuntimeException) originalException;
417
418            throw new RejectedExecutionException(
419                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
420                "IOException.  See this exception's getCause() method to view the underlying " +
421                "error that has occurred.", originalException
422            );
423        }
424    }
425
426    // ***************************************************************************************************
427    // * Receives a java.net.URL for the HTML Source parameter, which could Timeout/Hang - so it must
428    // * be opened within the Multi-Threaded "Timeout" code (and therefore requires a second version of
429    // * these two methods - where Scrape.openConn(url) is *inside* the monitored downloading thread.
430    // ***************************************************************************************************
431    
432    /**
433     * Parses and Vectorizes HTML from a URL source.
434     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
435     * time-limit is exceeded.
436     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
437     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
438     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
439     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
440     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
441     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
442     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
443     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
444     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
445     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
446     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
447     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
448     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
449     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
450     */
451    public static Vector<HTMLNode> getPageTokens(
452        long timeout, TimeUnit unit,
453        URL url, boolean eliminateHTMLTags,
454        String startTag, String endTag,
455        String rawHTMLFile, String matchesFile, String justTextFile
456    )
457        throws IOException, InterruptedException
458    {
459        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
460        {
461            public Vector<HTMLNode> call() throws Exception
462            { 
463                return parser.parse(
464                    Scrape.getHTML(Scrape.openConn(url), startTag, endTag),
465                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
466                );
467            }
468        };
469
470        lock.lock();
471        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
472        lock.unlock();
473
474        try
475            { return future.get(timeout, unit); }
476        catch (TimeoutException e)
477            { return null; }
478        catch (ExecutionException e)
479        {
480            Throwable originalException = e.getCause();
481
482            if (originalException == null) throw new RejectedExecutionException(
483                "An Execution Exception was thrown, but it did provide a cause throwable " +
484                "(e.getCause() returned null).  See this exception's getCause() method to " +
485                "view the ExecutionException that has occurred.", e
486            );
487
488            if (originalException instanceof IOException)
489                throw (IOException) originalException;
490
491            if (originalException instanceof RuntimeException)
492                throw (RuntimeException) originalException;
493
494            throw new RejectedExecutionException(
495                "An Execution Exception occurred, but it was neither a RuntimeException, " +
496                "nor IOException.  See this exception's getCause() method to view the " +
497                "underlying error that has occurred.", originalException
498            );
499        }
500    }
501
502    
503    /**
504     * Parses and Vectorizes HTML from a URL source.
505     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
506     * time-limit is exceeded.
507     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_TIMEOUT>
508     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_UNIT>
509     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
510     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
511     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
512     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
513     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
514     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
515     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
516     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
517     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
518     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
519     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
520     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IEX>
521     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_REEX>
522     */
523    public static Vector<HTMLNode> getPageTokens(
524        long timeout, TimeUnit unit,
525        URL url, boolean eliminateHTMLTags,
526        int startLineNum, int endLineNum,
527        String rawHTMLFile, String matchesFile, String justTextFile
528    )
529        throws IOException, InterruptedException
530    {
531        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
532        {
533            public Vector<HTMLNode> call() throws Exception
534            { 
535                return parser.parse(
536                    Scrape.getHTML(Scrape.openConn(url), startLineNum, endLineNum),
537                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
538                );
539            }
540        };
541
542        lock.lock();
543        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
544        lock.unlock();
545
546        try
547            { return future.get(timeout, unit); }
548        catch (TimeoutException e)
549            { return null; }
550        catch (ExecutionException e)
551        {
552            Throwable originalException = e.getCause();
553    
554            if (originalException == null) throw new RejectedExecutionException(
555                "An Execution Exception was thrown, but it did provide a cause throwable " +
556                "(e.getCause() returned null).  See this exception's getCause() method to " +
557                "view the ExecutionException has that occurred.",
558                e
559            );
560
561            if (originalException instanceof IOException)
562                throw (IOException) originalException;
563
564            if (originalException instanceof RuntimeException)
565                throw (RuntimeException) originalException;
566
567            throw new RejectedExecutionException(
568                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
569                "IOException.  See this exception's getCause() method to view the underlying " +
570                "error that has occurred.", originalException
571            );
572        }
573    }
574}