001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006
007import Torello.HTML.Tools.JavaDoc.StaticFunctional;
008import Torello.HTML.Tools.JavaDoc.JDHeaderBackgroundImg;
009import Torello.HTML.Tools.JavaDoc.Excuse;
010
011/**
012 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
013 * of {@link HTMLNode}.
014 * 
015 * <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE>
016 * 
017 * @see Scrape#getHTML(BufferedReader, int, int)
018 * @see Scrape#getHTML(BufferedReader, String, String)
019 * @see HTMLPageMWT
020 */
021@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
022@JDHeaderBackgroundImg
023public class HTMLPage
024{
025    private HTMLPage() { }
026
027    /**
028     * A function-pointer / lambda-target that (could) potentially be used to replace this
029     * library's current regular-expression based parser with something possibly faster or even
030     * more efficient.
031     * 
032     * <BR /><BR /><EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_PARSER>
033     * @see #parser
034     */
035    @FunctionalInterface
036    public static interface Parser
037    {
038        /**
039         * Parse html source-text into a {@code Vector<HTMLNode>}.
040         * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
041         * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
042         * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
043         * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> If you have decided to implement a parser,
044         * and you wish to ingore this parameter (and don't want to output such a file) - <I>it is
045         * (hopefully) obvious that you may skip this step!</I>
046         * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
047         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
048         * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
049         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
050         * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
051         * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
052         */
053        public Vector<HTMLNode> parse(
054            CharSequence html, boolean eliminateHTMLTags,
055            String rawHTMLFile, String matchesFile, String justTextFile
056        ) throws IOException;
057    }
058
059    /**
060     * If needing to "swap a proprietary parser" comes up, this is possible.
061     * It just needs to accept the same parameters as the current parser, and produce a 
062     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
063     * parser has been tested and happens to be generating different results, it can be easily
064     * 'swapped out' for the one used now.
065     * @see Parser
066     * @see Parser#parse
067     */
068    public static Parser parser = Torello.HTML.parse.ParserRE::parsePageTokens;
069
070    // ***************************************************************************************************
071    // These 6 functions presume that the HTML source is from a URL
072    // ***************************************************************************************************
073    
074    /**
075     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
076     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
077     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
078     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
079     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
080     */
081    public static Vector<HTMLNode> getPageTokens
082        (URL url, boolean eliminateHTMLTags)
083        throws IOException
084    { return getPageTokens(url, eliminateHTMLTags, null, null, null, null, null); }
085    
086    /**
087     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
088     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
089     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
090     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
091     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
092     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
093     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
094     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
095     */       
096    public static Vector<HTMLNode> getPageTokens
097        (URL url, boolean eliminateHTMLTags,
098        String startTag, String endTag)
099        throws IOException
100    { return getPageTokens(url, eliminateHTMLTags, startTag, endTag, null, null, null); }
101    
102    /**
103     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
104     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
105     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
106     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
107     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
108     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
109     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
110     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
111     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
112     */
113    public static Vector<HTMLNode> getPageTokens
114        (URL url, boolean eliminateHTMLTags,
115        int startLineNum, int endLineNum)
116        throws IOException
117    { return getPageTokens(url, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
118
119    /**
120     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
121     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
122     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
123     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
124     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
125     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
126     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
127     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
128     */
129    public static Vector<HTMLNode> getPageTokens
130        (URL url, boolean eliminateHTMLTags,
131        String rawHTMLFile, String matchesFile, String justTextFile)
132        throws IOException
133    { return getPageTokens(url, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile); }
134    
135    /**
136     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
137     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
138     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
139     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
140     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
141     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
142     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
143     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
144     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
145     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
146     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
147     */
148    public static Vector<HTMLNode> getPageTokens
149        (URL url, boolean eliminateHTMLTags,
150        String startTag, String endTag,
151        String rawHTMLFile, String matchesFile, String justTextFile)
152        throws IOException
153    {
154        return getPageTokens(
155            Scrape.openConn(url), eliminateHTMLTags, startTag, endTag,
156            rawHTMLFile, matchesFile, justTextFile
157        );
158    }
159    
160    /**
161     * Parses and Vectorizes HTML from a {@code java.net.URL} source.
162     * @param url <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_URL>
163     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
164     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
165     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
166     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
167     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
168     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
169     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
170     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
171     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
172     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
173     */
174    public static Vector<HTMLNode> getPageTokens
175        (URL url, boolean eliminateHTMLTags,
176        int startLineNum, int endLineNum,
177        String rawHTMLFile, String matchesFile, String justTextFile)
178        throws IOException
179    {
180        return getPageTokens(
181            Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum,
182            rawHTMLFile, matchesFile, justTextFile
183        );
184    }
185
186    // ********************************************************************************************
187    // These 6 functions presume that the HTML source is from a ready made "CharSequence"
188    // (StringBuffer, Builder or String itself)
189    // ********************************************************************************************
190    
191    /**
192     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
193     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
194     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
195     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
196     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
197     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
198     * method that neither invokes the file-system, nor the web.
199     */
200    public static Vector<HTMLNode> getPageTokens
201        (CharSequence html, boolean eliminateHTMLTags) // NO IOException... NO I/O!
202    {
203        try
204            { return parser.parse(html, eliminateHTMLTags, null, null, null); }
205        catch (IOException e) 
206            { throw new LinkageError("String Parse has thrown IOException.  See getCause() for details.", e); }
207    }
208
209    /**
210     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
211     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
212     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
213     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
214     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
215     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
216     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
217     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
218     * method that neither invokes the file-system, nor the web.
219     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
220     */
221    public static Vector<HTMLNode> getPageTokens
222        (CharSequence html, boolean eliminateHTMLTags,
223        String startTag, String endTag)
224    // NO IOException... NO I/O!
225    {
226        try
227            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }
228        catch (IOException e)
229        {
230            throw new LinkageError(
231                "Something has thrown an IOException.  This should not be possible here.  " +
232                "See getCause() for details.",
233                e
234            );
235        }
236        // This should never happen, when reading from a 'String' rather than a URL, or
237        // BufferedReader, IOException cannot happen.
238    }
239    
240    /**
241     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
242     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
243     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
244     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
245     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
246     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
247     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
248     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
249     * method that neither invokes the file-system, nor the web.
250     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
251     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
252     */
253    public static Vector<HTMLNode> getPageTokens
254        (CharSequence html, boolean eliminateHTMLTags,
255        int startLineNum, int endLineNum)
256    // NO IOException... NO I/O!
257    {
258        try
259        { 
260            return getPageTokens
261                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
262        }
263        catch (IOException e)
264        { 
265            throw new LinkageError(
266                "String Parse has thrown IOException.  This should not be possible here.  " +
267                "See getCause() for details.", e);
268        }
269        // This should never happen, when reading from a 'String' rather than a URL, or
270        // BufferedReader, IOException cannot happen.
271    }
272
273    /**
274     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
275     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
276     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
277     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
278     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
279     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
280     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
281     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
282     */
283    public static Vector<HTMLNode> getPageTokens
284        (CharSequence html, boolean eliminateHTMLTags,
285        String rawHTMLFile, String matchesFile, String justTextFile)
286        throws IOException
287    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }
288
289    /**
290     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
291     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
292     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
293     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
294     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
295     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
296     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
297     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
298     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
299     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
300     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
301     */
302    public static Vector<HTMLNode> getPageTokens
303        (CharSequence html, boolean eliminateHTMLTags,
304        String startTag, String endTag,
305        String rawHTMLFile, String matchesFile, String justTextFile) 
306        throws IOException
307    {
308        String  htmlStr = html.toString();
309
310        int     sPos    = htmlStr.indexOf(startTag);
311        if (sPos == -1) throw new IllegalArgumentException
312            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
313
314        int     ePos    = htmlStr.indexOf(endTag, sPos);
315        if (ePos == -1) throw new IllegalArgumentException
316            ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");
317
318        ePos += endTag.length();
319
320        return parser.parse(
321            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
322            rawHTMLFile, matchesFile, justTextFile
323        );
324    }
325    
326    /**
327     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
328     * @param html <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_HTML>
329     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
330     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
331     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
332     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
333     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
334     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
335     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
336     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
337     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
338     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
339     */
340    public static Vector<HTMLNode> getPageTokens
341        (CharSequence html, boolean eliminateHTMLTags,
342        int startLineNum, int endLineNum,
343        String rawHTMLFile, String matchesFile, String justTextFile) 
344        throws IOException
345    {
346        return getPageTokens(
347            new BufferedReader(new StringReader(html.toString())),
348            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
349        );
350    }
351
352    // ***************************************************************************************************
353    // The next 6 functions that follow presume that the input is in the form of a Java.util.BufferedReader
354    // ***************************************************************************************************
355
356    /**
357     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
358     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
359     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
360     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
361     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
362     */
363    public static Vector<HTMLNode> getPageTokens
364        (BufferedReader br, boolean eliminateHTMLTags)
365        throws IOException
366    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
367
368    /**
369     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
370     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
371     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
372     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
373     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
374     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
375     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
376     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
377     */ 
378    public static Vector<HTMLNode> getPageTokens
379        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
380        throws IOException
381    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
382
383    /**
384     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
385     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
386     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
387     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
388     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
389     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
390     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
391     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
392     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
393     */
394    public static Vector<HTMLNode> getPageTokens
395        (BufferedReader br, boolean eliminateHTMLTags,
396        int startLineNum, int endLineNum)
397        throws IOException
398    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
399
400
401    /**
402     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
403     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
404     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
405     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
406     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
407     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
408     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
409     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
410     */
411    public static Vector<HTMLNode> getPageTokens(
412        BufferedReader br, boolean eliminateHTMLTags,
413        String rawHTMLFile, String matchesFile, String justTextFile
414    )
415        throws IOException
416    {
417        return getPageTokens
418            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
419    }
420
421    // ********************************************************************************************
422    // ********************************************************************************************
423    // ********************************************************************************************
424
425    /**
426     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
427     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
428     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
429     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_TAG>
430     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_TAG>
431     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
432     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
433     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
434     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
435     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
436     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX2>
437     */
438    public static Vector<HTMLNode> getPageTokens
439        (BufferedReader br, boolean eliminateHTMLTags,
440        String startTag, String endTag,
441        String rawHTMLFile, String matchesFile, String justTextFile)
442        throws IOException
443    {
444        return parser.parse(
445            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
446            matchesFile, justTextFile
447        );
448    }
449
450    /**
451     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
452     * @param br <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_BR>
453     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_ELIM_HT>
454     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_END_LN>
455     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_START_LN>
456     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RAW_HTML>
457     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_MATCHES_F>
458     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
459     * @return <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_RETURN>
460     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IOEX>
461     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_IAEX>
462     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID=HTML_PAGE_SCEX1>
463     */
464    public static Vector<HTMLNode> getPageTokens
465        (BufferedReader br, boolean eliminateHTMLTags,
466        int startLineNum, int endLineNum,
467        String rawHTMLFile, String matchesFile, String justTextFile)
468        throws IOException
469    {
470        return parser.parse(
471            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
472            rawHTMLFile, matchesFile, justTextFile
473        );
474    }
475}