Source code

001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006
007import Torello.JavaDoc.StaticFunctional;
008import Torello.JavaDoc.JDHeaderBackgroundImg;
009import Torello.JavaDoc.Excuse;
010import Torello.Java.UnreachableError;
011
012/**
013 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
014 * of {@link HTMLNode}.
015 * 
016 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
017 * 
018 * @see Scrape#getHTML(BufferedReader, int, int)
019 * @see Scrape#getHTML(BufferedReader, String, String)
020 * @see HTMLPageMWT
021 */
022@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
023@JDHeaderBackgroundImg
024public class HTMLPage
025{
026    private HTMLPage() { }
027
028    /**
029     * A function-pointer / lambda-target that (could) potentially be used to replace this
030     * library's current regular-expression based parser with something possibly faster or even
031     * more efficient.
032     * 
033     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
034     * @see #parser
035     */
036    @FunctionalInterface
037    public static interface Parser
038    {
039        /**
040         * Parse html source-text into a {@code Vector<HTMLNode>}.
041         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
042         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
043         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
044         * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> If you have decided to implement a parser,
045         * and you wish to ingore this parameter (and don't want to output such a file) - <I>it is
046         * (hopefully) obvious that you may skip this step!</I>
047         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
048         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
049         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
050         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
051         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
052         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
053         */
054        public Vector<HTMLNode> parse(
055                CharSequence    html,
056                boolean         eliminateHTMLTags,
057                String          rawHTMLFile,
058                String          matchesFile,
059                String          justTextFile
060            )
061        throws IOException;
062    }
063
064    /**
065     * If needing to "swap a proprietary parser" comes up, this is possible.
066     * It just needs to accept the same parameters as the current parser, and produce a 
067     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
068     * parser has been tested and happens to be generating different results, it can be easily
069     * 'swapped out' for the one used now.
070     * @see Parser
071     * @see Parser#parse
072     */
073    public static Parser parser = Torello.HTML.HelperPackages.parse.ParserRE::parsePageTokens;
074
075
076    // ********************************************************************************************
077    // ********************************************************************************************
078    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
079    // ********************************************************************************************
080    // ********************************************************************************************
081
082
083    /**
084     * Convenience Method.
085     * <BR />Accepts: {@code URL}
086     * <BR />Passes null to parameters
087     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
088     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
089     *      String, String, String, String, String)}
090     * <BR />And Invokes: {@link Scrape#openConn(URL)}
091     */
092    public static Vector<HTMLNode> getPageTokens
093        (URL url, boolean eliminateHTMLTags)
094        throws IOException
095    {
096        return getPageTokens
097            (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null);
098    }
099    
100    /**
101     * Convenience Method.
102     * <BR />Accepts: {@code URL}
103     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
104     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
105     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
106     *      String, String, String, String, String)}
107     * <BR />And Invokes: {@link Scrape#openConn(URL)}
108     */       
109    public static Vector<HTMLNode> getPageTokens
110        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
111        throws IOException
112    {
113        return getPageTokens
114            (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null);
115    }
116    
117    /**
118     * Convenience Method.
119     * <BR />Accepts: {@code URL}
120     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
121     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
122     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
123     *      int, int, String, String, String)}
124     * <BR />And Invokes: {@link Scrape#openConn(URL)}
125     */
126    public static Vector<HTMLNode> getPageTokens
127        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
128        throws IOException
129    {
130        return getPageTokens
131            (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
132    }
133
134    /**
135     * Convenience Method.
136     * <BR />Accepts: {@code URL}
137     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
138     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
139     *      String, String, String, String, String)}
140     * <BR />And Invokes: {@link Scrape#openConn(URL)}
141     */
142    public static Vector<HTMLNode> getPageTokens(
143            URL url, boolean eliminateHTMLTags,
144            String rawHTMLFile, String matchesFile, String justTextFile
145        )
146        throws IOException
147    {
148        return getPageTokens(
149            Scrape.openConn(url), eliminateHTMLTags,
150            null, null,
151            rawHTMLFile, matchesFile, justTextFile
152        );
153    }
154    
155    /**
156     * Convenience Method.
157     * <BR />Accepts: {@code URL}
158     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
159     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
160     *      String, String, String, String, String)}
161     * <BR />And Invokes: {@link Scrape#openConn(URL)}
162     */
163    public static Vector<HTMLNode> getPageTokens(
164            URL url, boolean eliminateHTMLTags,
165            String startTag, String endTag,
166            String rawHTMLFile, String matchesFile, String justTextFile
167        )
168        throws IOException
169    {
170        return getPageTokens(
171            Scrape.openConn(url), eliminateHTMLTags,
172            startTag, endTag,
173            rawHTMLFile, matchesFile, justTextFile
174        );
175    }
176    
177    /**
178     * Convenience Method.
179     * <BR />Accepts: {@code URL}
180     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
181     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
182     *      int, int, String, String, String)}
183     * <BR />And Invokes: {@link Scrape#openConn(URL)}
184     */
185    public static Vector<HTMLNode> getPageTokens(
186            URL url, boolean eliminateHTMLTags,
187            int startLineNum, int endLineNum,
188            String rawHTMLFile, String matchesFile, String justTextFile
189        )
190        throws IOException
191    {
192        return getPageTokens(
193            Scrape.openConn(url), eliminateHTMLTags,
194            startLineNum, endLineNum,
195            rawHTMLFile, matchesFile, justTextFile
196        );
197    }
198
199
200    // ********************************************************************************************
201    // ********************************************************************************************
202    // These 6 functions presume that the HTML source is from a CharSequence
203    // ********************************************************************************************
204    // ********************************************************************************************
205
206
207    /**
208     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
209     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
210     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
211     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
212     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
213     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
214     * method that neither invokes the file-system, nor the web.
215     */
216    public static Vector<HTMLNode> getPageTokens
217        (CharSequence html, boolean eliminateHTMLTags)
218        // NO IOException... NO I/O!
219    {
220        try
221            { return parser.parse(html, eliminateHTMLTags, null, null, null); }
222
223        // This should never happen, when reading from a 'String' rather than a URL, or
224        // BufferedReader ==> IOException will not be thrown.
225
226        catch (IOException ioe)
227            { throw new UnreachableError(ioe); }
228    }
229
230    /**
231     * Convenience Method.
232     * <BR />Accepts: {@code CharSequence}
233     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
234     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
235     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
236     *      String, String, String, String, String)}
237     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
238     * possible!
239     */
240    public static Vector<HTMLNode> getPageTokens
241        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
242    // NO IOException... NO I/O!
243    {
244        try
245            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }
246
247        // This should never happen, when reading from a 'String' rather than a URL, or
248        // BufferedReader ==> IOException will not be thrown.
249
250        catch (IOException ioe)
251            { throw new UnreachableError(ioe); }
252    }
253    
254    /**
255     * Convenience Method.
256     * <BR />Accepts: {@code CharSequence}
257     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
258     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
259     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
260     *      int, int, String, String, String)}
261     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
262     * possible!
263     */
264    public static Vector<HTMLNode> getPageTokens
265        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
266        // NO IOException... NO I/O!
267    {
268        try
269        { 
270            return getPageTokens
271                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
272        }
273
274        // This should never happen, when reading from a 'String' rather than a URL, or
275        // BufferedReader ==> IOException will not be thrown.
276
277        catch (IOException ioe)
278            { throw new UnreachableError(ioe); }
279    }
280
281    /**
282     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
283     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
284     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
285     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
286     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
287     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
288     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
289     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
290     */
291    public static Vector<HTMLNode> getPageTokens(
292            CharSequence html, boolean eliminateHTMLTags,
293            String rawHTMLFile, String matchesFile, String justTextFile
294        )
295        throws IOException
296    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }
297
298    /**
299     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
300     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
301     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
302     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
303     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
304     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
305     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
306     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
307     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
308     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
309     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
310     */
311    public static Vector<HTMLNode> getPageTokens(
312            CharSequence html, boolean eliminateHTMLTags,
313            String startTag, String endTag,
314            String rawHTMLFile, String matchesFile, String justTextFile
315        )
316        throws IOException
317    {
318        String  htmlStr = html.toString();
319
320        int sPos = htmlStr.indexOf(startTag);
321
322        if (sPos == -1) throw new IllegalArgumentException
323            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
324
325        int ePos = htmlStr.indexOf(endTag, sPos);
326
327        if (ePos == -1) throw new IllegalArgumentException
328            ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");
329
330        ePos += endTag.length();
331
332        return parser.parse(
333            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
334            rawHTMLFile, matchesFile, justTextFile
335        );
336    }
337    
338    /**
339     * Convenience Method.
340     * <BR />Accepts: {@code CharSequence}
341     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
342     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
343     *      int, int, String, String, String)}
344     */
345    public static Vector<HTMLNode> getPageTokens(
346            CharSequence html, boolean eliminateHTMLTags,
347            int startLineNum, int endLineNum,
348            String rawHTMLFile, String matchesFile, String justTextFile
349        ) 
350        throws IOException
351    {
352        return getPageTokens(
353            new BufferedReader(new StringReader(html.toString())),
354            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
355        );
356    }
357
358
359    // ********************************************************************************************
360    // ********************************************************************************************
361    // The next 6 functions presume that the input is from a BufferedReader
362    // ********************************************************************************************
363    // ********************************************************************************************
364
365
366    /**
367     * Convenience Method.
368     * <BR />Accepts: {@code BufferedReader}
369     * <BR />Passes null to parameters
370     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
371     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
372     *      String, String, String, String, String)}
373     */
374    public static Vector<HTMLNode> getPageTokens
375        (BufferedReader br, boolean eliminateHTMLTags)
376        throws IOException
377    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
378
379    /**
380     * Convenience Method.
381     * <BR />Accepts: {@code BufferedReader}
382     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
383     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
384     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
385     *      String, String, String, String, String)}
386     */ 
387    public static Vector<HTMLNode> getPageTokens
388        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
389        throws IOException
390    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
391
392    /**
393     * Convenience Method.
394     * <BR />Accepts: {@code BufferedReader}
395     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
396     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
397     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
398     *      int, int, String, String, String)}
399     */
400    public static Vector<HTMLNode> getPageTokens
401        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
402        throws IOException
403    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
404
405
406    /**
407     * Convenience Method.
408     * <BR />Accepts: {@code BufferedReader}
409     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
410     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
411     *      String, String, String, String, String)}
412     */
413    public static Vector<HTMLNode> getPageTokens(
414            BufferedReader br, boolean eliminateHTMLTags,
415            String rawHTMLFile, String matchesFile, String justTextFile
416        )
417        throws IOException
418    {
419        return getPageTokens
420            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
421    }
422
423
424    // ********************************************************************************************
425    // ********************************************************************************************
426    // The 2 primary BufferedReader methods; the URL & BufferedReader overloads delegate here
427    // ********************************************************************************************
428    // ********************************************************************************************
429
430
431    /**
432     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
433     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
434     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
435     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
436     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
437     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
438     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
439     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
440     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
441     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
442     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
443     */
444    public static Vector<HTMLNode> getPageTokens(
445            BufferedReader br, boolean eliminateHTMLTags,
446            String startTag, String endTag,
447            String rawHTMLFile, String matchesFile, String justTextFile
448        )
449        throws IOException
450    {
451        return parser.parse(
452            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
453            matchesFile, justTextFile
454        );
455    }
456
457    /**
458     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
459     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
460     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
461     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
462     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
463     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
464     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
465     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
466     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
467     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
468     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
469     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
470     */
471    public static Vector<HTMLNode> getPageTokens(
472            BufferedReader br, boolean eliminateHTMLTags,
473            int startLineNum, int endLineNum,
474            String rawHTMLFile, String matchesFile, String justTextFile
475        )
476        throws IOException
477    {
478        return parser.parse(
479            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
480            rawHTMLFile, matchesFile, justTextFile
481        );
482    }
483}