Source code

001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006
007import Torello.JavaDoc.Annotations.StaticFunctional;
008import Torello.JavaDoc.Annotations.StaticFunctional.Excuse;
009
010import Torello.Java.UnreachableError;
011
012/**
013 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
014 * of {@link HTMLNode}.
015 * 
016 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
017 * 
018 * @see Scrape#getHTML(BufferedReader, int, int)
019 * @see Scrape#getHTML(BufferedReader, String, String)
020 * @see HTMLPageMWT
021 */
022@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
023@Torello.JavaDoc.Annotations.JDHeaderBackgroundImg
024public class HTMLPage
025{
026    private HTMLPage() { }
027
028    /**
029     * A function-pointer / lambda-target that (could) potentially be used to replace this
030     * library's current regular-expression based parser with something possibly faster or even
031     * more efficient.
032     * 
033     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
034     * @see #parser
035     */
036    @FunctionalInterface
037    public static interface Parser
038    {
039        /**
040         * Parse html source-text into a {@code Vector<HTMLNode>}.
041         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
042         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
043         * 
044         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
045         * 
046         * <BR /><BR /><DIV CLASS=JDHint> If you have decided to implement a parser, and you wish
047         * to ingore this parameter (and don't want to output such a file) - it is (hopefully)
048         * obvious that you may skip this step!</DIV>
049         * 
050         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
051         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
052         * 
053         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
054         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
055         * 
056         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
057         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
058         */
059        public Vector<HTMLNode> parse(
060                CharSequence    html,
061                boolean         eliminateHTMLTags,
062                String          rawHTMLFile,
063                String          matchesFile,
064                String          justTextFile
065            )
066        throws IOException;
067    }
068
    /**
     * The {@link Parser} that every {@code getPageTokens} method of this class ultimately invokes.
     * It is initialized to {@code ParserRE::parsePageTokens}.
     * 
     * <BR /><BR />If needing to "swap a proprietary parser" comes up, this is possible.
     * A replacement just needs to accept the same parameters as the current parser, and produce a 
     * {@code Vector<HTMLNode>}.  This is not an advised step to take, but if an alternative
     * parser has been tested and happens to be generating different results, it can be easily
     * 'swapped out' for the one used now.
     * 
     * <BR /><BR />This field is public, static and non-final: assigning to it changes the parser
     * used by all subsequent calls made through this class.
     * 
     * @see Parser
     * @see Parser#parse
     */
    public static Parser parser = ParserRE::parsePageTokens;
079
080
081    // ********************************************************************************************
082    // ********************************************************************************************
083    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
084    // ********************************************************************************************
085    // ********************************************************************************************
086
087
088    /**
089     * Convenience Method.
090     * <BR />Accepts: {@code URL}
091     * <BR />Passes null to parameters
092     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
093     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
094     *      String, String, String, String, String)}
095     * <BR />And Invokes: {@link Scrape#openConn(URL)}
096     */
097    public static Vector<HTMLNode> getPageTokens
098        (URL url, boolean eliminateHTMLTags)
099        throws IOException
100    {
101        return getPageTokens
102            (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null);
103    }
104    
105    /**
106     * Convenience Method.
107     * <BR />Accepts: {@code URL}
108     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
109     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
110     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
111     *      String, String, String, String, String)}
112     * <BR />And Invokes: {@link Scrape#openConn(URL)}
113     */       
114    public static Vector<HTMLNode> getPageTokens
115        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
116        throws IOException
117    {
118        return getPageTokens
119            (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null);
120    }
121    
122    /**
123     * Convenience Method.
124     * <BR />Accepts: {@code URL}
125     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
126     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
127     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
128     *      int, int, String, String, String)}
129     * <BR />And Invokes: {@link Scrape#openConn(URL)}
130     */
131    public static Vector<HTMLNode> getPageTokens
132        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
133        throws IOException
134    {
135        return getPageTokens
136            (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
137    }
138
139    /**
140     * Convenience Method.
141     * <BR />Accepts: {@code URL}
142     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
143     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
144     *      String, String, String, String, String)}
145     * <BR />And Invokes: {@link Scrape#openConn(URL)}
146     */
147    public static Vector<HTMLNode> getPageTokens(
148            URL url, boolean eliminateHTMLTags,
149            String rawHTMLFile, String matchesFile, String justTextFile
150        )
151        throws IOException
152    {
153        return getPageTokens(
154            Scrape.openConn(url), eliminateHTMLTags,
155            null, null,
156            rawHTMLFile, matchesFile, justTextFile
157        );
158    }
159    
160    /**
161     * Convenience Method.
162     * <BR />Accepts: {@code URL}
163     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
164     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
165     *      String, String, String, String, String)}
166     * <BR />And Invokes: {@link Scrape#openConn(URL)}
167     */
168    public static Vector<HTMLNode> getPageTokens(
169            URL url, boolean eliminateHTMLTags,
170            String startTag, String endTag,
171            String rawHTMLFile, String matchesFile, String justTextFile
172        )
173        throws IOException
174    {
175        return getPageTokens(
176            Scrape.openConn(url), eliminateHTMLTags,
177            startTag, endTag,
178            rawHTMLFile, matchesFile, justTextFile
179        );
180    }
181    
182    /**
183     * Convenience Method.
184     * <BR />Accepts: {@code URL}
185     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
186     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
187     *      int, int, String, String, String)}
188     * <BR />And Invokes: {@link Scrape#openConn(URL)}
189     */
190    public static Vector<HTMLNode> getPageTokens(
191            URL url, boolean eliminateHTMLTags,
192            int startLineNum, int endLineNum,
193            String rawHTMLFile, String matchesFile, String justTextFile
194        )
195        throws IOException
196    {
197        return getPageTokens(
198            Scrape.openConn(url), eliminateHTMLTags,
199            startLineNum, endLineNum,
200            rawHTMLFile, matchesFile, justTextFile
201        );
202    }
203
204
205    // ********************************************************************************************
206    // ********************************************************************************************
207    // These 6 functions presume that the HTML source is from a CharSequence
208    // ********************************************************************************************
209    // ********************************************************************************************
210
211
212    /**
213     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
214     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
215     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
216     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
217     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
218     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
219     * method that neither invokes the file-system, nor the web.
220     */
221    public static Vector<HTMLNode> getPageTokens
222        (CharSequence html, boolean eliminateHTMLTags)
223        // NO IOException... NO I/O!
224    {
225        try
226            { return parser.parse(html, eliminateHTMLTags, null, null, null); }
227
228        // This should never happen, when reading from a 'String' rather than a URL, or
229        // BufferedReader ==> IOException will not be thrown.
230
231        catch (IOException ioe)
232            { throw new UnreachableError(ioe); }
233    }
234
235    /**
236     * Convenience Method.
237     * <BR />Accepts: {@code CharSequence}
238     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
239     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
240     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
241     *      String, String, String, String, String)}
242     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
243     * possible!
244     */
245    public static Vector<HTMLNode> getPageTokens
246        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
247    // NO IOException... NO I/O!
248    {
249        try
250            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }
251
252        // This should never happen, when reading from a 'String' rather than a URL, or
253        // BufferedReader ==> IOException will not be thrown.
254
255        catch (IOException ioe)
256            { throw new UnreachableError(ioe); }
257    }
258    
259    /**
260     * Convenience Method.
261     * <BR />Accepts: {@code CharSequence}
262     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
263     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
264     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
265     *      int, int, String, String, String)}
266     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
267     * possible!
268     */
269    public static Vector<HTMLNode> getPageTokens
270        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
271        // NO IOException... NO I/O!
272    {
273        try
274        { 
275            return getPageTokens
276                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
277        }
278
279        // This should never happen, when reading from a 'String' rather than a URL, or
280        // BufferedReader ==> IOException will not be thrown.
281
282        catch (IOException ioe)
283            { throw new UnreachableError(ioe); }
284    }
285
286    /**
287     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
288     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
289     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
290     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
291     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
292     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
293     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
294     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
295     */
296    public static Vector<HTMLNode> getPageTokens(
297            CharSequence html, boolean eliminateHTMLTags,
298            String rawHTMLFile, String matchesFile, String justTextFile
299        )
300        throws IOException
301    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }
302
303    /**
304     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
305     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
306     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
307     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
308     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
309     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
310     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
311     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
312     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
313     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
314     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
315     */
316    public static Vector<HTMLNode> getPageTokens(
317            CharSequence html, boolean eliminateHTMLTags,
318            String startTag, String endTag,
319            String rawHTMLFile, String matchesFile, String justTextFile
320        )
321        throws IOException
322    {
323        String  htmlStr = html.toString();
324
325        int sPos = htmlStr.indexOf(startTag);
326
327        if (sPos == -1) throw new IllegalArgumentException
328            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
329
330        int ePos = htmlStr.indexOf(endTag, sPos);
331
332        if (ePos == -1) throw new IllegalArgumentException
333            ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");
334
335        ePos += endTag.length();
336
337        return parser.parse(
338            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
339            rawHTMLFile, matchesFile, justTextFile
340        );
341    }
342    
343    /**
344     * Convenience Method.
345     * <BR />Accepts: {@code CharSequence}
346     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
347     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
348     *      int, int, String, String, String)}
349     */
350    public static Vector<HTMLNode> getPageTokens(
351            CharSequence html, boolean eliminateHTMLTags,
352            int startLineNum, int endLineNum,
353            String rawHTMLFile, String matchesFile, String justTextFile
354        ) 
355        throws IOException
356    {
357        return getPageTokens(
358            new BufferedReader(new StringReader(html.toString())),
359            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
360        );
361    }
362
363
364    // ********************************************************************************************
365    // ********************************************************************************************
366    // The next 6 functions presume that the input is from a BufferedReader
367    // ********************************************************************************************
368    // ********************************************************************************************
369
370
371    /**
372     * Convenience Method.
373     * <BR />Accepts: {@code BufferedReader}
374     * <BR />Passes null to parameters
375     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
376     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
377     *      String, String, String, String, String)}
378     */
379    public static Vector<HTMLNode> getPageTokens
380        (BufferedReader br, boolean eliminateHTMLTags)
381        throws IOException
382    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
383
384    /**
385     * Convenience Method.
386     * <BR />Accepts: {@code BufferedReader}
387     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
388     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
389     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
390     *      String, String, String, String, String)}
391     */ 
392    public static Vector<HTMLNode> getPageTokens
393        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
394        throws IOException
395    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
396
397    /**
398     * Convenience Method.
399     * <BR />Accepts: {@code BufferedReader}
400     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
401     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
402     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
403     *      int, int, String, String, String)}
404     */
405    public static Vector<HTMLNode> getPageTokens
406        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
407        throws IOException
408    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
409
410
411    /**
412     * Convenience Method.
413     * <BR />Accepts: {@code BufferedReader}
414     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
415     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
416     *      String, String, String, String, String)}
417     */
418    public static Vector<HTMLNode> getPageTokens(
419            BufferedReader br, boolean eliminateHTMLTags,
420            String rawHTMLFile, String matchesFile, String justTextFile
421        )
422        throws IOException
423    {
424        return getPageTokens
425            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
426    }
427
428
429    // ********************************************************************************************
430    // ********************************************************************************************
    // These 2 BufferedReader functions are the ones the URL & BufferedReader overloads invoke
432    // ********************************************************************************************
433    // ********************************************************************************************
434
435
436    /**
437     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
438     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
439     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
440     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
441     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
442     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
443     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
444     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
445     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
446     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
447     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
448     */
449    public static Vector<HTMLNode> getPageTokens(
450            BufferedReader br, boolean eliminateHTMLTags,
451            String startTag, String endTag,
452            String rawHTMLFile, String matchesFile, String justTextFile
453        )
454        throws IOException
455    {
456        return parser.parse(
457            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
458            matchesFile, justTextFile
459        );
460    }
461
462    /**
463     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
464     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
465     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
466     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
467     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
468     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
469     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
470     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
471     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
472     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
473     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
474     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
475     */
476    public static Vector<HTMLNode> getPageTokens(
477            BufferedReader br, boolean eliminateHTMLTags,
478            int startLineNum, int endLineNum,
479            String rawHTMLFile, String matchesFile, String justTextFile
480        )
481        throws IOException
482    {
483        return parser.parse(
484            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
485            rawHTMLFile, matchesFile, justTextFile
486        );
487    }
488}