001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006
007import Torello.JavaDoc.Excuse;
008import Torello.Java.UnreachableError;
009
010/**
011 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
012 * of {@link HTMLNode}.
013 * 
014 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
015 * 
016 * @see Scrape#getHTML(BufferedReader, int, int)
017 * @see Scrape#getHTML(BufferedReader, String, String)
018 * @see HTMLPageMWT
019 */
020@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
021@Torello.JavaDoc.JDHeaderBackgroundImg
022public class HTMLPage
023{
024    private HTMLPage() { }
025
026    /**
027     * A function-pointer / lambda-target that (could) potentially be used to replace this
028     * library's current regular-expression based parser with something possibly faster or even
029     * more efficient.
030     * 
031     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
032     * @see #parser
033     */
034    @FunctionalInterface
035    public static interface Parser
036    {
037        /**
038         * Parse html source-text into a {@code Vector<HTMLNode>}.
039         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
040         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
041         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
042         * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> If you have decided to implement a parser,
043         * and you wish to ingore this parameter (and don't want to output such a file) - <I>it is
044         * (hopefully) obvious that you may skip this step!</I>
045         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
046         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
047         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
048         * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this.
049         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
050         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
051         */
052        public Vector<HTMLNode> parse(
053                CharSequence    html,
054                boolean         eliminateHTMLTags,
055                String          rawHTMLFile,
056                String          matchesFile,
057                String          justTextFile
058            )
059        throws IOException;
060    }
061
062    /**
063     * If needing to "swap a proprietary parser" comes up, this is possible.
064     * It just needs to accept the same parameters as the current parser, and produce a 
065     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
066     * parser has been tested and happens to be generating different results, it can be easily
067     * 'swapped out' for the one used now.
068     * @see Parser
069     * @see Parser#parse
070     */
071    public static Parser parser = ParserRE::parsePageTokens;
072
073
074    // ********************************************************************************************
075    // ********************************************************************************************
076    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
077    // ********************************************************************************************
078    // ********************************************************************************************
079
080
081    /**
082     * Convenience Method.
083     * <BR />Accepts: {@code URL}
084     * <BR />Passes null to parameters
085     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
086     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
087     *      String, String, String, String, String)}
088     * <BR />And Invokes: {@link Scrape#openConn(URL)}
089     */
090    public static Vector<HTMLNode> getPageTokens
091        (URL url, boolean eliminateHTMLTags)
092        throws IOException
093    {
094        return getPageTokens
095            (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null);
096    }
097    
098    /**
099     * Convenience Method.
100     * <BR />Accepts: {@code URL}
101     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
102     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
103     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
104     *      String, String, String, String, String)}
105     * <BR />And Invokes: {@link Scrape#openConn(URL)}
106     */       
107    public static Vector<HTMLNode> getPageTokens
108        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
109        throws IOException
110    {
111        return getPageTokens
112            (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null);
113    }
114    
115    /**
116     * Convenience Method.
117     * <BR />Accepts: {@code URL}
118     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
119     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
120     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
121     *      int, int, String, String, String)}
122     * <BR />And Invokes: {@link Scrape#openConn(URL)}
123     */
124    public static Vector<HTMLNode> getPageTokens
125        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
126        throws IOException
127    {
128        return getPageTokens
129            (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
130    }
131
132    /**
133     * Convenience Method.
134     * <BR />Accepts: {@code URL}
135     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
136     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
137     *      String, String, String, String, String)}
138     * <BR />And Invokes: {@link Scrape#openConn(URL)}
139     */
140    public static Vector<HTMLNode> getPageTokens(
141            URL url, boolean eliminateHTMLTags,
142            String rawHTMLFile, String matchesFile, String justTextFile
143        )
144        throws IOException
145    {
146        return getPageTokens(
147            Scrape.openConn(url), eliminateHTMLTags,
148            null, null,
149            rawHTMLFile, matchesFile, justTextFile
150        );
151    }
152    
153    /**
154     * Convenience Method.
155     * <BR />Accepts: {@code URL}
156     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
157     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
158     *      String, String, String, String, String)}
159     * <BR />And Invokes: {@link Scrape#openConn(URL)}
160     */
161    public static Vector<HTMLNode> getPageTokens(
162            URL url, boolean eliminateHTMLTags,
163            String startTag, String endTag,
164            String rawHTMLFile, String matchesFile, String justTextFile
165        )
166        throws IOException
167    {
168        return getPageTokens(
169            Scrape.openConn(url), eliminateHTMLTags,
170            startTag, endTag,
171            rawHTMLFile, matchesFile, justTextFile
172        );
173    }
174    
175    /**
176     * Convenience Method.
177     * <BR />Accepts: {@code URL}
178     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
179     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
180     *      int, int, String, String, String)}
181     * <BR />And Invokes: {@link Scrape#openConn(URL)}
182     */
183    public static Vector<HTMLNode> getPageTokens(
184            URL url, boolean eliminateHTMLTags,
185            int startLineNum, int endLineNum,
186            String rawHTMLFile, String matchesFile, String justTextFile
187        )
188        throws IOException
189    {
190        return getPageTokens(
191            Scrape.openConn(url), eliminateHTMLTags,
192            startLineNum, endLineNum,
193            rawHTMLFile, matchesFile, justTextFile
194        );
195    }
196
197
198    // ********************************************************************************************
199    // ********************************************************************************************
200    // These 6 functions presume that the HTML source is from a CharSequence
201    // ********************************************************************************************
202    // ********************************************************************************************
203
204
205    /**
206     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
207     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
208     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
209     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
210     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any
211     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
212     * method that neither invokes the file-system, nor the web.
213     */
214    public static Vector<HTMLNode> getPageTokens
215        (CharSequence html, boolean eliminateHTMLTags)
216        // NO IOException... NO I/O!
217    {
218        try
219            { return parser.parse(html, eliminateHTMLTags, null, null, null); }
220
221        // This should never happen, when reading from a 'String' rather than a URL, or
222        // BufferedReader ==> IOException will not be thrown.
223
224        catch (IOException ioe)
225            { throw new UnreachableError(ioe); }
226    }
227
228    /**
229     * Convenience Method.
230     * <BR />Accepts: {@code CharSequence}
231     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
232     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
233     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
234     *      String, String, String, String, String)}
235     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
236     * possible!
237     */
238    public static Vector<HTMLNode> getPageTokens
239        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
240    // NO IOException... NO I/O!
241    {
242        try
243            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }
244
245        // This should never happen, when reading from a 'String' rather than a URL, or
246        // BufferedReader ==> IOException will not be thrown.
247
248        catch (IOException ioe)
249            { throw new UnreachableError(ioe); }
250    }
251    
252    /**
253     * Convenience Method.
254     * <BR />Accepts: {@code CharSequence}
255     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
256     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
257     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
258     *      int, int, String, String, String)}
259     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
260     * possible!
261     */
262    public static Vector<HTMLNode> getPageTokens
263        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
264        // NO IOException... NO I/O!
265    {
266        try
267        { 
268            return getPageTokens
269                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
270        }
271
272        // This should never happen, when reading from a 'String' rather than a URL, or
273        // BufferedReader ==> IOException will not be thrown.
274
275        catch (IOException ioe)
276            { throw new UnreachableError(ioe); }
277    }
278
279    /**
280     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
281     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
282     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
283     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
284     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
285     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
286     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
287     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
288     */
289    public static Vector<HTMLNode> getPageTokens(
290            CharSequence html, boolean eliminateHTMLTags,
291            String rawHTMLFile, String matchesFile, String justTextFile
292        )
293        throws IOException
294    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }
295
296    /**
297     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
298     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
299     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
300     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
301     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
302     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
303     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
304     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
305     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
306     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
307     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
308     */
309    public static Vector<HTMLNode> getPageTokens(
310            CharSequence html, boolean eliminateHTMLTags,
311            String startTag, String endTag,
312            String rawHTMLFile, String matchesFile, String justTextFile
313        )
314        throws IOException
315    {
316        String  htmlStr = html.toString();
317
318        int sPos = htmlStr.indexOf(startTag);
319
320        if (sPos == -1) throw new IllegalArgumentException
321            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");
322
323        int ePos = htmlStr.indexOf(endTag, sPos);
324
325        if (ePos == -1) throw new IllegalArgumentException
326            ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");
327
328        ePos += endTag.length();
329
330        return parser.parse(
331            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
332            rawHTMLFile, matchesFile, justTextFile
333        );
334    }
335    
336    /**
337     * Convenience Method.
338     * <BR />Accepts: {@code CharSequence}
339     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
340     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
341     *      int, int, String, String, String)}
342     */
343    public static Vector<HTMLNode> getPageTokens(
344            CharSequence html, boolean eliminateHTMLTags,
345            int startLineNum, int endLineNum,
346            String rawHTMLFile, String matchesFile, String justTextFile
347        ) 
348        throws IOException
349    {
350        return getPageTokens(
351            new BufferedReader(new StringReader(html.toString())),
352            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
353        );
354    }
355
356
357    // ********************************************************************************************
358    // ********************************************************************************************
359    // The next 6 functions presume that the input is from a BufferedReader
360    // ********************************************************************************************
361    // ********************************************************************************************
362
363
364    /**
365     * Convenience Method.
366     * <BR />Accepts: {@code BufferedReader}
367     * <BR />Passes null to parameters
368     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
369     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
370     *      String, String, String, String, String)}
371     */
372    public static Vector<HTMLNode> getPageTokens
373        (BufferedReader br, boolean eliminateHTMLTags)
374        throws IOException
375    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }
376
377    /**
378     * Convenience Method.
379     * <BR />Accepts: {@code BufferedReader}
380     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
381     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
382     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
383     *      String, String, String, String, String)}
384     */ 
385    public static Vector<HTMLNode> getPageTokens
386        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
387        throws IOException
388    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }
389
390    /**
391     * Convenience Method.
392     * <BR />Accepts: {@code BufferedReader}
393     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
394     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
395     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
396     *      int, int, String, String, String)}
397     */
398    public static Vector<HTMLNode> getPageTokens
399        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
400        throws IOException
401    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }
402
403
404    /**
405     * Convenience Method.
406     * <BR />Accepts: {@code BufferedReader}
407     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
408     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
409     *      String, String, String, String, String)}
410     */
411    public static Vector<HTMLNode> getPageTokens(
412            BufferedReader br, boolean eliminateHTMLTags,
413            String rawHTMLFile, String matchesFile, String justTextFile
414        )
415        throws IOException
416    {
417        return getPageTokens
418            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
419    }
420
421
422    // ********************************************************************************************
423    // ********************************************************************************************
424    // 
425    // ********************************************************************************************
426    // ********************************************************************************************
427
428
429    /**
430     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
431     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
432     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
433     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
434     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
435     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
436     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
437     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
438     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
439     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
440     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
441     */
442    public static Vector<HTMLNode> getPageTokens(
443            BufferedReader br, boolean eliminateHTMLTags,
444            String startTag, String endTag,
445            String rawHTMLFile, String matchesFile, String justTextFile
446        )
447        throws IOException
448    {
449        return parser.parse(
450            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
451            matchesFile, justTextFile
452        );
453    }
454
455    /**
456     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
457     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
458     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
459     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
460     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
461     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
462     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
463     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
464     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
465     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
466     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
467     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
468     */
469    public static Vector<HTMLNode> getPageTokens(
470            BufferedReader br, boolean eliminateHTMLTags,
471            int startLineNum, int endLineNum,
472            String rawHTMLFile, String matchesFile, String justTextFile
473        )
474        throws IOException
475    {
476        return parser.parse(
477            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
478            rawHTMLFile, matchesFile, justTextFile
479        );
480    }
481}