001package Torello.HTML; 002 003import java.io.*; 004import java.util.Vector; 005import java.net.URL; 006 007import Torello.JavaDoc.Excuse; 008import Torello.Java.UnreachableError; 009 010/** 011 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's} 012 * of {@link HTMLNode}. 013 * 014 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE> 015 * 016 * @see Scrape#getHTML(BufferedReader, int, int) 017 * @see Scrape#getHTML(BufferedReader, String, String) 018 * @see HTMLPageMWT 019 */ 020@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON) 021@Torello.JavaDoc.JDHeaderBackgroundImg 022public class HTMLPage 023{ 024 private HTMLPage() { } 025 026 /** 027 * A function-pointer / lambda-target that (could) potentially be used to replace this 028 * library's current regular-expression based parser with something possibly faster or even 029 * more efficient. 030 * 031 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER> 032 * @see #parser 033 */ 034 @FunctionalInterface 035 public static interface Parser 036 { 037 /** 038 * Parse html source-text into a {@code Vector<HTMLNode>}. 039 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 040 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 041 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 042 * <BR /><BR /><B STYLE='color:red;'>NOTE:</B> If you have decided to implement a parser, 043 * and you wish to ingore this parameter (and don't want to output such a file) - <I>it is 044 * (hopefully) obvious that you may skip this step!</I> 045 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 046 * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this. 047 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 048 * <BR /><BR /><B>NOTE:</B> <I>As above,</I> you may skip implementing this. 049 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 050 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 051 */ 052 public Vector<HTMLNode> parse( 053 CharSequence html, 054 boolean eliminateHTMLTags, 055 String rawHTMLFile, 056 String matchesFile, 057 String justTextFile 058 ) 059 throws IOException; 060 } 061 062 /** 063 * If needing to "swap a proprietary parser" comes up, this is possible. 064 * It just needs to accept the same parameters as the current parser, and produce a 065 * {@code Vector<HTMLNode>.} This is not an advised step to take, but if an alternative 066 * parser has been tested and happens to be generating different results, it can be easily 067 * 'swapped out' for the one used now. 068 * @see Parser 069 * @see Parser#parse 070 */ 071 public static Parser parser = ParserRE::parsePageTokens; 072 073 074 // ******************************************************************************************** 075 // ******************************************************************************************** 076 // These 6 functions presume that the HTML source needs to be downloaded & read from a URL 077 // ******************************************************************************************** 078 // ******************************************************************************************** 079 080 081 /** 082 * Convenience Method. 083 * <BR />Accepts: {@code URL} 084 * <BR />Passes null to parameters 085 * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}. 086 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 087 * String, String, String, String, String)} 088 * <BR />And Invokes: {@link Scrape#openConn(URL)} 089 */ 090 public static Vector<HTMLNode> getPageTokens 091 (URL url, boolean eliminateHTMLTags) 092 throws IOException 093 { 094 return getPageTokens 095 (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null); 096 } 097 098 /** 099 * Convenience Method. 100 * <BR />Accepts: {@code URL} 101 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 102 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 103 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 104 * String, String, String, String, String)} 105 * <BR />And Invokes: {@link Scrape#openConn(URL)} 106 */ 107 public static Vector<HTMLNode> getPageTokens 108 (URL url, boolean eliminateHTMLTags, String startTag, String endTag) 109 throws IOException 110 { 111 return getPageTokens 112 (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null); 113 } 114 115 /** 116 * Convenience Method. 117 * <BR />Accepts: {@code URL} 118 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 119 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 120 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 121 * int, int, String, String, String)} 122 * <BR />And Invokes: {@link Scrape#openConn(URL)} 123 */ 124 public static Vector<HTMLNode> getPageTokens 125 (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 126 throws IOException 127 { 128 return getPageTokens 129 (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null); 130 } 131 132 /** 133 * Convenience Method. 134 * <BR />Accepts: {@code URL} 135 * <BR />Passes null to {@code startTag} & {@code endTag} parameters. 136 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 137 * String, String, String, String, String)} 138 * <BR />And Invokes: {@link Scrape#openConn(URL)} 139 */ 140 public static Vector<HTMLNode> getPageTokens( 141 URL url, boolean eliminateHTMLTags, 142 String rawHTMLFile, String matchesFile, String justTextFile 143 ) 144 throws IOException 145 { 146 return getPageTokens( 147 Scrape.openConn(url), eliminateHTMLTags, 148 null, null, 149 rawHTMLFile, matchesFile, justTextFile 150 ); 151 } 152 153 /** 154 * Convenience Method. 155 * <BR />Accepts: {@code URL} 156 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 157 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 158 * String, String, String, String, String)} 159 * <BR />And Invokes: {@link Scrape#openConn(URL)} 160 */ 161 public static Vector<HTMLNode> getPageTokens( 162 URL url, boolean eliminateHTMLTags, 163 String startTag, String endTag, 164 String rawHTMLFile, String matchesFile, String justTextFile 165 ) 166 throws IOException 167 { 168 return getPageTokens( 169 Scrape.openConn(url), eliminateHTMLTags, 170 startTag, endTag, 171 rawHTMLFile, matchesFile, justTextFile 172 ); 173 } 174 175 /** 176 * Convenience Method. 177 * <BR />Accepts: {@code URL} 178 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 179 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 180 * int, int, String, String, String)} 181 * <BR />And Invokes: {@link Scrape#openConn(URL)} 182 */ 183 public static Vector<HTMLNode> getPageTokens( 184 URL url, boolean eliminateHTMLTags, 185 int startLineNum, int endLineNum, 186 String rawHTMLFile, String matchesFile, String justTextFile 187 ) 188 throws IOException 189 { 190 return getPageTokens( 191 Scrape.openConn(url), eliminateHTMLTags, 192 startLineNum, endLineNum, 193 rawHTMLFile, matchesFile, justTextFile 194 ); 195 } 196 197 198 // ******************************************************************************************** 199 // ******************************************************************************************** 200 // These 6 functions presume that the HTML source is from a CharSequence 201 // ******************************************************************************************** 202 // ******************************************************************************************** 203 204 205 /** 206 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 207 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 208 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 209 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 210 * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any 211 * checked-exceptions, there is no Input-Output involved here, it is strictly a computational 212 * method that neither invokes the file-system, nor the web. 213 */ 214 public static Vector<HTMLNode> getPageTokens 215 (CharSequence html, boolean eliminateHTMLTags) 216 // NO IOException... NO I/O! 217 { 218 try 219 { return parser.parse(html, eliminateHTMLTags, null, null, null); } 220 221 // This should never happen, when reading from a 'String' rather than a URL, or 222 // BufferedReader ==> IOException will not be thrown. 223 224 catch (IOException ioe) 225 { throw new UnreachableError(ioe); } 226 } 227 228 /** 229 * Convenience Method. 230 * <BR />Accepts: {@code CharSequence} 231 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 232 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 233 * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean, 234 * String, String, String, String, String)} 235 * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't 236 * possible! 237 */ 238 public static Vector<HTMLNode> getPageTokens 239 (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag) 240 // NO IOException... NO I/O! 241 { 242 try 243 { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); } 244 245 // This should never happen, when reading from a 'String' rather than a URL, or 246 // BufferedReader ==> IOException will not be thrown. 247 248 catch (IOException ioe) 249 { throw new UnreachableError(ioe); } 250 } 251 252 /** 253 * Convenience Method. 254 * <BR />Accepts: {@code CharSequence} 255 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 256 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 257 * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean, 258 * int, int, String, String, String)} 259 * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't 260 * possible! 261 */ 262 public static Vector<HTMLNode> getPageTokens 263 (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 264 // NO IOException... NO I/O! 265 { 266 try 267 { 268 return getPageTokens 269 (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); 270 } 271 272 // This should never happen, when reading from a 'String' rather than a URL, or 273 // BufferedReader ==> IOException will not be thrown. 274 275 catch (IOException ioe) 276 { throw new UnreachableError(ioe); } 277 } 278 279 /** 280 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 281 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 282 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 283 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 284 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 285 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 286 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 287 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 288 */ 289 public static Vector<HTMLNode> getPageTokens( 290 CharSequence html, boolean eliminateHTMLTags, 291 String rawHTMLFile, String matchesFile, String justTextFile 292 ) 293 throws IOException 294 { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); } 295 296 /** 297 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 298 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 299 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 300 * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG> 301 * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG> 302 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 303 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 304 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 305 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 306 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 307 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2> 308 */ 309 public static Vector<HTMLNode> getPageTokens( 310 CharSequence html, boolean eliminateHTMLTags, 311 String startTag, String endTag, 312 String rawHTMLFile, String matchesFile, String justTextFile 313 ) 314 throws IOException 315 { 316 String htmlStr = html.toString(); 317 318 int sPos = htmlStr.indexOf(startTag); 319 320 if (sPos == -1) throw new IllegalArgumentException 321 ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML."); 322 323 int ePos = htmlStr.indexOf(endTag, sPos); 324 325 if (ePos == -1) throw new IllegalArgumentException 326 ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML."); 327 328 ePos += endTag.length(); 329 330 return parser.parse( 331 htmlStr.substring(sPos, ePos), eliminateHTMLTags, 332 rawHTMLFile, matchesFile, justTextFile 333 ); 334 } 335 336 /** 337 * Convenience Method. 338 * <BR />Accepts: {@code CharSequence} 339 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 340 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 341 * int, int, String, String, String)} 342 */ 343 public static Vector<HTMLNode> getPageTokens( 344 CharSequence html, boolean eliminateHTMLTags, 345 int startLineNum, int endLineNum, 346 String rawHTMLFile, String matchesFile, String justTextFile 347 ) 348 throws IOException 349 { 350 return getPageTokens( 351 new BufferedReader(new StringReader(html.toString())), 352 eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile 353 ); 354 } 355 356 357 // ******************************************************************************************** 358 // ******************************************************************************************** 359 // The next 6 functions presume that the input is from a BufferedReader 360 // ******************************************************************************************** 361 // ******************************************************************************************** 362 363 364 /** 365 * Convenience Method. 366 * <BR />Accepts: {@code BufferedReader} 367 * <BR />Passes null to parameters 368 * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}. 369 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 370 * String, String, String, String, String)} 371 */ 372 public static Vector<HTMLNode> getPageTokens 373 (BufferedReader br, boolean eliminateHTMLTags) 374 throws IOException 375 { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); } 376 377 /** 378 * Convenience Method. 379 * <BR />Accepts: {@code BufferedReader} 380 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 381 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 382 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 383 * String, String, String, String, String)} 384 */ 385 public static Vector<HTMLNode> getPageTokens 386 (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag) 387 throws IOException 388 { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); } 389 390 /** 391 * Convenience Method. 392 * <BR />Accepts: {@code BufferedReader} 393 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 394 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 395 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 396 * int, int, String, String, String)} 397 */ 398 public static Vector<HTMLNode> getPageTokens 399 (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 400 throws IOException 401 { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); } 402 403 404 /** 405 * Convenience Method. 406 * <BR />Accepts: {@code BufferedReader} 407 * <BR />Passes null to {@code startTag} & {@code endTag} parameters. 408 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 409 * String, String, String, String, String)} 410 */ 411 public static Vector<HTMLNode> getPageTokens( 412 BufferedReader br, boolean eliminateHTMLTags, 413 String rawHTMLFile, String matchesFile, String justTextFile 414 ) 415 throws IOException 416 { 417 return getPageTokens 418 (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile); 419 } 420 421 422 // ******************************************************************************************** 423 // ******************************************************************************************** 424 // 425 // ******************************************************************************************** 426 // ******************************************************************************************** 427 428 429 /** 430 * Parses and Vectorizes HTML from a {@code BufferedReader} source. 431 * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR> 432 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 433 * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG> 434 * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG> 435 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 436 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 437 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 438 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 439 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 440 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2> 441 */ 442 public static Vector<HTMLNode> getPageTokens( 443 BufferedReader br, boolean eliminateHTMLTags, 444 String startTag, String endTag, 445 String rawHTMLFile, String matchesFile, String justTextFile 446 ) 447 throws IOException 448 { 449 return parser.parse( 450 Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile, 451 matchesFile, justTextFile 452 ); 453 } 454 455 /** 456 * Parses and Vectorizes HTML from a {@code BufferedReader} source. 457 * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR> 458 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 459 * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN> 460 * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN> 461 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 462 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 463 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 464 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 465 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 466 * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX> 467 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1> 468 */ 469 public static Vector<HTMLNode> getPageTokens( 470 BufferedReader br, boolean eliminateHTMLTags, 471 int startLineNum, int endLineNum, 472 String rawHTMLFile, String matchesFile, String justTextFile 473 ) 474 throws IOException 475 { 476 return parser.parse( 477 Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags, 478 rawHTMLFile, matchesFile, justTextFile 479 ); 480 } 481}