package Torello.HTML;

import java.io.*;
import java.util.Vector;
import java.net.URL;

import Torello.JavaDoc.Annotations.StaticFunctional;
import Torello.JavaDoc.Annotations.StaticFunctional.Excuse;

import Torello.Java.UnreachableError;

/**
 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's}
 * of {@link HTMLNode}.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
 *
 * @see Scrape#getHTML(BufferedReader, int, int)
 * @see Scrape#getHTML(BufferedReader, String, String)
 * @see HTMLPageMWT
 */
@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
@Torello.JavaDoc.Annotations.JDHeaderBackgroundImg
public class HTMLPage
{
    // Static-only class: the private constructor prevents instantiation.
    private HTMLPage() { }

    /**
     * A function-pointer / lambda-target that (could) potentially be used to replace this
     * library's current regular-expression based parser with something possibly faster or even
     * more efficient.
     *
     * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER>
     * @see #parser
     */
    @FunctionalInterface
    public static interface Parser
    {
        /**
         * Parse html source-text into a {@code Vector<HTMLNode>}.
         * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
         * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
         *
         * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
         *
         * <BR /><BR /><DIV CLASS=JDHint> If you have decided to implement a parser, and you wish
         * to ignore this parameter (and don't want to output such a file) - it is (hopefully)
         * obvious that you may skip this step!</DIV>
         *
         * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
         *
         * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
         * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV>
         *
         * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
         * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
         */
        public Vector<HTMLNode> parse(
                CharSequence html,
                boolean eliminateHTMLTags,
                String rawHTMLFile,
                String matchesFile,
                String justTextFile
            )
            throws IOException;
    }

    /**
     * If needing to "swap a proprietary parser" comes up, this is possible.
     * It just needs to accept the same parameters as the current parser, and produce a
     * {@code Vector<HTMLNode>}.  This is not an advised step to take, but if an alternative
     * parser has been tested and happens to be generating different results, it can be easily
     * 'swapped out' for the one used now.
     *
     * <BR /><BR />Every {@code getPageTokens} method in this class hands its text to whatever
     * instance is currently assigned to this field.
     *
     * @see Parser
     * @see Parser#parse
     */
    public static Parser parser = ParserRE::parsePageTokens;


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source needs to be downloaded & read from a URL
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags)
        throws IOException
    {
        return getPageTokens
            (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags, String startTag, String endTag)
        throws IOException
    {
        return getPageTokens
            (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens
        (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        throws IOException
    {
        return getPageTokens
            (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />Passes null to {@code startTag} & {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return getPageTokens(
            Scrape.openConn(url), eliminateHTMLTags,
            null, null,
            rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return getPageTokens(
            Scrape.openConn(url), eliminateHTMLTags,
            startTag, endTag,
            rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     * <BR />And Invokes: {@link Scrape#openConn(URL)}
     */
    public static Vector<HTMLNode> getPageTokens(
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return getPageTokens(
            Scrape.openConn(url), eliminateHTMLTags,
            startLineNum, endLineNum,
            rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source is from a CharSequence
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</SPAN></B> This method does not throw any
     * checked-exceptions, there is no Input-Output involved here, it is strictly a computational
     * method that neither invokes the file-system, nor the web.
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags)
        // NO IOException... NO I/O!
    {
        try
            { return parser.parse(html, eliminateHTMLTags, null, null, null); }

        // This should never happen, when reading from a 'String' rather than a URL, or
        // BufferedReader ==> IOException will not be thrown.

        catch (IOException ioe)
            { throw new UnreachableError(ioe); }
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
     *      String, String, String, String, String)}
     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
     * possible!
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag)
        // NO IOException... NO I/O!
    {
        try
            { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); }

        // This should never happen, when reading from a 'String' rather than a URL, or
        // BufferedReader ==> IOException will not be thrown.

        catch (IOException ioe)
            { throw new UnreachableError(ioe); }
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean,
     *      int, int, String, String, String)}
     * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't
     * possible!
     */
    public static Vector<HTMLNode> getPageTokens
        (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        // NO IOException... NO I/O!
    {
        try
        {
            return getPageTokens
                (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
        }

        // This should never happen, when reading from a 'String' rather than a URL, or
        // BufferedReader ==> IOException will not be thrown.

        catch (IOException ioe)
            { throw new UnreachableError(ioe); }
    }

    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); }

    /**
     * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source.
     *
     * <BR /><BR />Only the sub-string that begins at the first occurrence of {@code startTag},
     * and ends with the last character of the first occurrence of {@code endTag} located at or
     * after that position, is handed to the {@link #parser}.  Both tags are included in the
     * parsed text.
     *
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     *
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * <BR /><BR />When null is passed, parsing begins at the first character of {@code html}.
     *
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * <BR /><BR />When null is passed, parsing continues to the last character of {@code html}.
     *
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     *
     * @throws IllegalArgumentException If a non-null {@code startTag} does not occur in
     * {@code html}, or if a non-null {@code endTag} does not occur at or after the position where
     * parsing begins.
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        String htmlStr = html.toString();

        // A null 'startTag' means "no lower bound".  Without this check, String.indexOf(null)
        // would throw a bare NullPointerException.

        int sPos = (startTag == null) ? 0 : htmlStr.indexOf(startTag);

        if (sPos == -1) throw new IllegalArgumentException
            ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML.");

        int ePos;

        // A null 'endTag' means "no upper bound" - parse through the end of the text.
        if (endTag == null) ePos = htmlStr.length();

        else
        {
            // The search for 'endTag' begins at the start-position, not after 'startTag'
            ePos = htmlStr.indexOf(endTag, sPos);

            if (ePos == -1) throw new IllegalArgumentException
                ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML.");

            // Move past the end-tag, so that it is included in the parsed sub-string
            ePos += endTag.length();
        }

        return parser.parse(
            htmlStr.substring(sPos, ePos), eliminateHTMLTags,
            rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code CharSequence}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Wraps {@code 'html'} in a {@code StringReader} and a {@code BufferedReader}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            CharSequence html, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return getPageTokens(
            new BufferedReader(new StringReader(html.toString())),
            eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The next 6 functions presume that the input is from a BufferedReader
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags)
        throws IOException
    { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag)
        throws IOException
    { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      int, int, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens
        (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum)
        throws IOException
    { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); }


    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader}
     * <BR />Passes null to {@code startTag} & {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean,
     *      String, String, String, String, String)}
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return getPageTokens
            (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The 2 BufferedReader methods below read the text using Scrape.getHTML, and pass it to the
    // parser.  The BufferedReader & URL convenience methods above all end up at one of these.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     * @see Scrape#getHTML(BufferedReader, String, String)
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return parser.parse(
            Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile,
            matchesFile, justTextFile
        );
    }

    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     * @see Scrape#getHTML(BufferedReader, int, int)
     */
    public static Vector<HTMLNode> getPageTokens(
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException
    {
        return parser.parse(
            Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags,
            rawHTMLFile, matchesFile, justTextFile
        );
    }
}