001package Torello.HTML; 002 003import Torello.HTML.*; 004import Torello.Java.FileRW; 005import java.util.*; 006import java.io.*; 007import java.net.*; 008 009/** 010 * Demonstrates using 'Splash,' which is one of many ways to execute the Java-Script on 011 * Web-Pages, before those pages are parsed. 012 * 013 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_BRIDGE> 014 */ 015public class SplashBridge 016{ 017 private SplashBridge() { } 018 019 /** 020 * Once the {@code Splash HTTP Server} is running (which requires the {@code Docker} loading 021 * and installation tool, all one has to do is <I><B>prepend this {@code String}</B></I> to 022 * any {@code URL}, and the {@code Splash Script Executor} will be invoked on the HTML and 023 * Script that is received from that {@code URL}; 024 * 025 * <DIV CLASS="EXAMPLE">{@code 026 * String myURL = "https://cars.com"; 027 * URL withSplashServerURL = new URL(SplashBridge.SPLASH_URL + myURL); 028 * 029 * // Here, just use the standard HTML scrape and parsing routines to retrieve the HTML 030 * // from the URL 'myURL'. Splash will execute any 'dynamic HTML' that is loaded via the 031 * // standard script libraries like AJAX, JSON, React-JS, jQuery, or Angular. 032 * 033 * Vector<HTMLNode> html = HTMLPage.getPageTokens(withSplashServerURL, false); 034 * 035 * // NOTE: The above invocation will not call the "www.cars.com" server, BUT RATHER, will 036 * // ask the HTTP Server running on the local host as a PROXY to retrieve the HTML 037 * // from "www.cars.com". Before returning that HTML, the local proxy server will also 038 * // execute the dynamic-loading script that is present on the main page of "cars.com" 039 * // 040 * // ALSO: There are other libraries that perform this type of work: Selenium, and Android 041 * // class WebView. 042 * }</DIV> 043 */ 044 public static final String SPLASH_URL = "http://localhost:8050/render.html?url="; 045 046 /** 047 * <EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_DOCKER> 048 * @throws IOException If there are any {@code HTTP} errors when downloading or processing 049 * the HTML. 050 */ 051 public static void example01() throws IOException 052 { 053 // Call the splash-bridge running on local-host @ port 8050 054 // The "wait" parameter means it will wait up to four seconds to run java-script AJAX 055 // data-retrieval tasks that are on the page. 056 057 String urlStr = 058 "http://localhost:8050/render.html?url=" + 059 "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156" + 060 "&timeout=10&wait=4.0"; 061 062 URL url = new URL(urlStr); 063 064 // This will just use the standard Java HTTP URLConnection class to connect to the exact 065 // same page. 066 067 String urlStr2 = "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156"; 068 069 URL url2 = new URL(urlStr2); 070 071 // Download both versions. This version is contacting a Splash Server on a local host 072 // running @ port 8050 073 // NOTE: This writes the HTML to a Flat-File on the File-System. 074 075 Vector<HTMLNode> v = HTMLPage.getPageTokens(url, false); 076 077 FileRW.writeFile(Util.pageToString(v), "cc.html"); 078 079 // This version is contacting Wikipedia.com, and ignoring any possible AJAX or Java-Script 080 // calls - script calls of any kind are being ignored by this version. 081 // NOTE: This writes the HTML to a Flat-File on the File-System. 082 083 Vector<HTMLNode> v2 = HTMLPage.getPageTokens(url2, false); 084 085 FileRW.writeFile(Util.pageToString(v2), "cc2.html"); 086 087 // FileOutput Size: Version 1: 650737 Nov 4 18:28 cc.html 088 // FileOutput Size: Version 2: 493879 Nov 4 18:28 cc2.html 089 // RESULTS: Clearly there is quite a bit of downloaded data from AJAX & Splash 090 } 091 092 // public static void main(String[] argv) throws IOException { example02(); } 093}