001package Torello.HTML;
002
003import Torello.HTML.*;
004import Torello.Java.FileRW;
005import java.util.*;
006import java.io.*;
007import java.net.*;
008
009/**
010 * Demonstrates using 'Splash,' which is one of many ways to execute the Java-Script on
011 * Web-Pages, before those pages are parsed.
012 * 
013 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_BRIDGE>
014 */
015public class SplashBridge
016{
017    private SplashBridge() { }
018
019    /**
020     * Once the {@code Splash HTTP Server} is running (which requires the {@code Docker} loading
021     * and installation tool, all one has to do is <I><B>prepend this {@code String}</B></I> to
022     * any {@code URL}, and the {@code Splash Script Executor} will be invoked on the HTML and
023     * Script that is received from that {@code URL};
024     * 
025     * <DIV CLASS="EXAMPLE">{@code
026     * String   myURL               = "https://cars.com";
027     * URL      withSplashServerURL = new URL(SplashBridge.SPLASH_URL +  myURL);
028     *
029     * // Here, just use the standard HTML scrape and parsing routines to retrieve the HTML
030     * // from the URL 'myURL'.  Splash will execute any 'dynamic HTML' that is loaded via the
031     * // standard script libraries like AJAX, JSON, React-JS, jQuery, or Angular.
032     *
033     * Vector<HTMLNode> html = HTMLPage.getPageTokens(withSplashServerURL, false);
034     * 
035     * // NOTE: The above invocation will not call the "www.cars.com" server, BUT RATHER, will
036     * //       ask the HTTP Server running on the local host as a PROXY to retrieve the HTML
037     * //       from "www.cars.com".  Before returning that HTML, the local proxy server will also
038     * //       execute the dynamic-loading script that is present on the main page of "cars.com"
039     * // 
040     * // ALSO: There are other libraries that perform this type of work: Selenium, and Android
041     * //       class WebView.
042     * }</DIV>
043     */
044    public static final String SPLASH_URL = "http://localhost:8050/render.html?url=";
045
046    /**
047     * <EMBED CLASS='external-html' DATA-FILE-ID=SPLASH_DOCKER>
048     * @throws IOException If there are any {@code HTTP} errors when downloading or processing
049     * the HTML.
050     */
051    public static void example01() throws IOException
052    {
053        // Call the splash-bridge running on local-host @ port 8050
054        // The "wait" parameter means it will wait up to four seconds to run java-script AJAX
055        // data-retrieval tasks that are on the page.
056
057        String urlStr =
058            "http://localhost:8050/render.html?url=" + 
059            "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156" +
060                "&timeout=10&wait=4.0";
061
062        URL url = new URL(urlStr);
063
064        // This will just use the standard Java HTTP URLConnection class to connect to the exact
065        // same page.
066
067        String urlStr2 = "https://en.wikipedia.org/w/index.php?title=Christopher_Columbus&oldid=924321156";
068
069        URL url2 = new URL(urlStr2);
070
071        // Download both versions.  This version is contacting a Splash Server on a local host
072        // running @ port 8050
073        // NOTE: This writes the HTML to a Flat-File on the File-System.
074
075        Vector<HTMLNode> v = HTMLPage.getPageTokens(url, false);
076
077        FileRW.writeFile(Util.pageToString(v), "cc.html");
078
079        // This version is contacting Wikipedia.com, and ignoring any possible AJAX or Java-Script
080        // calls - script calls of any kind are being ignored by this version.
081        // NOTE: This writes the HTML to a Flat-File on the File-System.
082
083        Vector<HTMLNode> v2 = HTMLPage.getPageTokens(url2, false);
084
085        FileRW.writeFile(Util.pageToString(v2), "cc2.html");
086
087        // FileOutput Size: Version 1: 650737 Nov  4 18:28 cc.html
088        // FileOutput Size: Version 2: 493879 Nov  4 18:28 cc2.html
089        // RESULTS: Clearly there is quite a bit of downloaded data from AJAX & Splash
090    }
091
092    // public static void main(String[] argv) throws IOException { example02(); }
093}