001package Torello.Browser;
002
003
004// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
005// UNIX Terminal Shell Colors (Text Colors)
006// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
007
008import static Torello.Java.C.BPURPLE_BKGND;
009import static Torello.Java.C.BRED;
010import static Torello.Java.C.BCYAN;
011import static Torello.Java.C.BYELLOW_BKGND;
012import static Torello.Java.C.RESET;
013
014
015// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
016// Browser-API, JavaScript-API Commands
017// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
018
019import Torello.Browser.JavaScriptAPI.RunTime;
020
021import Torello.Browser.BrowserAPI.DOM;
022import Torello.Browser.BrowserAPI.Target;
023import Torello.Browser.BrowserAPI.Page;
024
025
026// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
027// Misc Torello.Java
028// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
029
030import Torello.Java.StrIndent;
031import Torello.Java.StringParse;
032import Torello.Java.EXCC;
033import Torello.Java.Verbosity;
034
035import Torello.Java.Additional.Ret2;
036
037import Torello.HTML.Tools.Images.Request;
038import Torello.HTML.Tools.Images.Results;
039import Torello.HTML.Tools.Images.ImageScraper;
040
041
042// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
043// Parsing the Output of this program
044// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
045
046import Torello.HTML.HTMLPage;
047import Torello.HTML.HTMLNode;
048import Torello.HTML.Attributes;
049import Torello.HTML.TC;
050
051import Torello.HTML.NodeSearch.TagNodeFind;
052
053
054// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
055// Standard Java Libraries
056// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
057
058import java.util.function.Consumer;
059
060import java.io.IOException;
061import java.util.function.Predicate;
062import java.util.stream.Collectors;
063import java.util.stream.Stream;
064import java.util.Arrays;
065import java.util.List;
066import java.util.Vector;
067
068import javax.json.JsonString;
069
070/**
071 * An example of this package's utility.
072 * <EMBED CLASS='external-html' DATA-FILE-ID=EXAMPLE>
073 */
074public class Example01
075{
076    private Example01() { }
077
078    /** The URL that is being scraped in this example */
079    protected static final String samAltmanURL = "https://en.wikipedia.org/wiki/Sam_Altman";
080
081    protected static final ConnRecord connRec = new ConnRecord();
082
083    static 
084    {
085        connRec.setEventHandler(Printing::printObj);
086        connRec.setBrowserErrorHandler(Printing::printObj);
087        connRec.setRDPErrorHandler(Printing::printObj);
088        connRec.setRawTextReceiver(System.out);
089        connRec.setAppTextReceiver(System.out);
090        connRec.setErrTextReceiver(System.out);
091    }
092
093
094    // ********************************************************************************************
095    // ********************************************************************************************
096    // MAIN METHOD
097    // ********************************************************************************************
098    // ********************************************************************************************
099
100
101    /** This class is intended to be invoked from the Command Line. */
102    public static void main(String[] argv) throws Exception
103    {
104        // Opening a WebSocket Browser-Connection to the currently running Chrome-Instance
105        final WebSocketSender bws = STEP_01_openBrowserWebSocket();
106
107        // Close any currently opened pages / tabs inside the browser
108        STEP_02_closeAllPages(bws);
109
110        // Open a Browser-Page (using 'bws') for reading Sam Altman's Wikipedia Profile
111        final String targetID = STEP_03_openSamAltmanPage(bws);
112
113        // Create / Build a WebSocket-Connection object to the newly opened Sam Altman Page.
114        final WebSocketSender pws = STEP_04_getPageWebSocket(targetID);
115
116        // Execute some Java-Script so that the scrape code may run
117        final String html = STEP_05_runJavaScript(pws);
118
119        // Print the Image-URL's, retrieve those URL's too
120        final String[] imgURLs = STEP_06_extractImageURLs(html);
121
122        // Download the Images into a download folder
123        STEP_07_downloadImages(imgURLs);
124
125        bws.disconnect();
126        pws.disconnect();
127    }
128
129
130    // ********************************************************************************************
131    // ********************************************************************************************
132    // The Steps
133    // ********************************************************************************************
134    // ********************************************************************************************
135
136
137    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
138    // STEP-01: Opening a WebSocket Browser Connection
139    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
140
141    /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP01> */
142    protected static WebSocketSender STEP_01_openBrowserWebSocket() throws Exception
143    {
144        Printing.notice("Opening a WebSocket Browser Connection...");
145
146        final BrowserConn browserConn = BrowserConn.getBrowserConn(9222, false);
147
148        System.out.println(
149            '\n' + BCYAN + "Example01.java: " + RESET +
150            BRED + "Opened Browser Connection:\n" + RESET + browserConn.toString()
151        );
152
153        final WebSocketSender bws = browserConn.createSender(Example01.connRec);
154
155        // Chat-GPT once suggested this line. I just haven't removed it.  It's not hurting anyone!
156        Thread.sleep(1000);
157
158        return bws;
159    }
160
161
162    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
163    // STEP-02: Close all Opened Browser Tabs
164    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
165
166    /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP02> */
167    protected static void STEP_02_closeAllPages(WebSocketSender bws) throws Exception
168    {
169        Printing.notice("Closing All Currently Open Pages, using BrowserConn");
170
171
172        // This is currently unused.  I used to filter for only the opened Wiki-Pages, but now this
173        // method simply closes every open page.  No sense in deleting this line, though
174
175        final Predicate<Target.TargetInfo> isSamAltman = (Target.TargetInfo t) ->
176                t.type.equals("page")
177            &&  (t.url != null)
178            &&  (t.url.startsWith(samAltmanURL));
179
180        System.out.println
181            ('\n' + BCYAN + "Example01.java: " + RESET + "Getting all tabs...");
182    
183        final Target.TargetInfo[] allTabs = Target
184            .getTargets(null /* FilterEntry[] */)
185            .exec(bws)
186            .await();
187
188        System.out.println
189            ('\n' + BCYAN + "Example01.java: " + RESET + "Found " + allTabs.length + " tabs.");
190
191        if (allTabs.length > 0)
192
193            for (int i = 0; i < allTabs.length; i++)
194            {
195                final String tid = allTabs[i].targetId;
196                System.out.println(BRED + "Closing Tab: " + RESET + tid);
197                Target.closeTarget(tid).exec(bws).await();
198            }
199    }
200
201
202    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
203    // STEP-03: Open a Web-Browser Page / Tab that is reading the Wiki for Open-AI's Sam Altman
204    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
205
206    /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP03> */
207    protected static String STEP_03_openSamAltmanPage(final WebSocketSender bws) throws Exception
208    {
209        Printing.notice("Opening a Sam Altman Wikipedia Page, using BrowserConn.");
210
211        final String targetID = Target
212            .createTarget()
213            .accept("url", samAltmanURL)
214            .build()
215            .exec(bws)
216            .await();
217
218        final Target.TargetInfo targetInfo = Target
219            .getTargetInfo(targetID)
220            .exec(bws)
221            .await();
222
223        System.out.println(
224            '\n' + BCYAN + "Example01.java: " + RESET +
225            BRED + "Created New Tab:\n" + RESET + targetInfo.toString()
226        );
227
228
229        // I leave these one second delays here.  AGAIN - Chat-GPT suggested them to me once.
230        // Chat-GPT, in every sense of the word, knows more about my code than I do!  (The CDP 
231        // Protocol is a very well understood protocol - just not in Java so much)
232
233        Thread.sleep(1000);
234
235        return targetID;
236    }
237
238
239    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
240    // STEP-04: Create a "PageConn" Object to the recently opened Sam Altman Wikipedia Page
241    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
242
243    /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP04> */
244    protected static WebSocketSender STEP_04_getPageWebSocket(final String targetID)
245        throws Exception
246    {
247        Printing.notice("Create PageConn Web-Socket Connection to Altman's Wiki");
248
249        // Attach to that Sam Altman Page (switch to tab-level WebSocket)
250        final PageConn pageConn = PageConn
251            .getAllPageConn(9222, false)
252            .filter((PageConn pc) -> pc.id.equals(targetID))
253            .findFirst()
254            .orElseThrow(() -> new RuntimeException("The Page-Connection was Not found !!!"));
255
256        System.out.println(
257            '\n' + BCYAN + "Example01.java: " + RESET +
258            BRED + "Found Page Connection to Sam Altman Wiki:\n" + RESET + pageConn.toString()
259        );
260
261        final WebSocketSender pws = pageConn.createSender(Example01.connRec);
262
263
264        // I think this is the last one...  Wait 1 second, it might make a difference while the 
265        // page actually loads, and the Web-Socket connects... I have no idea!  It's just 1 second!
266
267        Thread.sleep(1000);
268
269        return pws;
270    }
271
272
273    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
274    // STEP-05: Execute some Java-Script
275    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
276
277    /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP05> */
278    protected static String STEP_05_runJavaScript(final WebSocketSender pws) throws Exception
279    {
280        Printing.notice("Execute the needed Java Script, so the Scraper can Run");
281
282        // Enable the Page domain
283        System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "Page.enable()");
284        Page.enable(null /* Boolean */).exec(pws).await();
285
286        // Enable the DOM domain
287        System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "DOM.enable()");
288        DOM.enable(null /* String */).exec(pws).await();
289
290        // Enable the Runtime domain
291        System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "RunTime.enable()");
292        RunTime.enable().exec(pws).await();
293
294        // This is the actual last one.  Make sure that the DOM & RunTime modules are running!
295        Thread.sleep(1000);
296
297        // 5. Evaluate the HTML via JavaScript
298        System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "RunTime.evaluate()");
299
300        final RunTime.evaluate$$RET r = RunTime
301            .evaluate()
302            .accept("expression", "document.documentElement.outerHTML")
303            .accept("returnByValue", true)
304            .build()
305            .exec(pws)
306            .await();
307
308        System.out.println(
309            '\n' + BCYAN + "Example01.java: " + RESET + "Response RemoteObject:" + '\n' +
310            r.result.toString()
311        );
312
313        final String html = ((JsonString) r.result.value).getString();
314
315        return html;
316    }
317
318
319    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
320    // STEP-06: Print the Image-URL's
321    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
322
323    /**
324     * <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP06>
325     * @see HTMLPage#getPageTokens(CharSequence, boolean)
326     * @see TagNodeFind
327     * @see Attributes#retrieve(Vector, int[], String)
328     */
329    protected static String[] STEP_06_extractImageURLs(final String html) throws Exception
330    {
331        Printing.notice("Parsing HTML for Images Printing the URL's");
332
333        final Vector<HTMLNode>      altPage = HTMLPage.getPageTokens(html, false);
334        final int[]                 images  = TagNodeFind.all(altPage, TC.OpeningTags, "img");
335        final String[]              imgURLs = Attributes.retrieve(altPage, images, "src");
336        final int                   numImg  = imgURLs.length;
337
338        System.out.println
339            ('\n' + BCYAN + "Example01.java: " + RESET + "Number of Images Found: " + numImg);
340
341        for (int i = 0; i < numImg; i++) System.out.println("    " + imgURLs[i]);
342
343        return imgURLs;
344    }
345
346
347    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
348    // STEP-07: Download the Image's into a folder
349    // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
350
351    /**
352     * <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP07>
353     * @see ImageScraper#download(Request, Appendable)
354     * @see Request
355     * @see Results
356     * @see ImageScraper#shutdownTOThreads()
357     */
358    protected static void STEP_07_downloadImages(final String[] imageURLs) throws Exception
359    {
360        Printing.notice("Download the Image's into a folder");
361
362        final Stream.Builder<String> builder = Stream.builder();
363
364        for (int i = 0; i < imageURLs.length; i++)
365            if (imageURLs[i].startsWith("//"))
366                builder.accept("https:" + imageURLs[i]);
367
368        // Build a Request-Object
369        final List<String>  imgURLsList = builder.build().collect(Collectors.toList());
370        final Request       req         = Request.buildFromStrIter(imgURLsList);
371
372        // Add a few more Scraper-Configurations to the Request Object
373        req.targetDirectory                     = "image-downloads/";
374        req.useDefaultCounterForImageFileNames  = true;
375        req.skipOnDownloadException             = true;
376        req.verbosity                           = Verbosity.Normal;
377
378        try 
379            // Run the scraper, Send all Text-Output to 'System.out' (Ignore / Discard Results)
380            { final Results results = ImageScraper.download(req, System.out); }
381
382        catch (Exception e)
383            { System.out.println(EXCC.toString(e)); }
384
385        finally 
386            // This needs to happen, or this entire program will hang / lock up the terminal
387            { ImageScraper.shutdownTOThreads(); }
388    }
389}