001package Torello.Browser; 002 003 004// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 005// UNIX Terminal Shell Colors (Text Colors) 006// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 007 008import static Torello.Java.C.BPURPLE_BKGND; 009import static Torello.Java.C.BRED; 010import static Torello.Java.C.BCYAN; 011import static Torello.Java.C.BYELLOW_BKGND; 012import static Torello.Java.C.RESET; 013 014 015// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 016// Browser-API, JavaScript-API Commands 017// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 018 019import Torello.Browser.JavaScriptAPI.RunTime; 020 021import Torello.Browser.BrowserAPI.DOM; 022import Torello.Browser.BrowserAPI.Target; 023import Torello.Browser.BrowserAPI.Page; 024 025 026// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 027// Misc Torello.Java 028// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 029 030import Torello.Java.StrIndent; 031import Torello.Java.StringParse; 032import Torello.Java.EXCC; 033import Torello.Java.Verbosity; 034 035import Torello.Java.Additional.Ret2; 036 037import Torello.HTML.Tools.Images.Request; 038import Torello.HTML.Tools.Images.Results; 039import Torello.HTML.Tools.Images.ImageScraper; 040 041 042// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 043// Parsing the Output of this program 044// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 045 046import Torello.HTML.HTMLPage; 047import Torello.HTML.HTMLNode; 048import Torello.HTML.Attributes; 049import Torello.HTML.TC; 050 051import Torello.HTML.NodeSearch.TagNodeFind; 052 053 054// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 055// Standard Java Libraries 056// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 057 058import java.util.function.Consumer; 059 060import java.io.IOException; 061import java.util.function.Predicate; 062import java.util.stream.Collectors; 063import java.util.stream.Stream; 064import java.util.Arrays; 065import java.util.List; 066import java.util.Vector; 067 068import javax.json.JsonString; 069 070/** 071 * An example of this package's utility. 072 * <EMBED CLASS='external-html' DATA-FILE-ID=EXAMPLE> 073 */ 074public class Example01 075{ 076 private Example01() { } 077 078 /** The URL that is being scraped in this example */ 079 protected static final String samAltmanURL = "https://en.wikipedia.org/wiki/Sam_Altman"; 080 081 protected static final ConnRecord connRec = new ConnRecord(); 082 083 static 084 { 085 connRec.setEventHandler(Printing::printObj); 086 connRec.setBrowserErrorHandler(Printing::printObj); 087 connRec.setRDPErrorHandler(Printing::printObj); 088 connRec.setRawTextReceiver(System.out); 089 connRec.setAppTextReceiver(System.out); 090 connRec.setErrTextReceiver(System.out); 091 } 092 093 094 // ******************************************************************************************** 095 // ******************************************************************************************** 096 // MAIN METHOD 097 // ******************************************************************************************** 098 // ******************************************************************************************** 099 100 101 /** This class is intended to be invoked from the Command Line. */ 102 public static void main(String[] argv) throws Exception 103 { 104 // Opening a WebSocket Browser-Connection to the currently running Chrome-Instance 105 final WebSocketSender bws = STEP_01_openBrowserWebSocket(); 106 107 // Close any currently opened pages / tabs inside the browser 108 STEP_02_closeAllPages(bws); 109 110 // Open a Browser-Page (using 'bws') for reading Sam Altman's Wikipedia Profile 111 final String targetID = STEP_03_openSamAltmanPage(bws); 112 113 // Create / Build a WebSocket-Connection object to the newly opened Sam Altman Page. 114 final WebSocketSender pws = STEP_04_getPageWebSocket(targetID); 115 116 // Execute some Java-Script so that the scrape code may run 117 final String html = STEP_05_runJavaScript(pws); 118 119 // Print the Image-URL's, retrieve those URL's too 120 final String[] imgURLs = STEP_06_extractImageURLs(html); 121 122 // Download the Images into a download folder 123 STEP_07_downloadImages(imgURLs); 124 125 bws.disconnect(); 126 pws.disconnect(); 127 } 128 129 130 // ******************************************************************************************** 131 // ******************************************************************************************** 132 // The Steps 133 // ******************************************************************************************** 134 // ******************************************************************************************** 135 136 137 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 138 // STEP-01: Opening a WebSocket Browser Connection 139 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 140 141 /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP01> */ 142 protected static WebSocketSender STEP_01_openBrowserWebSocket() throws Exception 143 { 144 Printing.notice("Opening a WebSocket Browser Connection..."); 145 146 final BrowserConn browserConn = BrowserConn.getBrowserConn(9222, false); 147 148 System.out.println( 149 '\n' + BCYAN + "Example01.java: " + RESET + 150 BRED + "Opened Browser Connection:\n" + RESET + browserConn.toString() 151 ); 152 153 final WebSocketSender bws = browserConn.createSender(Example01.connRec); 154 155 // Chat-GPT once suggested this line. I just haven't removed it. It's not hurting anyone! 156 Thread.sleep(1000); 157 158 return bws; 159 } 160 161 162 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 163 // STEP-02: Close all Opened Browser Tabs 164 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 165 166 /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP02> */ 167 protected static void STEP_02_closeAllPages(WebSocketSender bws) throws Exception 168 { 169 Printing.notice("Closing All Currently Open Pages, using BrowserConn"); 170 171 172 // This is currently unused. I used to filter for only the opened Wiki-Pages, but now this 173 // method simply closes every open page. No sense in deleting this line, though 174 175 final Predicate<Target.TargetInfo> isSamAltman = (Target.TargetInfo t) -> 176 t.type.equals("page") 177 && (t.url != null) 178 && (t.url.startsWith(samAltmanURL)); 179 180 System.out.println 181 ('\n' + BCYAN + "Example01.java: " + RESET + "Getting all tabs..."); 182 183 final Target.TargetInfo[] allTabs = Target 184 .getTargets(null /* FilterEntry[] */) 185 .exec(bws) 186 .await(); 187 188 System.out.println 189 ('\n' + BCYAN + "Example01.java: " + RESET + "Found " + allTabs.length + " tabs."); 190 191 if (allTabs.length > 0) 192 193 for (int i = 0; i < allTabs.length; i++) 194 { 195 final String tid = allTabs[i].targetId; 196 System.out.println(BRED + "Closing Tab: " + RESET + tid); 197 Target.closeTarget(tid).exec(bws).await(); 198 } 199 } 200 201 202 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 203 // STEP-03: Open a Web-Browser Page / Tab that is reading the Wiki for Open-AI's Sam Altman 204 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 205 206 /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP03> */ 207 protected static String STEP_03_openSamAltmanPage(final WebSocketSender bws) throws Exception 208 { 209 Printing.notice("Opening a Sam Altman Wikipedia Page, using BrowserConn."); 210 211 final String targetID = Target 212 .createTarget() 213 .accept("url", samAltmanURL) 214 .build() 215 .exec(bws) 216 .await(); 217 218 final Target.TargetInfo targetInfo = Target 219 .getTargetInfo(targetID) 220 .exec(bws) 221 .await(); 222 223 System.out.println( 224 '\n' + BCYAN + "Example01.java: " + RESET + 225 BRED + "Created New Tab:\n" + RESET + targetInfo.toString() 226 ); 227 228 229 // I leave these one second delays here. AGAIN - Chat-GPT suggested them to me once. 230 // Chat-GPT, in every sense of the word, knows more about my code than I do! (The CDP 231 // Protocol is a very well understood protocol - just not in Java so much) 232 233 Thread.sleep(1000); 234 235 return targetID; 236 } 237 238 239 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 240 // STEP-04: Create a "PageConn" Object to the recently opened Sam Altman Wikipedia Page 241 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 242 243 /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP04> */ 244 protected static WebSocketSender STEP_04_getPageWebSocket(final String targetID) 245 throws Exception 246 { 247 Printing.notice("Create PageConn Web-Socket Connection to Altman's Wiki"); 248 249 // Attach to that Sam Altman Page (switch to tab-level WebSocket) 250 final PageConn pageConn = PageConn 251 .getAllPageConn(9222, false) 252 .filter((PageConn pc) -> pc.id.equals(targetID)) 253 .findFirst() 254 .orElseThrow(() -> new RuntimeException("The Page-Connection was Not found !!!")); 255 256 System.out.println( 257 '\n' + BCYAN + "Example01.java: " + RESET + 258 BRED + "Found Page Connection to Sam Altman Wiki:\n" + RESET + pageConn.toString() 259 ); 260 261 final WebSocketSender pws = pageConn.createSender(Example01.connRec); 262 263 264 // I think this is the last one... Wait 1 second, it might make a difference while the 265 // page actually loads, and the Web-Socket connects... I have no idea! It's just 1 second! 266 267 Thread.sleep(1000); 268 269 return pws; 270 } 271 272 273 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 274 // STEP-05: Execute some Java-Script 275 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 276 277 /** <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP05> */ 278 protected static String STEP_05_runJavaScript(final WebSocketSender pws) throws Exception 279 { 280 Printing.notice("Execute the needed Java Script, so the Scraper can Run"); 281 282 // Enable the Page domain 283 System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "Page.enable()"); 284 Page.enable(null /* Boolean */).exec(pws).await(); 285 286 // Enable the DOM domain 287 System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "DOM.enable()"); 288 DOM.enable(null /* String */).exec(pws).await(); 289 290 // Enable the Runtime domain 291 System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "RunTime.enable()"); 292 RunTime.enable().exec(pws).await(); 293 294 // This is the actual last one. Make sure that the DOM & RunTime modules are running! 295 Thread.sleep(1000); 296 297 // 5. Evaluate the HTML via JavaScript 298 System.out.println('\n' + BCYAN + "Example01.java: " + RESET + "RunTime.evaluate()"); 299 300 final RunTime.evaluate$$RET r = RunTime 301 .evaluate() 302 .accept("expression", "document.documentElement.outerHTML") 303 .accept("returnByValue", true) 304 .build() 305 .exec(pws) 306 .await(); 307 308 System.out.println( 309 '\n' + BCYAN + "Example01.java: " + RESET + "Response RemoteObject:" + '\n' + 310 r.result.toString() 311 ); 312 313 final String html = ((JsonString) r.result.value).getString(); 314 315 return html; 316 } 317 318 319 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 320 // STEP-06: Print the Image-URL's 321 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 322 323 /** 324 * <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP06> 325 * @see HTMLPage#getPageTokens(CharSequence, boolean) 326 * @see TagNodeFind 327 * @see Attributes#retrieve(Vector, int[], String) 328 */ 329 protected static String[] STEP_06_extractImageURLs(final String html) throws Exception 330 { 331 Printing.notice("Parsing HTML for Images Printing the URL's"); 332 333 final Vector<HTMLNode> altPage = HTMLPage.getPageTokens(html, false); 334 final int[] images = TagNodeFind.all(altPage, TC.OpeningTags, "img"); 335 final String[] imgURLs = Attributes.retrieve(altPage, images, "src"); 336 final int numImg = imgURLs.length; 337 338 System.out.println 339 ('\n' + BCYAN + "Example01.java: " + RESET + "Number of Images Found: " + numImg); 340 341 for (int i = 0; i < numImg; i++) System.out.println(" " + imgURLs[i]); 342 343 return imgURLs; 344 } 345 346 347 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 348 // STEP-07: Download the Image's into a folder 349 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 350 351 /** 352 * <EMBED CLASS='external-html' DATA-FILE-ID=EX01_STEP07> 353 * @see ImageScraper#download(Request, Appendable) 354 * @see Request 355 * @see Results 356 * @see ImageScraper#shutdownTOThreads() 357 */ 358 protected static void STEP_07_downloadImages(final String[] imageURLs) throws Exception 359 { 360 Printing.notice("Download the Image's into a folder"); 361 362 final Stream.Builder<String> builder = Stream.builder(); 363 364 for (int i = 0; i < imageURLs.length; i++) 365 if (imageURLs[i].startsWith("//")) 366 builder.accept("https:" + imageURLs[i]); 367 368 // Build a Request-Object 369 final List<String> imgURLsList = builder.build().collect(Collectors.toList()); 370 final Request req = Request.buildFromStrIter(imgURLsList); 371 372 // Add a few more Scraper-Configurations to the Request Object 373 req.targetDirectory = "image-downloads/"; 374 req.useDefaultCounterForImageFileNames = true; 375 req.skipOnDownloadException = true; 376 req.verbosity = Verbosity.Normal; 377 378 try 379 // Run the scraper, Send all Text-Output to 'System.out' (Ignore / Discard Results) 380 { final Results results = ImageScraper.download(req, System.out); } 381 382 catch (Exception e) 383 { System.out.println(EXCC.toString(e)); } 384 385 finally 386 // This needs to happen, or this entire program will hang / lock up the terminal 387 { ImageScraper.shutdownTOThreads(); } 388 } 389}