package Torello.HTML.Tools.NewsSite;

import java.util.*;
import java.io.*;
import java.util.stream.*;
import java.net.*;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import static Torello.Java.C.*;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

/**
 * Collects all <B>news-article {@code URL's}</B> from a news-oriented web-site's main web-page
 * and from its list of 'sub-section' web-pages.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    private ScrapeURLs() { }

    /**
     * This is a {@code static boolean} configuration field.  When this is set to {@code TRUE},
     * if one of the {@code "Section URL's"} provided to this class is not valid, and generates
     * a {@code 404 FileNotFoundException}, or some other {@code HttpConnection} exception,
     * those exceptions will simply be logged, and quietly ignored.
     *
     * <BR /><BR />When this {@code flag} is set to {@code FALSE}, any problem that occurs
     * while attempting to pick out News Article {@code URL's} from a {@code Section Web-Page}
     * will cause a {@code SectionURLException} to throw, and the {@code ScrapeURLs} process
     * will halt.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">SIMPLY PUT:</SPAN></B> There are occasions when
     * a news web-site will remove a section such as "Commerce", "Sports", or "Travel" - and
     * when or if one of these suddenly goes missing, it is better to skip that section than to
     * halt the entire scrape.  For that reason, keep this {@code flag} set to {@code TRUE}.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">ALSO:</SPAN></B> This is, indeed, a
     * {@code public} and {@code static flag} (field), which means that all processes
     * ({@code Thread's}) using {@code class ScrapeURLs} must share the same setting
     * (simultaneously).  This particular {@code flag} <I><B>CANNOT</B> be changed in a
     * {@code Thread-Safe} manner.</I>
     */
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }

    /**
     * This method is used to retrieve <B>all</B> of the available article {@code URL} links
     * found on <B>all sections</B> of a newspaper website.
     *
     * @param sectionURLs This should be a {@code Vector} of {@code URL's} containing all of
     * the "Main News-Paper Page Sections."  Typical NewsPaper Sections are things like: Life,
     * Sports, Business, World, Economy, Arts, etc...  This parameter may not be null, or a
     * {@code NullPointerException} will throw.
     *
     * @param articleURLFilter If there is a standard pattern for a {@code URL} that must be
     * avoided, then this filter parameter should be used.  This parameter may be null, and if
     * it is, it shall be ignored.  This Java {@code URL-Predicate} (an instance of
     * {@code Predicate<URL>}) should return {@code TRUE} if a particular {@code URL} needs to
     * be <B>kept</B>, not <B>filtered</B>.
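     *
     * <BR /><BR />For instance - as a sketch only, and assuming {@code URLFilter} may be
     * written as a lambda - a filter that keeps just those links whose path contains a
     * purely hypothetical article-marker would look like:
     *
     * <BR /><BR /><PRE>{@code URLFilter filter = (URL url) -> url.getPath().contains("/article/");}</PRE>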
     * <BR /><BR />When this {@code Predicate} evaluates to {@code FALSE} - <I>the {@code URL}
     * will be filtered</I>.
     *
     * <BR /><BR /><B>NOTE:</B> This behavior is identical to the Java {@code Stream} method
     * {@code filter(Predicate)}.
     *
     * <BR /><BR /><B>ALSO:</B> {@code URL's} that are filtered will neither be scraped nor
     * saved into the newspaper article result-set output file.
     *
     * @param linksGetter This instance may be used to retrieve all links on a particular
     * section-page.  This parameter may be null.  If it is null, it will be ignored - <I>and
     * all HTML Anchor ({@code <A HREF=...>}) links will be considered "Newspaper Articles to
     * be scraped."</I>  Be careful about ignoring this parameter, because there may be many
     * extraneous non-news-article links on a particular Internet News WebSite or inside a
     * Web-Page Section.
     *
     * @param log Log information is appended to this parameter as the scrape proceeds.  This
     * parameter may not be null, or a {@code NullPointerException} will throw.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return The "{@code Vector} of {@code Vector's}" that is returned is simply a list of
     * all newspaper anchor-link {@code URL's} found on each Newspaper Sub-Section {@code URL}
     * passed to the {@code 'sectionURLs'} parameter.  The returned "{@code Vector} of
     * {@code Vector's}" is parallel to the input-parameter {@code Vector<URL> Section-URL's}.
     *
     * <BR /><BR />What this means is that the Newspaper-Article {@code URL}-Links scraped from
     * the page located at {@code sectionURLs.elementAt(0)} - will be stored in the
     * return-{@code Vector} at {@code ret.elementAt(0)}.
     *
     * <BR /><BR />The article {@code URL's} scraped off of page {@code URL} from
     * {@code sectionURLs.elementAt(1)} will be stored in the return-{@code Vector} at
     * {@code ret.elementAt(1)}.  <I><B>And so on, and so forth...</B></I>
     *
     * @throws SectionURLException If one of the provided {@code sectionURL's} (Life, Sports,
     * Travel, etc...) is not valid, or is no longer available, then this exception will throw.
     * Note, however, that there is a {@code flag} ({@link #SKIP_ON_SECTION_URL_EXCEPTION})
     * that will force this method to simply "skip" a faulty or non-available
     * {@code Section URL}, and move on to the next news-article section.
     *
     * <BR /><BR />By default, this {@code flag} is set to {@code TRUE}, meaning that this
     * method will skip news-paper sections that have been temporarily removed rather than
     * causing the method to exit.  This default behavior can be changed by setting the
     * {@code flag} to {@code FALSE}.
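     *
     * <BR /><BR />Below is a minimal invocation sketch.  The section {@code URL's} used here
     * are hypothetical, and {@code System.out} (a {@code PrintStream}, which implements
     * {@code java.lang.Appendable}) serves as the log.  Assume a surrounding method that
     * declares {@code throws Exception}:
     *
     * <BR /><BR /><PRE>{@code
     * Vector<URL> sections = new Vector<>();
     * sections.add(new URL("https://news.example.com/sports/"));
     * sections.add(new URL("https://news.example.com/world/"));
     * 
     * // Null filter & null LinksGet: every <A HREF=...> anchor-link found is kept
     * Vector<Vector<String>> articleURLs = ScrapeURLs.get(sections, null, null, System.out);
     * 
     * // Parallel Vector's: articleURLs.elementAt(0) holds the links scraped from 'sports'
     * }</PRE>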
     */
    public static Vector<Vector<String>> get(
        Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
        Appendable log
    )
    {
        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // It helps to run this, because web-pages can allocate a lot of Strings
            System.gc();

            // Start scraping the section for URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, scrape & parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);

                // If the 'LinksGet' instance is null, then select all URL's on the main-page /
                // section-page, and pray for rain (hope for the best).  If no 'LinksGet'
                // instance was provided, there will likely be many spurious / irrelevant links
                // to non-article pages, and even advertisement pages, that are also included
                // in this Stream<String>.
                //
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>,
                // where each 'String' in the 'Stream' is the HREF attribute of the
                // <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }
            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");
                    continue;
                }
                else
                {
                    LOG_WRITE(
                        log,
                        BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                        RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                    );

                    throw new SectionURLException
                        ("Invalid Main Section URL: " + sectionURL.toString(), e);
                }
            }

            Vector<String> sectionArticleURLs = urlStream

                // If any TagNode's did not have HREF-Attributes, remove those null-values
                .filter ((String href) -> (href != null))

                // Perform a standard String.trim() operation.
                .map    ((String href) -> href.trim())

                // Any HREF's that are "just white-space" are now removed.
                .filter ((String href) -> href.length() > 0)

                // This removes any HREF Attribute-values that begin with
                // "mailto:" "tel:" "javascript:" "magnet:" etc...
                .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

                // Now, resolve any "Partial URL References"
                .map    ((String href) -> Links.resolve_KE(href, sectionURL))

                // If there were any exceptions while performing the Partial-URL
                // Resolve-Operation, then print an error message.
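                //
                // NOTE (Ret2 fields): 'r2.a' holds the resolved URL, and 'r2.b' holds any
                // MalformedURLException.  On a failed resolve, 'a' is null and 'b' is non-null.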
                .peek   ((Ret2<URL, MalformedURLException> r2) ->
                {
                    if (r2.b != null) LOG_WRITE(
                        log,
                        "Scraped HREF did not resolve to a valid URL, exception message:\n" +
                        r2.b.getMessage() + '\n'
                    );
                })

                // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
                .map    ((Ret2<URL, MalformedURLException> r2) -> r2.a)

                // If there was an exception, the Ret2.a field would be null (remove nulls)
                .filter ((URL url) -> url != null)

                // NOTE: When this evaluates to TRUE - the element should be kept.
                // Java Stream's 'filter' method states that when the predicate evaluates to
                // TRUE, the stream element is KEPT / RETAINED.
                //
                // Class URLFilter mimics the filter behavior of Stream.filter(...)

                .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))

                // Convert these to "Standard Strings":
                // case-insensitive parts are set to lower-case,
                // case-sensitive parts are left alone.

                .map    ((URL url) -> URLs.urlToString(url))

                // Remove duplicates.  (The case-normalization above is what allows URL's that
                // differ only in their case-insensitive parts to compare as equal here.)

                .distinct()

                // Sanity-check that each String still parses as a valid URL.  There really
                // should not be any exceptions here - this is just an "extra-careful" step,
                // and is not strictly needed.

                .filter ((String url) ->
                { try { new URL(url); return true; } catch (Exception e) { return false; } })

                // Convert the Stream to a Vector
                .collect(Collectors.toCollection(Vector::new));

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }

        // Provide a simple count to the log output of how many URL's have been uncovered.
        // NOTE: This does not check whether different sections contain non-distinct URL's.
        // (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // The <?> wild-card prevents "-Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible Article URL list has: " +
            BYELLOW + StringParse.zeroPad10e4(totalURLs) + RESET + ' ' +
            "url(s).\n\n"
        );

        return ret;
    }

    // This is necessary because Java's 'java.lang.Appendable.append' permits an IOException
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
        { log.append(s); }

        catch (Exception e)
        {
            System.out.println(
                "While trying to write to the log, an exception occurred.\n" +
                "The java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(e.getMessage(), 4) + '\n' +
                "Unfortunately, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}
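
/*
 * Usage note: java.lang.StringBuilder implements Appendable, and StringBuilder.append never
 * throws an IOException - so it makes a log-target that cannot trip the fatal-exit branch in
 * LOG_WRITE above.  A minimal sketch, assuming a (hypothetical) NewsSite instance 'site', and
 * a surrounding method that declares 'throws IOException':
 *
 *     StringBuilder log = new StringBuilder();
 *     Vector<Vector<String>> articleURLs = ScrapeURLs.get(site, log);
 *     System.out.print(log);
 */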