001package Torello.HTML.Tools.NewsSite; 002 003import Torello.Java.*; 004 005import Torello.HTML.URLFilter; 006import Torello.Java.Additional.RemoveUnsupportedIterator; 007import Torello.Languages.LC; 008 009import java.util.*; 010import java.net.*; 011 012/** 013 * The 'data flow' encapsulation class that contains most of the salient features of a news 014 * oriented web-site. 015 * 016 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITE> 017 */ 018public class NewsSite implements java.io.Serializable 019{ 020 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */ 021 public static final long serialVersionUID = 1; 022 023 /** A Simple Name for the news-site */ 024 public final String siteName; 025 026 /** Country of origin for the news-site in question */ 027 public final Country country; 028 029 /** {@code URL} of the main-page for the news web-site */ 030 public final URL siteURL; 031 032 /** A Language Code instance for the web-site, if needed. */ 033 public final LC languageCode; 034 035 /** A simple text description of the news web-site */ 036 public final String description; 037 038 /** 039 * Should contain the complete list of news-section {@code URL's} 040 * @see ScrapeURLs#get​(Vector, URLFilter, LinksGet, Appendable) 041 */ 042 private final Vector<URL> sectionURLs; 043 044 /** 045 * An instance of {@link URLFilter} for filtering out certain {@code URL's} from the list of 046 * scraped article-{@code URL's}. 047 * @see ScrapeURLs 048 */ 049 public final URLFilter filter; 050 051 /** 052 * An instance of {@code LinksGet} for retrieving Article-{@code URL} links from a section 053 * page 054 * @see ScrapeURLs 055 */ 056 public final LinksGet linksGetter; 057 058 /** 059 * An instance of {@code ArticleGet} used to retrieve news-articles from this site. 060 * @see ScrapeArticles 061 */ 062 public final ArticleGet articleGetter; 063 064 /** 065 * An instance of {@code StrFilter} for finding banner's or ad's 066 * @see ScrapeArticles 067 */ 068 public final StrFilter bannerAndAddFinder; 069 070 /** 071 * Convenience Constructor. 072 * <BR />May pass a {@link StrFilter} to the {@link URLFilter} parameter instead. 073 * <BR />Invokes: {@link #NewsSite(String, Country, String, LC, String, Vector, URLFilter, 074 * LinksGet, ArticleGet, StrFilter)} 075 */ 076 public NewsSite( 077 String siteName, 078 Country country, 079 String siteURLAsStr, 080 LC languageCode, 081 String description, 082 Vector<URL> sectionURLs, 083 StrFilter filter, 084 LinksGet linksGetter, 085 ArticleGet articleGetter, 086 StrFilter bannerAndAddFinder 087 ) 088 { 089 this( 090 siteName, country, siteURLAsStr, languageCode, description, sectionURLs, 091 URLFilter.fromStrFilter(filter), linksGetter, articleGetter, bannerAndAddFinder 092 ); 093 } 094 095 /** 096 * Simple constructor for this data-class. 097 * 098 * @param siteName This site's name 099 * @param country The <I>country-of-origin</I> for this news web-site. 100 * @param siteURLAsStr The primary {@code URL} for the news web-site. 101 * 102 * @param languageCode If this site uses a non-English system, the {@code 'languageCode'} 103 * parameter can keep track of the language. 104 * 105 * @param description Brief Description of the site. 106 * 107 * @param sectionURLs This should list the primary news-sections on the web-site. News 108 * sections include lists such as "Life", "Health", "Business", "World News", "Sports" - but 109 * this list could actually include just about anything. 110 * 111 * @param filter If, when scraping a section, there are {@code URL's} that need to be filtered, 112 * this parameter can help filtering non-Article, non-news links. As explained in the 113 * {@code class ScrapeURL's}, this is often a simple one-lined lambda-expression that 114 * identifies which {@code URL's} match a Regular-Expression {@code Pattern}. 115 * 116 * @param linksGetter This is a 'getter', which also is often just a one line 117 * regular-expression lambda for retrieving the links from a section web-page. 118 * 119 * @param articleGetter This should implement the {@code ArticleGet} interface. 120 * @param bannerAndAddFinder Filter for finding repetitive ads or banners. 121 */ 122 @SuppressWarnings("unchecked") 123 public NewsSite( 124 String siteName, 125 Country country, 126 String siteURLAsStr, 127 LC languageCode, 128 String description, 129 Vector<URL> sectionURLs, 130 URLFilter filter, 131 LinksGet linksGetter, 132 ArticleGet articleGetter, 133 StrFilter bannerAndAddFinder 134 ) 135 { 136 this.siteName = siteName; 137 this.country = country; 138 this.languageCode = languageCode; 139 this.description = description; 140 this.sectionURLs = (Vector<URL>) sectionURLs.clone(); 141 this.filter = filter; 142 this.linksGetter = linksGetter; 143 this.articleGetter = articleGetter; 144 this.bannerAndAddFinder = bannerAndAddFinder; 145 146 try 147 { this.siteURL = new URL(siteURLAsStr); } 148 149 catch (MalformedURLException e) 150 { 151 throw new NewsSiteException( 152 "Unable to instantiate the parameter 'siteURLAsStr'. There was a Malformed URL " + 153 "Exception thrown. Please see this Exceptions Throwable.getCause() for more " + 154 "details.", e 155 ); 156 } 157 } 158 159 /** 160 * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this 161 * news-site 162 * 163 * @return An {@code Iterator<URL>} of the different sections for a particular news-site. 164 */ 165 public Iterator<URL> sectionURLsIter() 166 { return new RemoveUnsupportedIterator<URL>(sectionURLs.iterator()); } 167 168 /** 169 * Retrieves the <B>Section URL's</B> (life, comedy, sports, business, world) for this 170 * news-site 171 * 172 * @return A {@code Vector<URL>} of the different sections for a particular news-site. 173 */ 174 @SuppressWarnings("unchecked") 175 public Vector<URL> sectionURLsVec() 176 { return (Vector<URL>) sectionURLs.clone(); } 177}