package Torello.HTML.Tools.SearchEngines;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import Torello.Java.Shell.C;
import Torello.Java.Additional.Ret2;

import java.util.*;
import java.util.stream.*;
import java.io.IOException;
import java.net.URL;

import Torello.HTML.Tools.JavaDoc.StaticFunctional;
import Torello.HTML.Tools.JavaDoc.Excuse;
/**
 * BaiDuQuery (百度搜索) - Example class that makes an HTTP connection (rather than a REST
 * invocation) to the Search-Engine.
 *
 * <BR /><BR />
 * <EMBED CLASS="external-html" DATA-FILE-ID="百度搜索">
 * <EMBED CLASS="external-html" DATA-FILE-ID="USESPLASH">
 */
@StaticFunctional(Excused="SPLASH_URL", Excuses=Excuse.CONFIGURATION)
public class BaiDuQuery
{
    private BaiDuQuery() { }

    // Characters that must be percent-escaped when they occur inside the query-string
    // portion of a 百度 Search URL.
    final static char[] URL_ESC_CHARS =
    {
        '%', '#', '$', '&', '@', '`', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']',
        '^', '{', '|', '}', '~', '\'', ',', '(', ')'
    };

    /**
     * Using "Splash" is very simple: start the server, and prepend this {@code String} to
     * all {@code URL}'s passed to this class:
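     *
     * <BR /><BR />For example, a minimal sketch (this assumes a local Splash server is
     * already listening on port 8050):
     *
     * <DIV CLASS="SNIP">{@code
     * // Prepend SPLASH_URL so the page is rendered by Splash before it is scraped
     * URL query = new URL(BaiDuQuery.SPLASH_URL + "https://www.baidu.com/s?wd=Java");
     * }</DIV>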
     */
    public static String SPLASH_URL = "http://localhost:8050/render.html?url=";

    /**
     * This class is returned as a result of performing a search on 百度搜索.
     */
    public static class SearchResult implements java.io.Serializable
    {
        /** <EMBED CLASS="external-html" DATA-FILE-ID="SVUIDFI"> */
        public static final long serialVersionUID = 1;

        /** This is the link of {@code 'this'} search result. */
        public final String url;

        /**
         * This is the "Anchor Text" that 百度 provided for this link.  To be specific, it is
         * the text between the {@code <A HREF=url>} and the {@code </A>}.
         */
        public final String title;

        /**
         * Usually, this array will be null.  Note, <B><I>null will be used in place of a zero
         * length array</I></B>.  For some sites, a search-result will have multiple, or even
         * numerous, matches for a given search-{@code String}.  When a search using 百度
         * provides more than one result for a site, the list of 'Additional Matches' is
         * scraped and put into this array.
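         *
         * <BR /><BR />A minimal sketch of reading this field (remember to check for null):
         *
         * <DIV CLASS="SNIP">{@code
         * if (sr.subResults != null)
         *     for (SearchResult sub : sr.subResults)
         *         System.out.println(sub.title + '\t' + sub.url);
         * }</DIV>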
         */
        public final SearchResult[] subResults;

        SearchResult(String url, String title, SearchResult[] subResults)
        {
            this.url        = url;
            this.title      = title;
            this.subResults = subResults;
        }

        SearchResult(String url, String title)
        { this(url, title, null); }

        /**
         * Converts this search result to a simple {@code String}.
         * @return A {@code String} version of this Search Result.
         */
        public String toString()
        {
            if (subResults == null) return title + '\n' + url + '\n';

            StringBuilder sb = new StringBuilder();

            for (SearchResult sr : subResults)
                sb.append('\t' + sr.title + "\n\t" + sr.url + '\n');

            return title + '\n' + url + '\n' + sb.toString();
        }
    }

    /**
     * This class may be invoked from the Command Line.  The arguments passed to this class
     * will be sent to the {@link #query(Appendable, String[])} method.  The results will be
     * printed to the terminal.
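     *
     * <BR /><BR />For example, a (hypothetical) invocation - assuming this class is on the
     * CLASSPATH, and that a local Splash server is running:
     *
     * <DIV CLASS="SNIP">{@code
     * java Torello.HTML.Tools.SearchEngines.BaiDuQuery "Java Programming Examples"
     * }</DIV>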
     * @throws IOException If any I/O problems occur while scraping the site.
     */
    public static void main(String[] argv) throws IOException
    { query(System.out, argv); }

    /**
     * This will poll the nearest <B STYLE="color: red;">百度.com Web Server</B> for the results
     * of a search.
     * <EMBED CLASS="external-html" DATA-FILE-ID="MUSTSPLASH">
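     *
     * <BR /><BR />A minimal usage sketch (this assumes a local Splash server is running):
     *
     * <DIV CLASS="SNIP">{@code
     * Ret2<SearchResult[], URL[]> res = BaiDuQuery.query(System.out, "Java", "Programming");
     *
     * SearchResult[]  results   = res.a;  // the scraped search-results
     * URL[]           nextPages = res.b;  // URL's for subsequent pages of results
     * }</DIV>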
     * @param log This is the log parameter.  If this parameter is null, it shall be ignored.
     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
     * @param argv This should be the list of keywords that would be typed into a
     * {@code 百度 Search Bar}.  Spaces are permitted, so the entire search-{@code String}
     * may be passed as a single {@code String}, or it may be broken up into tokens and passed
     * individually here.  Either way, the generated Query-{@code String} is identical.
     * @return <EMBED CLASS="external-html" DATA-FILE-ID="SERET2">
     * @throws IOException If any I/O problems occur while scraping the site.
     * @see StrReplace#r(String, char[], IntCharFunction)
     */
119     */
120    public static Ret2<SearchResult[], URL[]> query(Appendable log, String... argv)
121        throws IOException
122    {
123        StringBuilder queryBuilder = new StringBuilder();
124
125        for (int i=0; i < argv.length; i++)
126        {
127            String temp = argv[i].replace("+", "%2B").replace(" ", "+");
128
129            temp = StrReplace.r
130                (temp, URL_ESC_CHARS, (int t, char c) -> '%' + Integer.toHexString((int) c));
131
132            queryBuilder.append(temp);
133            if (i < (argv.length -1)) queryBuilder.append('+');
134        }
135
136        String queryStr = queryBuilder.toString();
137
138        if (log != null) log.append("Query String:\n" + C.BYELLOW + queryStr + C.RESET + '\n');
139
140        return query(log, new URL("https://www.baidu.com/s?wd=" + queryStr));
141    }
142
    /**
     * This will poll the nearest <B STYLE="color: red;">百度 Web Server</B> for the results
     * of a search - given a provided {@code URL}.  The {@code URL} provided to this method
     * ought to be one of the {@code URL's} retrieved from the "next" button - <I>as was returned
     * by a previous search engine query</I> (the {@code Ret2.b} list of {@code URL's}).
     *
     * <EMBED CLASS="external-html" DATA-FILE-ID="MUSTSPLASH">
     *
     * <BR /><BR />Here is the "core HTML retrieval operation" for a <B>{@code BaiDu.com Search
     * Bar Result}</B>:
     *
     * <DIV CLASS="SNIP">{@code
     * // Create a "百度 Results Iterator" - each result is wrapped in an HTML Element
     * // that looks like: <DIV CLASS="result"> ... </DIV>  (or CLASS="result-op")
     * HNLIInclusive resultsIter = InnerTagInclusiveIterator.get
     *      (v, "div", "class", TextComparitor.C_OR, "result", "result-op");
     *
     * while (resultsIter.hasNext())
     * {
     *      // Retrieve the next <DIV CLASS="result"> ... </DIV> sub-list
     *      Vector<HTMLNode>  result    = resultsIter.next();
     *
     *      // The first anchor <A HREF=...> will contain the link for this search-result.
     *      Vector<HTMLNode>  firstLink = TagNodeGetInclusive.first(result, "a");
     *
     *      // Here is how the URL and Anchor-Text are collected
     *      String            url       = ((TagNode) firstLink.elementAt(0)).AV("href").trim();
     *      String            title     = Util.textNodesString(firstLink).trim();
     * }
     * }</DIV>
     *
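     * <BR /><BR />A minimal paging sketch - the {@code URL}-array returned by one query feeds
     * the next:
     *
     * <DIV CLASS="SNIP">{@code
     * Ret2<SearchResult[], URL[]> page1 = BaiDuQuery.query(null, "Java");
     *
     * // Retrieve the second page of results, if one was offered
     * Ret2<SearchResult[], URL[]> page2 = (page1.b.length > 0)
     *     ? BaiDuQuery.query(null, page1.b[0])
     *     : null;
     * }</DIV>
     *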
     * @param log This is the log parameter.  If this parameter is null, it shall be ignored.
     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
     *
     * @param query This may be a query {@code URL} that has been prepared, by 百度, to
     * be used for the "next 10 results" of a particular search.
     *
     * <BR /><BR /><B>Specifically:</B> This {@code URL} should have been retrieved from a
     * previous search-results page, and was listed as containing additional (next 10 matches)
     * links.
     *
     * @return <EMBED CLASS="external-html" DATA-FILE-ID="SERET2">
     * @throws IOException If any I/O problems occur while scraping the site.
     */
    public static Ret2<SearchResult[], URL[]> query(Appendable log, URL query)
        throws IOException
    {
        // Use a java Stream.Builder to save the results to a Java Stream.
        // Streams are easily converted to arrays.
        Stream.Builder<SearchResult> resultsBuilder = Stream.builder();

        URL splashQuery = new URL(SPLASH_URL + query.toString());

        // Download the HTML, and save it to a java.util.Vector (like an array)
        Vector<HTMLNode> v = HTMLPage.getPageTokens(splashQuery, false, "out.html", null, null);

        // Create a "百度 Results Iterator" - each result is wrapped in an HTML Element
        // that looks like: <DIV CLASS="result"> ... </DIV>  (or CLASS="result-op")
        HNLIInclusive resultsIter = InnerTagInclusiveIterator.get
            (v, "div", "class", TextComparitor.C_OR, "result", "result-op");

        while (resultsIter.hasNext())
        {
            // Get the <DIV CLASS="result"> ... </DIV> contents.
            Vector<HTMLNode>    result          = resultsIter.next();

            // The first anchor <A HREF=...> will contain the link for this search-result.
            Vector<HTMLNode>    firstLink       = TagNodeGetInclusive.first(result, "a");

            String url      = ((TagNode) firstLink.elementAt(0)).AV("href").trim();
            String title    = Util.textNodesString(firstLink).trim();

            // Save the results in a Java Stream, using Stream.Builder.
            Stream.Builder<SearchResult> subResultsBuilder = Stream.builder();

            // To get the list of search-result sub-links, retrieve all elements that are
            // labelled <DIV CLASS="c-row"> ... </DIV>
            HNLIInclusive subLinksIter = InnerTagInclusiveIterator.get
                (result, "div", "class", TextComparitor.C, "c-row");

            // Iterate through any "Sub Links."  Again, a "Sub Link" is hereby defined as a
            // search result for a particular web-site that is able to produce many / numerous
            // additional links.  Often, these additional links are more useful than the
            // primary link that was returned.
            while (subLinksIter.hasNext())
            {
                Vector<HTMLNode>    div             = subLinksIter.next();

                // The link / search-result itself is the first HTML Anchor Element
                // (<A HREF=...> ... </A>)
                DotPair             subLink         = TagNodeFindInclusive.first(div, "A");

                if (subLink == null) continue;

                // Get the URL
                String subLinkURL = ((TagNode) div.elementAt(subLink.start)).AV("href").trim();

                // The first URL returned is just the one we have already retrieved.
                if (subLinkURL.equalsIgnoreCase(url)) continue;

                // Get the text that is wrapped inside the <A HREF=..> "this-text" </A>
                // HTML Element.  Util.textNodesString(...) simply removes all TagNodes, and
                // appends the TextNodes together.
                String subLinkTitle = Util.textNodesString(div, subLink).trim();

                subResultsBuilder.accept(new SearchResult(subLinkURL, subLinkTitle));
            }

            // Use Java Streams to build the SearchResult[] Array.  Call the
            // Stream.Builder.build() method, and then call the Stream.toArray(...) method.
            SearchResult[] subResults = subResultsBuilder.build().toArray(SearchResult[]::new);

            SearchResult sr = new SearchResult
                (url, title, (subResults.length > 0) ? subResults : null);

            resultsBuilder.accept(sr);
        }

        // Use java's Stream.Builder.build() to create the Stream, then easily convert
        // to an array.
        SearchResult[] srArr = resultsBuilder.build().toArray(SearchResult[]::new);

        // If the log is not null, print out the results.
        if (log != null)
            for (SearchResult sr : srArr) log.append(sr.toString() + '\n');

        // IMPORTANT NOTE:  This code retrieves the URL's for the next (up to 10) PAGES of
        // SEARCH RESULTS.
        DotPair nextResultsDIV = InnerTagFindInclusive.first
            (v, "div", "id", TextComparitor.EQ, "page");

        // If the pagination <DIV ID="page"> was not found, there are no further pages to offer.
        if (nextResultsDIV == null)
            return new Ret2<SearchResult[], URL[]>(srArr, new URL[0]);

        // Use these criteria-specifiers to find the NEXT-PAGE '<A HREF...>' links inside the
        // SEARCH-RESULTS page...
        Vector<DotPair> nextPages = InnerTagFindInclusive.all
            (v, nextResultsDIV.start, nextResultsDIV.end, "a", "href");

        URL[] urlArr = new URL[nextPages.size()];

        // Build the URL for each of the next pages.  A programmer may expand these results
        // by retrieving more of these links in a loop.
        for (int i=0; i < nextPages.size(); i++)
        {
            DotPair link = nextPages.elementAt(i);
            String  href = ((TagNode) v.elementAt(link.start)).AV("href");

            urlArr[i] = Links.resolve(href, query);
        }

        return new Ret2<SearchResult[], URL[]>(srArr, urlArr);
    }
}