Translate.java.html

package Torello.Languages;

import java.util.*;
import java.io.*;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.*;

import Torello.Java.Additional.URLs;

/**
 * Translate Mandarin Chinese (普通话, Simplified &amp; Traditional Chinese) Characters.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=T>
 */
@Torello.JavaDoc.StaticFunctional
public class Translate
{
    private Translate() { }

    /** Query address of the Google Translate page that every scrape in this class requests. */
    private static final String GT_QUERY_URL = "https://translate.google.com/?q=";

    /** Upper bound on the number of scrape attempts made per sentence-piece in sentenceZH. */
    private static final int MAX_SCRAPE_ATTEMPTS = 6;

    /**
     * Runs {@link #block(String, Vector, Vector, Appendable)} once for each input sentence, and
     * appends that sentence's results onto the end of each of the four output vectors.  After
     * this method returns, each output vector has grown by exactly one element per input
     * sentence, in input order.
     * 
     * @param simpSentencesIN The sentences to translate.  Each one is passed, unmodified, to
     * {@code block}.
     * 
     * @param sentencesOUT Receives, per input sentence, the sentence-vector filled in by
     * {@code block}.
     * 
     * @param wordTablesOUT Receives, per input sentence, the word-table filled in by
     * {@code block}.
     * 
     * @param DOUTArr Receives, per input sentence, the text that was appended to the
     * {@code Appendable} handed to {@code block}.
     * 
     * @param DOUTErrorBoolArr Receives, per input sentence, the boolean returned by
     * {@code block}.
     * 
     * @throws IOException Propagated from {@code block}.
     */
    public static void article(
            Vector<String>                  simpSentencesIN,
            Vector<Vector<String>>          sentencesOUT,
            Vector<Vector<Vector<String>>>  wordTablesOUT,
            Vector<String>                  DOUTArr,
            Vector<Boolean>                 DOUTErrorBoolArr
        )
        throws IOException
    {
        for (String simpSentence : simpSentencesIN)
        {
            // Fresh containers per sentence, so results of one sentence never mix with another
            Vector<String>          sentences   = new Vector<>();
            Vector<Vector<String>>  wordTable   = new Vector<>();
            StringBuilder           DOUT        = new StringBuilder();
            boolean                 error       = block(simpSentence, sentences, wordTable, DOUT);

            sentencesOUT.add(sentences);
            wordTablesOUT.add(wordTable);
            DOUTArr.add(DOUT.toString());
            DOUTErrorBoolArr.add(Boolean.valueOf(error));
        }
    }

    /**
     * Translates one sentence: scrapes its pronunciation and English translation using
     * {@link #sentenceZH(String)}, then splits the sentence and its pronunciation into parallel
     * word lists using {@code PinYinParse.parse}.
     * 
     * @param simpSentenceIN The sentence to translate.
     * 
     * @param sentencesOUT This vector is <B>cleared</B>, and then filled with exactly three
     * elements: the input sentence, its pronunciation, and its English translation.
     * 
     * @param wordTableOUT One row is <B>appended</B> for each parsed word.  Each row has four
     * elements: the word, its pronunciation, and two empty-string placeholders (the dictionary
     * lookups that would fill them are currently disabled).
     * <BR />NOTE(review): unlike {@code sentencesOUT}, this vector is not cleared first - confirm
     * whether accumulating rows across calls is intended.
     * 
     * @param DOUT Passed through to {@code PinYinParse.parse}, which may write output to it.
     * 
     * @return The boolean returned by {@code PinYinParse.parse}.  NOTE(review): presumably
     * {@code TRUE} signals a parse-error - verify against {@code PinYinParse}.
     * 
     * @throws IllegalStateException If the parser produces word and pronunciation lists whose
     * lengths differ.
     * 
     * @throws IOException The interface java.lang.Appendable mandates that the IOException must be
     * treated as a checked exception for all output operations.  Therefore IOException is a
     * required exception in this method's throws clause.
     */
    public static boolean block(
            String                  simpSentenceIN,
            Vector<String>          sentencesOUT,
            Vector<Vector<String>>  wordTableOUT,
            Appendable              DOUT
        )
        throws IOException
    {
        String[]    gtScrape        = sentenceZH(simpSentenceIN);
        String      pronSentence    = gtScrape[0];
        String      englSentence    = gtScrape[1];

        sentencesOUT.removeAllElements();
        sentencesOUT.add(simpSentenceIN);
        sentencesOUT.add(pronSentence);
        sentencesOUT.add(englSentence);

        Vector<String>  simpWords   = new Vector<>();
        Vector<String>  pronWords   = new Vector<>();
        boolean         errorParse  = PinYinParse.parse
                            (DOUT, simpSentenceIN, pronSentence, simpWords, pronWords);

        // The two lists are read in lock-step below, so they must line up one-to-one.
        if (pronWords.size() != simpWords.size()) throw new IllegalStateException(
            "The pronunciation and the character vector's should be the exact same length.\n" +
            "pronWords.size() = " + pronWords.size() + " and simpWords.size() = " + 
            simpWords.size()
        );

        int len = pronWords.size();
        for (int i=0; i < len; i++)
        {
            Vector<String> vocabEntryRow = new Vector<>();

            vocabEntryRow.add(simpWords.elementAt(i));
            vocabEntryRow.add(pronWords.elementAt(i));
            vocabEntryRow.add(""); // Placeholder for: Dictionary.lookupTrad(simp, pron)
            vocabEntryRow.add(""); // Placeholder for: Dictionary.lookupEngl(simp, pron)

            wordTableOUT.add(vocabEntryRow);
        }
        return errorParse;
    }


    // *********************************************************************************************************
    // This is from the original file named "GTScrapeCN"
    // *********************************************************************************************************

    /**
     * This receives as input a sentence in simplified Mandarin Chinese.  The input is trimmed and
     * split around the Chinese full-stop character; each resulting piece has the full-stop
     * re-appended, and is sent to Google Translate as a separate query.
     * 
     * <BR /><BR />Each query is attempted at most six times.  Every failed attempt is reported on
     * {@code System.out}.
     * 
     * @param chinese Any sentence, paragraph, phrase or word in Simplified-Mandarin.  It may not
     * contain a newline character.
     * 
     * @return Two separate Strings returned in a String array - two elements long.  Within each
     * String, the text scraped for every piece is trimmed and then followed by two spaces.
     * 
     * <OL CLASS=JDOL>
     * <LI><CODE>ret[0]</CODE> - The romanized pronunciation (Pinyin) String scraped from a call
     * to Google Translate</LI>
     * <LI><CODE>ret[1]</CODE> - The English - also scraped from a call to
     * <CODE>http://translate.google.com</CODE></LI>
     * </OL>
     * 
     * @throws IllegalArgumentException If {@code chinese} contains a newline.
     * 
     * @throws IllegalStateException If every attempt to scrape one of the pieces fails.  The
     * exception thrown by the last attempt is attached as the cause.
     */
    public static String[] sentenceZH(String chinese)
    {
        if (chinese.indexOf('\n') != -1) throw new IllegalArgumentException("CHINESE:\t" + chinese + "\nContains a newline!");

        String[]        cArr            = chinese.trim().split("。");
        StringBuilder   completePron    = new StringBuilder();
        StringBuilder   completeEngl    = new StringBuilder();

        for (String piece : cArr)
        {
            // Scrape the http://translate.google.com/ web-page for this piece
            Vector<HTMLNode> page = scrapeWithRetry(piece + "。");

            // Get Chinese PinYin as Sentence
            String pron = textOf(InnerTagGetInclusive.first
                (page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit"));

            completePron.append(Escape.replace(pron).trim()).append("  ");

            // Get English from Translate Website as a Sentence
            String engl = textOf(InnerTagGetInclusive.first
                (page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box"));

            // The regular-expression below matches the literal text: a back-slash followed by
            // "u200b".  It does not match the zero-width-space character itself.
            // NOTE(review): confirm that the literal text is what actually appears in the page.
            completeEngl.append(Escape.replace(engl.replaceAll("\\\\u200b", "")).trim()).append("  ");
        }

        return new String[] { completePron.toString(), completeEngl.toString() };
    }

    /**
     * Retrieves the PinYin pronunciation from Google Translate Servers for a single Chinese Word.
     * 
     * <BR /><BR />NOTE(review): unlike {@link #sentenceZH(String)}, the word is placed into the
     * query-URL without first being passed through {@code URLs.toProperURLV2} - confirm whether
     * callers are expected to escape it themselves.
     * 
     * @param chineseWord Any single word in simplified Mandarin Chinese
     * 
     * @return The Pinyin Pronunciation of that word, stripped by Google Translate Servers.
     * 
     * @throws IOException If opening, reading or closing the connection fails.
     */
    public static String getPinYin(String chineseWord) throws IOException
    {
        Vector<HTMLNode> page = fetchPage(GT_QUERY_URL + chineseWord + "&source=zh-CN");

        Vector<HTMLNode> partial = InnerTagGetInclusive.first
            (page, "div", "id", TextComparitor.EQ_CASE_INSENSITIVE, "src-translit");

        return Escape.replace(textOf(partial));
    }

    /**
     * Retrieves the Google Translate (English) Textbox-definition for a particular Mandarin
     * Chinese Word. <BR /><B>NOTE:</B> This is not the information under the primary/main
     * translation-text-box, this is the translation-text-box word itself.
     * 
     * <BR /><BR />NOTE(review): unlike {@link #sentenceZH(String)}, the word is placed into the
     * query-URL without first being passed through {@code URLs.toProperURLV2} - confirm whether
     * callers are expected to escape it themselves.
     * 
     * @param chineseWord Any single word in simplified Mandarin Chinese
     * 
     * @return The Google Translate Server's best attempt at a Translation.
     * 
     * @throws IOException If opening, reading or closing the connection fails.
     */
    public static String getEnglish(String chineseWord) throws IOException
    {
        Vector<HTMLNode> page;

        // NOTE(review): this is the only scrape that passes the file-name "matches.txt" to the
        // page-tokenizer.  It looks like left-over debugging output - confirm before removing.
        try (BufferedReader br = Scrape.openConn_iso_8859_1(GT_QUERY_URL + chineseWord))
            { page = HTMLPage.getPageTokens(br, false, null, "matches.txt", null); }

        Vector<HTMLNode> partial = InnerTagGetInclusive.first
            (page, "span", "id", TextComparitor.EQ_CASE_INSENSITIVE, "result_box");

        return Escape.replace(textOf(partial));
    }

    // Opens the given URL, tokenizes the page into HTML nodes, and always closes the reader.
    private static Vector<HTMLNode> fetchPage(String url) throws IOException
    {
        try (BufferedReader br = Scrape.openConn_iso_8859_1(url))
            { return HTMLPage.getPageTokens(br, false); }
    }

    // URL-escapes 'query', then scrapes Google Translate for it, trying up to
    // MAX_SCRAPE_ATTEMPTS times before giving up with an IllegalStateException.
    private static Vector<HTMLNode> scrapeWithRetry(String query)
    {
        Exception lastFailure = null;

        for (int attempt = 1; attempt <= MAX_SCRAPE_ATTEMPTS; attempt++)
        {
            try
                { return fetchPage(GT_QUERY_URL + URLs.toProperURLV2(query)); }

            catch (Exception e)
            {
                // Best-effort scrape: report the failure, remember it, and try again.
                lastFailure = e;
                System.out.println("RETRY-SCRAPE Google Translate:\n" + "Attempt #" + attempt + "\n" + e.getMessage());
            }
        }

        // Without this, a null page would be handed to the node-search routines.
        throw new IllegalStateException(
            "Unable to scrape Google Translate after " + MAX_SCRAPE_ATTEMPTS +
            " attempts, for query: " + query,
            lastFailure
        );
    }

    // Strips every tag-node out of 'partial' (modifying it in place), and concatenates the text
    // of the text-nodes that remain.  Any remaining node that is not a TextNode is skipped.
    private static String textOf(Vector<HTMLNode> partial)
    {
        Util.Remove.allTagNodes(partial);

        StringBuilder sb = new StringBuilder();
        for (HTMLNode n : partial) if (n instanceof TextNode) sb.append(((TextNode) n).str);

        return sb.toString();
    }
}