package Torello.Languages;

import java.io.*;
import java.util.*;

/**
 * PinYinParse (罗马拼音 - "Romanized Pin-Yin").
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=PYP>
 */
@Torello.JavaDoc.StaticFunctional
public class PinYinParse
{
    // Static-functional utility class: no instances are ever created.
    private PinYinParse() { }

    // Separator line written to the debug output between sections.
    private static final String BANNER = "********************************************\n";

    /**
     * Produces the parallel arrays ({@code Vector<String>}) which contain Chinese Characters
     * and Chinese Pin-Yin, based on the results of a Google Translate query.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Scrape, non-API Invocation:</B>
     *
     * <BR />This is of "limited use" - since primarily the input to this function is a
     * {@code String} that has been scraped from the <B>{@code Google Translate Website}</B>, not
     * a {@code String} from a query to Google Cloud Server's <B>{@code Translate-API}</B>.
     *
     * <BR /><BR />The API version of Mandarin Translations literally leaves out the Pin-Yin
     * Romanizations, and makes the entire package a lot less usable.  The web-site itself can
     * be scraped, and the Pin-Yin obtained, but that String comes from a web-site that changes
     * from time-to-time.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Using a Bot:</B>
     *
     * <BR />If scraping Google's Translate Web-site conjures images of the police coming to your
     * door, another web-site that seems to do pretty good Romanization is Pin1Yin1.com.  I have
     * another class that scrapes that site.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Mismatched Input:</B>
     *
     * <BR />If a pronunciation word calls for more characters than remain in the (cleaned)
     * Chinese sentence, the word is paired with whatever characters are left (possibly the
     * empty {@code String}), a note is written to {@code DOUT}, and this method returns
     * {@code true}.  The two output vectors always receive the same number of elements.
     *
     * @param DOUT This is filled up with Debug Information as this class is run.  It may be any
     * implementation of java's {@code java.lang.Appendable} interface.
     *
     * @param simpSentence This is the complete simplified-Mandarin sentence obtained from a
     * news-article.
     *
     * @param pronSentence This is the pronunciation of the simplified-Mandarin sentence.  This
     * should have already been obtained from Google Translate.
     *
     * @param characters This should be an empty vector.  It will be populated by the words from
     * the original Mandarin sentence, based on the pronunciation obtained from Google Translate.
     *
     * @param pronunciation This should also be an empty vector.  It will be populated after the
     * words from the pronunciation sentence have been parsed into individual words.
     *
     * @return {@code true} if there was possibly an error along the way.  This is the case when
     * a pronunciation word required more characters than remained in the Chinese sentence, or
     * when: <BR />
     * {@code (cSent.length() != totalChinese) && (totalChinese > 0);}
     *
     * @throws IOException The {@code interface java.lang.Appendable} mandates that the
     * {@code IOException} must be treated as a checked exception for all output operations.
     * Therefore {@code IOException} is a required exception in this method's throws clause.
     */
    public static boolean parse(
            Appendable      DOUT,
            String          simpSentence,
            String          pronSentence,
            Vector<String>  characters,
            Vector<String>  pronunciation
        )
        throws IOException
    {
        // Number of characters of 'cSent' that have been consumed so far.
        int totalChinese = 0;

        // Set when a pronunciation word asks for more characters than 'cSent' has left.
        boolean overrun = false;

        DOUT.append(BANNER);
        DOUT.append("chin = " + simpSentence + "\n");
        DOUT.append("pron = " + pronSentence + "\n");

        // Replace any "alternate" (AUC) versions of A...Z or 0..9 that are there.
        String cSent = ZH.convertAnyAUC(simpSentence);

        // CHANGED 2018.09.24 - delAllPunctuation does not remove '.' and ',' between numbers!
        String pSent = ZH.delAllPunctuationPINYIN(pronSentence);

        cSent = ZH.delAllPunctuationCHINESE(cSent);

        DOUT.append(BANNER);
        DOUT.append("After Removing non-alphanumeric UniCode, and Alt-UniCode:\n");
        DOUT.append("cSent=" + cSent + "\n");
        DOUT.append("pSent=" + pSent + "\n");
        DOUT.append(BANNER);

        // Leading or trailing blanks mess up the split, so trim() first.
        String[] pWords = pSent.trim().split(" ");

        for (int i = 0; i < pWords.length; i++)
        {
            String pronWord = pWords[i].trim();

            // Consecutive blanks in the pronunciation produce empty tokens; skip them.
            if (pronWord.length() == 0) continue;

            // Sometimes alphabetic characters appear in the chinese string.
            int leading = ZH.countLeadingLettersAndNumbers(cSent.substring(totalChinese));

            if (leading > 0)
            {
                String alphaNumericASCII = cSent.substring(totalChinese, totalChinese + leading);

                DOUT.append("*** Found English and Numbers ASCII in Chinese Sentence ***\n");
                DOUT.append("There are " + leading + " leading alpha numeric characters.");
                DOUT.append(" [" + alphaNumericASCII + "]\n");
                DOUT.append("pronunciation word is: [" + pronWord + "]\n");

                pronunciation.add(pronWord);
                characters.add(alphaNumericASCII);

                totalChinese += leading;
            }

            // else - it's just normal characters in the chinese string
            else
            {
                int numChinese  = ZH.countSyllablesAndNonChinese(pronWord, DOUT);
                int end         = totalChinese + numChinese;

                // The pronunciation may call for more characters than the Chinese sentence has
                // left (the two strings are scraped, and can disagree).  Clamp the range so
                // that substring() cannot throw, and remember the problem for the return value.
                if (end > cSent.length())
                {
                    DOUT.append(
                        "*** The word [" + pronWord + "] needs " + numChinese +
                        " characters, but only " + (cSent.length() - totalChinese) +
                        " remain in Chinese Sentence ***\n"
                    );

                    overrun = true;
                    end     = cSent.length();
                }

                String chineseWord = cSent.substring(totalChinese, end);

                DOUT.append("The word [" + pronWord + "] ");
                DOUT.append("corresponds to " + numChinese + " Unicode Characters ");
                DOUT.append("[" + chineseWord + "]\n");

                // Add the new word to the list
                pronunciation.add(pronWord);
                characters.add(chineseWord);

                totalChinese = end;
            }
        }

        // A mismatch is either an overrun (too few characters), or left-over characters.
        boolean mismatch = overrun || (totalChinese != cSent.length());

        DOUT.append(
            BANNER +
            "COMPLETED SENTENCE LOOP\n" +
            "SUMMARY:\n" +
            "FOUND (" + totalChinese + ") characters in Chinese String\n" +
            "STRING CONTAINS (" + cSent.length() + ") characters\n" +
            (mismatch ? "\nPOSSIBLE ERROR MISMATCH\n\n" : "") +
            BANNER
        );

        return overrun || ((cSent.length() != totalChinese) && (totalChinese > 0));
    }
}