001package Torello.Languages; 002 003import java.util.*; 004import java.util.regex.*; 005import java.io.*; 006 007import java.util.stream.IntStream; 008 009import Torello.HTML.*; 010import Torello.Java.*; 011 012/** 013 * Pin1Yin1 (罗马拼音 - Online Internet Translation Service). 014 * 015 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=P1Y1> 016 */ 017public class Pin1Yin1 018{ 019 /** PinYin {@code URL} */ 020 public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c="; 021 022 private static final String sampleText = "中国政府网_中央人民政府门户网站"; 023 024 /** {@code pin1yin1.com} JSON parse */ 025 private static final Pattern P1, P2, P3; 026 027 static 028 { 029 // If boot-strapping a data-file, put these four lines in comments. 030 Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP 031 (Pin1Yin1.class, "data-files/Pin1Yin1.sdat"); 032 033 P1 = v.elementAt(0); // The JSON reply 034 P2 = v.elementAt(1); // Words wrapped in quotation marks 035 P3 = v.elementAt(2); // Integer array 036 } 037 038 /** 039 * This is the original query-string - <I>in Simplified Chinese</I>. 040 * 041 * <BR /><BR /><B>NOTE:</B> How accurate the traditional/simplified 042 * conversion is is not known, and probably not exact. 043 */ 044 public final String simpSENTENCE; 045 046 /** This is the original query-string in Traditional Chinese */ 047 public final String tradSENTENCE; 048 049 /** 050 * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries" 051 */ 052 public final String[] simp; 053 054 /** The Pronunciation of each element in the "Word Boundaries" version. */ 055 public final String[] pron; 056 057 /** 058 * The traditional conversion - In this version, also, characters are grouped by 059 * "Word Boundaries" 060 */ 061 public final String[] trad; 062 063 /** 064 * These are the English Translations of each word; some Chinese Words are translated into 065 * multiple english words. 066 */ 067 public final String[] engl; 068 069 /** 070 * This is a 1-to-1 mapping between the chinese characters in the query, and their mandarin 071 * pronunciation. Specifically, each element in this array contains a single ('one') syllable 072 * of pinyin romanization - a.k.a. one chinese character's pronunciation. The array 073 * {@code final String[] pron} contains as many syllables as the "Grouped By" Chinese Word 074 * does. 075 */ 076 public final String[] pronSINGLE_WORD; 077 078 /** 079 * This is just an array of integers that was actually returned from the server. It identifies 080 * the number of Chinese Characters that are in each Chinese Word. 081 */ 082 public final int[] wordGroups; 083 084 /** 085 * Unless specifically requested, this variable is null. It is the JSON as a String returned 086 * from the server 087 */ 088 public final String JSON; 089 090 /** Run the translation from the command-line. */ 091 public static void main(String[] argv) throws IOException 092 { 093 // System.out.println("P1: " + P1 + "\nP2: " + P2 + "\nP3: " + P3); 094 // System.exit(1); 095 096 System.out.println( 097 getResponse( 098 ((argv.length == 1) ? argv[0] : sampleText), 099 true 100 ) 101 .toString() 102 ); 103 } 104 105 /** 106 * This translates a sentence from Mandarin into English. The response object is an instance 107 * of this exact class. 108 * 109 * @param chineseText This is a text String input, and is expected to be in Mandarin. The 110 * server does not seem to mind whether the input is in "Simplified" or "Traditional." 111 * 112 * @param saveJSON If this is true, the field {@code public final String JSON} will contain the 113 * original JSON string retrieved from the server. If this is FALSE, the JSON field will be 114 * null. 115 * 116 * @return An instance of this class, with all of the constant {@code final} fields filled in. 117 */ 118 public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException 119 { return new Pin1Yin1(chineseText, saveJSON); } 120 121 private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException 122 { 123 String receivedJSON = Scrape.scrapePage(PIN1_YIN1_URL + chineseText).toString(); 124 Matcher m = P1.matcher(receivedJSON); 125 126 if (! m.find()) throw new IllegalStateException 127 ("The regular expression didn't match the response from pin1yin1.com"); 128 129 JSON = saveJSON ? receivedJSON : null; 130 String question = m.group(1); 131 simpSENTENCE = m.group(2); 132 tradSENTENCE = m.group(3); 133 pronSINGLE_WORD = parseQuotationJSONArray(m.group(4)).toArray(new String[0]); 134 engl = parseQuotationJSONArray(m.group(5)).toArray(new String[0]); 135 wordGroups = parseIntJSONArray(m.group(6)); 136 137 String[][] matched = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups); 138 simp = matched[0]; 139 trad = matched[1]; 140 pron = matched[2]; 141 } 142 143 /** This converts {@code 'this'} result to a {@code java.lang.String} */ 144 public String toString() 145 { 146 StringBuilder sb = new StringBuilder(); 147 if (JSON != null) sb.append(JSON + "\n"); 148 sb.append("Simp: " + simpSENTENCE + "\n" + "Trad: " + tradSENTENCE); 149 150 sb.append("\nSimp-Words:\t"); 151 for (String s : simp) sb.append(s + ", "); 152 int len = sb.length(); 153 sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 154 155 sb.append("\nPron-Words:\t"); 156 for (String s : pron) sb.append(s + ", "); 157 len = sb.length(); 158 sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 159 160 sb.append("\nTrad-Words:\t"); 161 for (String s : trad) sb.append(s + ", "); 162 len = sb.length(); 163 sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE 164 165 sb.append("\nEngl-Words:\t"); 166 for (String s : engl) sb.append("[" + s + "] "); 167 168 return sb.toString(); 169 } 170 171 private static Vector<String> parseQuotationJSONArray(String lineJSON) 172 { 173 Vector<String> ret = new Vector<String>(); 174 Matcher m = P2.matcher(lineJSON); 175 while (m.find()) ret.addElement(m.group(1)); 176 return ret; 177 } 178 179 private static int[] parseIntJSONArray(String lineJSON) 180 { 181 IntStream.Builder b = IntStream.builder(); 182 Matcher m = P3.matcher(lineJSON); 183 while (m.find()) b.add(Integer.parseInt(m.group(1))); 184 return b.build().toArray(); 185 } 186 187 /** 188 * Matches the characters together. 189 * 190 * @param simp Simplified Character's 191 * @param trad Traditional Charater's 192 * @param pron Pronunciation 193 * @param intMatchArr m 194 * 195 * @return Matches up the characters and pronunciation. 196 * 197 * <BR /><BR /><UL CLASS=JDUL> 198 * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI> 199 * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI> 200 * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI> 201 * <UL> 202 */ 203 private static String[][] matchThemUp 204 (String simp, String trad, String[] pron, int[] intMatchArr) 205 { 206 String tempStr = ""; 207 Vector<String> tempVec = new Vector<String>(); 208 String[][] ret = new String[3][0]; 209 int pos = 0; 210 211 // Line up the Simplified characters into word boundaries 212 for (int numChineseChars : intMatchArr) 213 { 214 for (int i=0; i < numChineseChars; i++) tempStr += simp.charAt(pos + i); 215 pos += numChineseChars; 216 tempVec.addElement(tempStr); 217 tempStr = ""; 218 } 219 ret[0] = tempVec.toArray(ret[0]); 220 221 // Line up the Traditional Characters into word boundaries 222 pos = 0; 223 tempVec.removeAllElements(); 224 for (int numChineseChars : intMatchArr) 225 { 226 for (int i=0; i < numChineseChars; i++) tempStr += trad.charAt(pos + i); 227 pos += numChineseChars; 228 tempVec.addElement(tempStr); 229 tempStr = ""; 230 } 231 ret[1] = tempVec.toArray(ret[1]); 232 233 // Lne up the Pronunciation "PinYin-Characters" into word boundaries 234 pos = 0; 235 tempVec.removeAllElements(); 236 for (int numChineseChars : intMatchArr) 237 { 238 for (int i=0; i < numChineseChars; i++) tempStr += pron[pos + i]; 239 pos += numChineseChars; 240 tempVec.addElement(tempStr); 241 tempStr = ""; 242 } 243 ret[2] = tempVec.toArray(ret[2]); 244 245 return ret; 246 } 247}