1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 | package Torello.Languages; import java.util.*; import java.util.regex.*; import java.io.*; import java.util.stream.IntStream; import Torello.HTML.*; import Torello.Java.*; /** * Pin1Yin1 (罗马拼音 - Online Internet Translation Service). * * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=P1Y1> */ public class Pin1Yin1 { /** PinYin {@code URL} */ public static final String PIN1_YIN1_URL = "https://pin1yin1.com/pinyin/convert/?c="; private static final String sampleText = "中国政府网_中央人民政府门户网站"; /** {@code pin1yin1.com} JSON parse */ private static final Pattern P1, P2, P3; static { // If boot-strapping a data-file, put these four lines in comments. Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP (Torello.Data.DataFileLoader.class, "data09.sdat"); P1 = v.elementAt(0); // The JSON reply P2 = v.elementAt(1); // Words wrapped in quotation marks P3 = v.elementAt(2); // Integer array } /** * This is the original query-string - <I>in Simplified Chinese</I>. * * <BR /><BR /><B>NOTE:</B> How accurate the traditional/simplified * conversion is is not known, and probably not exact. */ public final String simpSENTENCE; /** This is the original query-string in Traditional Chinese */ public final String tradSENTENCE; /** * The Simplified Conversion - in this version, characters are grouped by "Word Boundaries" */ public final String[] simp; /** The Pronunciation of each element in the "Word Boundaries" version. */ public final String[] pron; /** * The traditional conversion - In this version, also, characters are grouped by * "Word Boundaries" */ public final String[] trad; /** * These are the English Translations of each word; some Chinese Words are translated into * multiple english words. */ public final String[] engl; /** * This is a 1-to-1 mapping between the chinese characters in the query, and their mandarin * pronunciation. Specifically, each element in this array contains a single ('one') syllable * of pinyin romanization - a.k.a. one chinese character's pronunciation. The array * {@code final String[] pron} contains as many syllables as the "Grouped By" Chinese Word * does. */ public final String[] pronSINGLE_WORD; /** * This is just an array of integers that was actually returned from the server. It identifies * the number of Chinese Characters that are in each Chinese Word. */ public final int[] wordGroups; /** * Unless specifically requested, this variable is null. It is the JSON as a String returned * from the server */ public final String JSON; /** Run the translation from the command-line. */ public static void main(String[] argv) throws IOException { System.out.println(getResponse((argv.length == 1) ? argv[0] : sampleText, true).toString()); } /** * This translates a sentence from Mandarin into English. The response object is an instance * of this exact class. * * @param chineseText This is a text String input, and is expected to be in Mandarin. The * server does not seem to mind whether the input is in "Simplified" or "Traditional." * * @param saveJSON If this is true, the field {@code public final String JSON} will contain the * original JSON string retrieved from the server. If this is FALSE, the JSON field will be * null. * * @return An instance of this class, with all of the constant {@code final} fields filled in. */ public static Pin1Yin1 getResponse(String chineseText, boolean saveJSON) throws IOException { return new Pin1Yin1(chineseText, saveJSON); } private Pin1Yin1(String chineseText, boolean saveJSON) throws IOException { String receivedJSON = Scrape.scrapePage(PIN1_YIN1_URL + chineseText).toString(); Matcher m = P1.matcher(receivedJSON); if (! m.find()) throw new IllegalStateException ("The regular expression didn't match the response from pin1yin1.com"); JSON = saveJSON ? receivedJSON : null; String question = m.group(1); simpSENTENCE = m.group(2); tradSENTENCE = m.group(3); pronSINGLE_WORD = parseQuotationJSONArray(m.group(4)).toArray(new String[0]); engl = parseQuotationJSONArray(m.group(5)).toArray(new String[0]); wordGroups = parseIntJSONArray(m.group(6)); String[][] matched = matchThemUp(simpSENTENCE, tradSENTENCE, pronSINGLE_WORD, wordGroups); simp = matched[0]; trad = matched[1]; pron = matched[2]; } /** This converts {@code 'this'} result to a {@code java.lang.String} */ public String toString() { StringBuilder sb = new StringBuilder(); if (JSON != null) sb.append(JSON + "\n"); sb.append("Simp: " + simpSENTENCE + "\n" + "Trad: " + tradSENTENCE); sb.append("\nSimp-Words:\t"); for (String s : simp) sb.append(s + ", "); int len = sb.length(); sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE sb.append("\nPron-Words:\t"); for (String s : pron) sb.append(s + ", "); len = sb.length(); sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE sb.append("\nTrad-Words:\t"); for (String s : trad) sb.append(s + ", "); len = sb.length(); sb.delete(len - 2, len); // DELETE THE ENDING COMMA-SPACE sb.append("\nEngl-Words:\t"); for (String s : engl) sb.append("[" + s + "] "); return sb.toString(); } private static Vector<String> parseQuotationJSONArray(String lineJSON) { Vector<String> ret = new Vector<String>(); Matcher m = P2.matcher(lineJSON); while (m.find()) ret.addElement(m.group(1)); return ret; } private static int[] parseIntJSONArray(String lineJSON) { IntStream.Builder b = IntStream.builder(); Matcher m = P3.matcher(lineJSON); while (m.find()) b.add(Integer.parseInt(m.group(1))); return b.build().toArray(); } /** * Matches the characters together. * * @param simp Simplified Character's * @param trad Traditional Charater's * @param pron Pronunciation * @param intMatchArr m * * @return Matches up the characters and pronunciation. * * <BR /><BR /><UL CLASS=JDUL> * <LI>{@code ret[0]:} Simplified Chinese Characters Grouped into words</LI> * <LI>{@code ret[1]:} Traditional Chinese Characters Grouped into words</LI> * <LI>{@code ret[2]:} Pronunciation Grouped into words</LI> * <UL> */ private static String[][] matchThemUp (String simp, String trad, String[] pron, int[] intMatchArr) { String tempStr = ""; Vector<String> tempVec = new Vector<String>(); String[][] ret = new String[3][0]; int pos = 0; // Line up the Simplified characters into word boundaries for (int numChineseChars : intMatchArr) { for (int i=0; i < numChineseChars; i++) tempStr += simp.charAt(pos + i); pos += numChineseChars; tempVec.addElement(tempStr); tempStr = ""; } ret[0] = tempVec.toArray(ret[0]); // Line up the Traditional Characters into word boundaries pos = 0; tempVec.removeAllElements(); for (int numChineseChars : intMatchArr) { for (int i=0; i < numChineseChars; i++) tempStr += trad.charAt(pos + i); pos += numChineseChars; tempVec.addElement(tempStr); tempStr = ""; } ret[1] = tempVec.toArray(ret[1]); // Lne up the Pronunciation "PinYin-Characters" into word boundaries pos = 0; tempVec.removeAllElements(); for (int numChineseChars : intMatchArr) { for (int i=0; i < numChineseChars; i++) tempStr += pron[pos + i]; pos += numChineseChars; tempVec.addElement(tempStr); tempStr = ""; } ret[2] = tempVec.toArray(ret[2]); return ret; } } |