https://github.com/mozillazg/pinyin-data 整理了汉字的拼音数据,我们取其中的pinyin.txt再做些处理,让其更加易用。
预处理
pinyin.txt看起来是这样的(部分内容):
U+3007: líng # 〇
U+3400: qiū # 㐀
U+3401: tiàn # 㐁
U+3404: kuà # 㐄
U+3405: wǔ # 㐅
U+3406: yǐn # 㐆
U+340C: yí # 㐌
U+3416: xié # 㐖
用下面的python脚本处理一下:
# coding: utf-8
import json
import codecs
def parse_line(line):
    """Parse one line of pinyin.txt into a ``(hanzi, pinyins)`` pair.

    A data line looks like ``U+3007: ling # <hanzi>``: a code point, a
    colon, one or more comma-separated readings, a ``#`` and the character.

    Args:
        line: One raw line of the input file, with or without the newline.

    Returns:
        ``(hanzi, pinyins)`` where ``pinyins`` is a list of readings, or
        ``None`` for a blank line or a line that starts with ``#``.

    Raises:
        ValueError: If a data line has no ``#`` or no ``:`` separator.
    """
    line = line.strip()
    # Skip blanks and '#'-prefixed lines; a line such as "# version: ..."
    # has no code point before the '#' and cannot be parsed as data.
    if not line or line.startswith('#'):
        return None
    if '#' not in line:
        raise ValueError('no # in line: ' + line)

    fields = line.split('#')
    hanzi = fields[1].strip()
    code_and_pinyins = fields[0].split(':')
    if len(code_and_pinyins) < 2:
        raise ValueError('no : in line: ' + line)

    # split(',') also covers the single-reading case (one-element list).
    pinyins = [item.strip() for item in code_and_pinyins[1].strip().split(',')]
    return hanzi, pinyins


def load_table(path):
    """Read the file at ``path`` and return a dict mapping hanzi to readings."""
    table = {}
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM if present.
    with codecs.open(path, 'r', encoding='utf-8-sig') as src:
        for line in src:
            entry = parse_line(line)
            if entry is None:
                continue
            hanzi, pinyins = entry
            table[hanzi] = pinyins
    return table


def write_db(table, path):
    """Write ``table`` to ``path`` as one ``hanzi=py1,py2`` entry per line."""
    with codecs.open(path, 'w', encoding='utf-8') as out:
        # Join once instead of growing a string inside the loop.
        out.write(''.join(
            hanzi + '=' + ','.join(table[hanzi]) + '\n' for hanzi in table
        ))


def write_json(table, path):
    """Write ``table`` to ``path`` as indented JSON with unescaped characters."""
    with codecs.open(path, 'w', encoding='utf-8') as out:
        json.dump(table, out, ensure_ascii=False, indent=4)


def main():
    """Convert ./pinyin.txt into pinyin.db and pinyin.json."""
    table = load_table('./pinyin.txt')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('gen pinyin.db')
    write_db(table, 'pinyin.db')
    print('gen pinyin.json')
    write_json(table, 'pinyin.json')
    print('finish')


if __name__ == '__main__':
    main()
得到的pinyin.db的部分内容如下:
?=qiáng
?=náng
?=zhǎn
?=yǒng
?=tà
?=xiè,wén
?=ǒu
?=xiàng
?=guó
pinyin.json文件是json格式数据:
{
"?": [
"gǔ"
],
"?": [
"gǒng"
],
"?": [
"lǐ"
],
"?": [
"gǔ"
],
// .....
"?": [
"yì"
],
"煭": [
"liè"
],
"煬": [
"yáng",
"yàng"
]
}
json数据可以直接拿到nodejs等程序中使用。
上面的转换脚本中,生成json数据的代码片段如下:
with open('pinyin.json', 'w') as out:
json.dump(table, out, ensure_ascii=False, indent=4)
indent参数让json以带缩进的、较为美观的形式保存到文件中。ensure_ascii参数设置为False,是为了让非ASCII字符(汉字和带声调的拼音)按原样写入文件,而不是被转义成下面这样的\uXXXX序列:
{
"\ud856\udf9d": [
"g\u01d4"
],
"\ud856\uddc2": [
"g\u01d2ng"
],
"\ud869\udd74": [
"l\u01d0"
],
"\ud869\udd77": [
"g\u01d4"
],
// ......
java工具类
eclipse下:
android studio下:
SimplePinyin.java源码:
package me.letiantian.simplepinyin;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
/**
 * Static hanzi-to-pinyin lookup backed by the {@code pinyin.db} classpath resource.
 *
 * <p>Each resource line has the form {@code hanzi=py1,py2}; the part after
 * {@code =} is stored and returned unchanged, so a character with several
 * readings yields one comma-separated string.
 */
public class SimplePinyin {

    /** Not instantiable: the class only exposes static helpers. */
    private SimplePinyin() {}

    /** Lookup table loaded once, when the class is initialised. */
    public static final Map<String, String> TABLE = SimplePinyin.getPinyinResource();

    /**
     * Loads {@code pinyin.db} from the classpath into a new map.
     *
     * <p>Lines without a key or without a value are skipped. If reading fails
     * part-way, the stack trace is printed and the entries read so far are
     * returned.
     *
     * @return a map from the text before {@code =} to the text after it
     * @throws IllegalStateException if {@code pinyin.db} is not on the classpath
     */
    public static Map<String, String> getPinyinResource() {
        Map<String, String> map = new HashMap<String, String>();
        InputStream is = SimplePinyin.class.getClassLoader().getResourceAsStream("pinyin.db");
        if (is == null) {
            // getResourceAsStream signals a missing resource with null; fail with a
            // clear message instead of a NullPointerException further down.
            throw new IllegalStateException("pinyin.db not found on the classpath");
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                int sep = line.indexOf('=');
                // Skip blank lines and lines with no key or no value.
                if (sep <= 0 || sep == line.length() - 1) {
                    continue;
                }
                map.put(line.substring(0, sep), line.substring(sep + 1));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path; closing the reader also closes the stream.
            try {
                if (br != null) {
                    br.close();
                } else {
                    is.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return map;
    }

    /**
     * Looks up a single character.
     *
     * <p>A {@code char} cannot hold a character outside the Basic Multilingual
     * Plane; use {@link #getPinyin(String)} for those.
     *
     * @param c the character to look up
     * @return the stored pinyin string, or {@code null} if {@code c} is not in the table
     */
    public static String getPinyin(char c) {
        return TABLE.get(String.valueOf(c));
    }

    /**
     * Looks up {@code s} as a whole key; the string is not split into characters.
     *
     * @param s the key to look up
     * @return the stored pinyin string, or {@code null} if {@code s} is not in the table
     */
    public static String getPinyin(String s) {
        return TABLE.get(s);
    }
}
示例(test.java):
package me.letiantian.simplepinyin;
/** Small demo that prints three SimplePinyin lookups. */
public class test {

    /**
     * Looks up one char key and two String keys, then prints each result on
     * its own line.
     */
    public static void main(String[] args) {
        // char overload
        String fromChar = SimplePinyin.getPinyin('乐');
        // String overload, single-character key
        String fromString = SimplePinyin.getPinyin("了");
        // String overload, multi-character key
        String fromMixed = SimplePinyin.getPinyin("了ad");

        System.out.println(fromChar);
        System.out.println(fromString);
        System.out.println(fromMixed);
    }
}
运行结果:
lè,yuè
le,liǎo,liào
null