拼音分词:
提示:本文用于解决 pygame 无法直接输入中文的问题。
- 实现自动分词
拼音分词:
(一)、定义字典树(Trie)的数据结构
class Node:
    """A single trie node used by PinyinDictTree.

    Each node owns a mapping from the next character to the child node
    reached by that character, plus a flag telling whether the path from
    the root down to this node spells a complete inserted string.
    """

    def __init__(self) -> None:
        # False until PinyinDictTree.insert() finishes a string on this node.
        self.is_end_of_pinyin = False
        # next character -> child Node; a fresh dict for every node.
        self.children = dict()
class PinyinDictTree:
    """Character trie holding the set of strings accepted as pinyin syllables."""

    def __init__(self):
        # Root represents the empty prefix; it is never an inserted string
        # unless insert('') is called explicitly.
        self.root = Node()

    def insert(self, pinyin):
        """Add ``pinyin`` to the trie, creating missing nodes along the way.

        Args:
            pinyin: The string to store, one trie level per character.
        """
        node = self.root
        for char in pinyin:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = Node()
            node = child
        # Only the final node is marked, so proper prefixes of ``pinyin``
        # are not reported as matches unless inserted themselves.
        node.is_end_of_pinyin = True

    def search(self, pinyin):
        """Return True only if ``pinyin`` was inserted as a complete string.

        Args:
            pinyin: The candidate string.

        Returns:
            False as soon as a character has no matching child; otherwise
            the end-of-string flag of the node reached by the last character.
        """
        node = self.root
        for char in pinyin:
            node = node.children.get(char)
            if node is None:
                return False
        return node.is_end_of_pinyin
(二)、构建拼音字典树
将每个声母与其后面可以拼接的韵母组合成音节,并连同零声母音节(如 a、ai、er)一起插入字典树。
def Construction_Dictionary():
    """Build a PinyinDictTree containing the toneless Mandarin syllables.

    The table maps each initial to the finals that can follow it; the empty
    initial ('') lists the syllables that stand on their own.  The letter
    'v' is the keyboard spelling of u-umlaut after 'n' and 'l'
    (e.g. 'nv', 'lve').

    Returns:
        A PinyinDictTree in which every ``initial + final`` combination from
        the table has been inserted.
    """
    # Finals are kept as whitespace-separated strings so that each row can be
    # checked against a printed pinyin syllable chart at a glance.
    finals_by_initial = {
        '': 'a ai an ang ao e ei en eng er o ou',
        'b': 'a o ai ei ao an en ang eng i ie iao ian in ing u',
        'p': 'a o ai ei ao ou an en ang eng i ie iao ian in ing u',
        'm': 'a o e ai ei ao ou an en ang eng i ie iao iu ian in ing u',
        'f': 'a o ei ou an en ang eng u',
        'd': 'a e ai ei ao ou an en ang eng ong '
             'i ia ie iao iu ian ing u uo ui uan un',
        't': 'a e ai ao ou an ang eng ong i ie iao ian ing u uo ui uan un',
        'n': 'a e ai ei ao ou an en ang eng ong '
             'i ie iao iu ian in iang ing u uo uan v ve',
        'l': 'a o e ai ei ao ou an ang eng ong '
             'i ia ie iao iu ian in iang ing u uo uan un v ve',
        'g': 'a e ai ei ao ou an en ang eng ong u ua uo uai ui uan un uang',
        'k': 'a e ai ao ou an en ang eng ong u ua uo uai ui uan un uang',
        'h': 'a e ai ei ao ou an en ang eng ong u ua uo uai ui uan un uang',
        'j': 'i ia ie iao iu ian in iang ing iong u ue uan un',
        'q': 'i ia ie iao iu ian in iang ing iong u ue uan un',
        'x': 'i ia ie iao iu ian in iang ing iong u ue uan un',
        'zh': 'a e i ai ei ao ou an en ang eng ong '
              'u ua uo uai ui uan un uang',
        'ch': 'a e i ai ao ou an en ang eng ong u ua uo uai ui uan un uang',
        'sh': 'a e i ai ei ao ou an en ang eng u ua uo uai ui uan un uang',
        'r': 'e i ao ou an en ang eng ong u ua uo ui uan un',
        'z': 'a e i ai ei ao ou an en ang eng ong u uo ui uan un',
        'c': 'a e i ai ao ou an en ang eng ong u uo ui uan un',
        's': 'a e i ai ao ou an en ang eng ong u uo ui uan un',
        'y': 'a ao e i in ing o ong ou u uan ue un an ang',
        'w': 'a ai an ang ei en eng o u',
    }

    pinyin_tree = PinyinDictTree()
    for initial, finals in finals_by_initial.items():
        for final in finals.split():
            pinyin_tree.insert(initial + final)
    return pinyin_tree
(三)、通过PinyinDictTree的search方法进行分词。
def find_last_true_index(lst):
    """Return the position of the last truthy item in ``lst``.

    Args:
        lst: Any iterable; items are tested by truthiness.

    Returns:
        The zero-based index of the last truthy item, or -1 when ``lst`` is
        empty or holds no truthy item.
    """
    truthy_positions = (pos for pos, item in enumerate(lst) if item)
    # Positions are produced in increasing order, so the maximum is the last.
    return max(truthy_positions, default=-1)
def pinyin_format(tree, pinyin):
    """Split a run of pinyin letters into syllables by greedy longest match.

    Starting at the front of the text, the longest prefix accepted by
    ``tree.search`` is cut off as one syllable and the process repeats on
    the rest.

    Args:
        tree: Object exposing ``search(text)`` that returns a truthy value
            when ``text`` is a complete syllable (e.g. a PinyinDictTree).
        pinyin: The string of letters to segment.

    Returns:
        A list of pieces in input order.  If at some point no prefix of the
        remaining text is accepted, that whole remainder is appended as a
        single unsplit piece and segmentation stops; consequently an empty
        input yields [''].
    """
    retroflex_initials = ('zh', 'ch', 'sh')
    i_led_finals = ('ia', 'ie', 'in')

    remaining = pinyin
    result = []
    while True:
        # Index of the last character of the longest accepted prefix, or -1
        # when no prefix of the remaining text is accepted.
        last_match = -1
        for end in range(1, len(remaining) + 1):
            if tree.search(remaining[:end]):
                last_match = end - 1

        if last_match == -1:
            result.append(remaining)
            break

        # After zh/ch/sh, an 'i' followed by a/e/n is taken as the end of
        # 'zhi'/'chi'/'shi' rather than the start of an i-led final, even if
        # the tree accepts a longer prefix.  ('iao', 'ian' and 'iang' need no
        # separate check: they all begin with 'ia'.)
        if remaining[:2] in retroflex_initials and remaining[2:4] in i_led_finals:
            cut = 3
        else:
            cut = last_match + 1

        result.append(remaining[:cut])
        remaining = remaining[cut:]
        if not remaining:
            break
    return result
分词后的拼音可以使用 Pinyin2Hanzi 转换为汉字,具体的解析内容将在下一期更新……