拼音转汉字主要针对连续的拼音序列(如 woaizhongguorenmingya),将其转化为对应的汉字。其主要思路分为两步:第一步是对拼音串进行切分,得到正确的拼音音节;第二步是利用 HMM 方法,以拼音为观测序列、汉字为隐状态,求出概率最大的汉字状态序列,从而得到最大概率的中文字链。
算法描述:
第一步:
1、首先构建拼音的TrieNode树;
2、利用TrieNode树查找出字符串中所有匹配词;
3、选取最大匹配词,并将字符串的位置后移;
4、重复步骤2、3,直到字符串末尾;
5、输出分割的拼音;
第二步:
1、利用汉字转拼音工具,对语料进行拼音转换;
2、分别统计并计算出拼音到汉字的发射概率、汉字到汉字的转移概率,如p(欢|喜)、p(喜|B)、p(E|欢),其中B表示词条的开始,E表示词条的末尾。
3、利用维特比算法计算拼音观测序列的最大化生成概率,从而输出状态序列,得到最终的结果。
代码如下:
# Pinyin to Chinese characters.
# Part 1: split a continuous pinyin string into individual syllables.
import pickle


class TrieNode:
    """A single node of the pinyin trie."""

    def __init__(self):
        # The complete key is stored on the node that terminates it;
        # interior (non-terminal) nodes keep None.
        self.value = None
        # Maps the next character to the child TrieNode.
        self.children = {}


class SearchIndex:
    """One step of a candidate segmentation.

    ``index`` is the offset in the sentence reached after consuming ``char``
    (the matched syllable); ``parent`` links to the previous step so the full
    segmentation can be recovered by walking the chain backwards.
    """

    def __init__(self, index, char=None, parent=None):
        self.index = index
        self.char = char
        self.parent = parent


class Trie:
    """Prefix tree over the pinyin syllable vocabulary."""

    def __init__(self, trie_path=None, pinyin_path=None):
        """Create an empty trie.

        Args:
            trie_path: where ``build_trie`` writes the pickled trie. Defaults
                to the original hard-coded location.
            pinyin_path: text file with one syllable per line, read by
                ``build_trie``. Defaults to the original hard-coded location.
        """
        self.root = TrieNode()
        # Raw strings: same runtime values as before, without the invalid
        # escape sequences (``\w``, ``\p``) the original literals contained.
        self.trie_path = trie_path or r'D:\workspace\project\NLPcase\ping2han\data\pinyin_trie.model'
        self.pinyin_path = pinyin_path or r'D:\workspace\project\NLPcase\ping2han\data\pinyin.txt'

    def inser(self, key):
        """Insert ``key`` into the trie (method name kept for compatibility)."""
        node = self.root
        for char in key:
            child = node.children.get(char)
            if child is None:
                child = TrieNode()
                node.children[char] = child
            node = child
        # The terminal node remembers the whole key.
        node.value = key

    def search(self, key):
        """Return every stored key that is a prefix of ``key``, shortest first."""
        node = self.root
        matches = []
        for char in key:
            if char not in node.children:
                break
            node = node.children[char]
            if node.value:
                matches.append(node.value)
        return matches

    def build_trie(self):
        """Build a trie from ``self.pinyin_path`` and pickle it to ``self.trie_path``."""
        trie = Trie(self.trie_path, self.pinyin_path)
        with open(self.pinyin_path, encoding='utf-8') as f:
            for line in f:
                word = line.strip().lower()
                if word:  # blank lines carry no syllable
                    trie.inser(word)
        # Binary mode must not be given an encoding (open() raises ValueError).
        with open(self.trie_path, 'wb') as f:
            pickle.dump(trie, f)


class PinyinCut:
    """Segments a continuous pinyin string using the pickled trie."""

    def __init__(self, trie_path=None):
        """Load the trie.

        Args:
            trie_path: location of the pickled trie; defaults to the original
                hard-coded location.
        """
        self.trie_path = trie_path or r'D:\workspace\project\NLPcase\ping2han\data/pinyin_trie.model'
        self.trie = self.load_trie(self.trie_path)

    def load_trie(self, trie_path):
        """Unpickle and return the trie stored at ``trie_path``."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files that come from a trusted source.
        with open(trie_path, 'rb') as f:
            return pickle.load(f)

    def cut(self, sent):
        """Split ``sent`` into pinyin syllables.

        Longest matches are tried first; if the remainder cannot be segmented
        the search backtracks to shorter matches.

        Returns:
            The list of syllables, or an empty list when ``sent`` is empty or
            cannot be fully segmented.
        """
        len_sent = len(sent)
        # Stack of partial segmentations. search() returns matches shortest
        # first, so pop() always continues with the longest one.
        candidate_index = [SearchIndex(0)]
        # Offsets already expanded. Reaching one again means its whole subtree
        # was explored without success, so it can be skipped safely.
        visited = set()
        last_index = None
        while candidate_index:
            p = candidate_index.pop()
            if p.index == len_sent:
                # The whole sentence is consumed: remember the final step.
                last_index = p
                break
            if p.index in visited:
                continue
            visited.add(p.index)
            for m in self.trie.search(sent[p.index:]):
                candidate_index.append(SearchIndex(p.index + len(m), m, p))

        # Walk the parent chain back to the root (which has no parent and
        # carries no syllable), then restore left-to-right order.
        chars = []
        node = last_index
        while node is not None and node.parent is not None:
            chars.append(node.char)
            node = node.parent
        chars.reverse()
        return chars

# Part 2: convert the segmented pinyin into Chinese characters.
import math


class PinyinWordTrans:
    """Converts a continuous pinyin string into Chinese characters.

    The input is segmented into syllables by a pinyin cutter and then decoded
    with the Viterbi algorithm over candidate characters.
    """

    def __init__(self, pinyincuter=None):
        """Load the model files and set up the pinyin cutter.

        Args:
            pinyincuter: object exposing ``cut(sent) -> list of syllables``.
                When omitted, a ``ping2han.pinyincut.PinyinCut`` is created.
        """
        self.bigram_path = r'D:\workspace\project\NLPcase\ping2han\data\bigram.model'
        self.pinyin2word_path = r'D:\workspace\project\NLPcase\ping2han\data/pinyin2word.model'
        self.wordfreq_path = r'D:\workspace\project\NLPcase\ping2han\data/wordfreq.model'
        self.bigram_dict = self.load_model(self.bigram_path)
        self.pinyin2word_dict = self.load_model(self.pinyin2word_path)
        self.wordfreq_dict = self.load_model(self.wordfreq_path)
        if pinyincuter is None:
            # Imported lazily so this module can be imported (and the decoder
            # exercised) without the ping2han package or its pickled trie.
            from ping2han.pinyincut import PinyinCut
            pinyincuter = PinyinCut()
        self.pinyincuter = pinyincuter
        # Floor probability for transitions never seen in the bigram model.
        self.min_trans = 1e-10
        # Floor probability for emissions missing from the pinyin->word model.
        self.min_emit = 1e-10

    def load_model(self, model_path):
        """Read ``model_path`` and return the object its text evaluates to."""
        with open(model_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # NOTE(security): eval executes arbitrary expressions. Only load model
        # files from a trusted source; if the files are plain dict literals,
        # ast.literal_eval would be a safer replacement.
        return eval(content)

    def trans(self, sent):
        """Convert the pinyin string ``sent`` into a list of characters.

        Builds one candidate layer per syllable (candidate -> probability),
        plus a final layer holding only the end marker ``'E'``, then decodes
        the layers with ``viterbi``.

        Returns:
            The decoded characters, one per syllable. A syllable missing from
            the pinyin->word model is passed through unchanged. Returns an
            empty list when ``sent`` cannot be segmented or decoded.
        """
        pinyin_list = self.pinyincuter.cut(sent)
        route_dict = {len(pinyin_list): {'E': 1.0}}
        # Transitions out of the begin-of-sentence marker 'B'.
        start_probs = self.bigram_dict.get('B', {})
        for index, pinyin in enumerate(pinyin_list):
            candidates = self.pinyin2word_dict.get(pinyin)
            if not candidates:
                # Unknown syllable: keep the raw pinyin as the only candidate
                # rather than failing the whole sentence with a KeyError.
                candidates = {pinyin: self.min_emit}
            if index == 0:
                layer = {}
                for word, p_word in candidates.items():
                    # This is a transition lookup, so the transition floor is
                    # the appropriate default for unseen sentence starts.
                    p0 = p_word * start_probs.get(word, self.min_trans)
                    if p0 > 0:
                        layer[word] = p0
            else:
                layer = dict(candidates)
            route_dict[index] = layer
        return self.viterbi(route_dict)

    @staticmethod
    def _log(prob):
        """Return log(prob), mapping non-positive probabilities to -inf."""
        return math.log(prob) if prob > 0 else -math.inf

    def viterbi(self, route_dict):
        """Decode the most probable state path with the Viterbi algorithm.

        Args:
            route_dict: maps each time step ``0..T`` to a dict of
                ``state -> probability``; the last step is expected to hold
                the end marker.

        Returns:
            The best state sequence with its final state (the end marker)
            dropped, or an empty list if any step has no candidate state.
        """
        if not route_dict or not route_dict.get(0):
            return []

        # Scores are kept in log-space: products of many small probabilities
        # underflow to 0.0 on long inputs, which makes every path tie.
        scores = {state: self._log(p) for state, p in route_dict[0].items()}
        backpointers = []  # backpointers[t - 1][state] = best predecessor at t - 1

        for t in range(1, len(route_dict)):
            layer = route_dict[t]
            if not layer:
                return []
            new_scores = {}
            pointers = {}
            for word, word_prob in layer.items():
                emit_log = self._log(word_prob)
                best_prev = None
                best_score = -math.inf
                for pre_state, last_score in scores.items():
                    # Predecessors absent from the bigram model get the same
                    # smoothing floor as unseen pairs instead of probability 0.
                    trans_p = self.bigram_dict.get(pre_state, {}).get(word, self.min_trans)
                    score = last_score + emit_log + self._log(trans_p)
                    if best_prev is None or score > best_score:
                        best_prev = pre_state
                        best_score = score
                new_scores[word] = best_score
                pointers[word] = best_prev
            backpointers.append(pointers)
            scores = new_scores

        # Follow the back-pointers from the best final state so the result is
        # one consistent path, not an independent per-step maximum.
        state = max(scores, key=scores.get)
        path = [state]
        for pointers in reversed(backpointers):
            state = pointers[state]
            path.append(state)
        path.reverse()
        return path[:-1]
总结
主要利用hmm模型求解序列问题,用于学习笔记。
参考资料:
https://blog.csdn.net/jiangzhenkang/article/details/84555947
https://github.com/liuhuanyong/Pinyin2Chinese/blob/master/pinyincut.py