Viterbi — Solution: DP (dynamic programming)
中文分词 (Chinese word segmentation)
**Part 1 分词:** 把句子所有可能的划分方式枚举出来
方法一:基于枚举(递归)的方式
# recursive segmentation
def full_segmentation(input_str, words):
    """Enumerate every way to split ``input_str`` into dictionary words.

    Args:
        input_str: The string to segment.
        words: The dictionary of valid words (any container supporting ``in``).

    Returns:
        A list of segmentations; each segmentation is a list of words whose
        concatenation equals ``input_str``.  Returns ``[[]]`` for the empty
        string (exactly one way: the empty segmentation) and ``[]`` when no
        segmentation exists.
    """
    if input_str == '':
        return [[]]
    result = []
    # Try every non-empty prefix that is a dictionary word, then recurse on
    # the remaining suffix.  Starting at 1 skips the useless empty prefix
    # (the original started at 0, which also risks infinite recursion if the
    # dictionary ever contains '').
    for i in range(1, len(input_str) + 1):
        prefix = input_str[:i]
        if prefix in words:
            for tail in full_segmentation(input_str[i:], words):
                # prefix is a str, tail is a list -> wrap prefix in a list.
                result.append([prefix] + tail)
    return result
# Example 1: a small dictionary gives exactly one segmentation.
words = ['我们', '学习', '人工', '智能']
input_str = "我们学习人工智能"

# Example 2: overlapping entries ('人工智能' vs '人工' + '智能') yield
# several alternative segmentations — note the difference from example 1.
# These assignments deliberately replace the example-1 values above.
words = ['我们', '学习', '人工', '智能', '人工智能', '未来', '是']
input_str = "我们学习人工智能人工智能是未来"

result = full_segmentation(input_str, words)
print(result)
**经典** 方法二:基于 Viterbi
创建带权重的有向图 (Directed Graph):根据输入句子和 word_prob 构建
有向图的每一条边:对应一个单词的概率(取负对数作为边的权重)
import collections
import math
import pandas
# Load the dictionary: the words sit in the first column, used as the index.
# (Inspect with df_vocabulary.index[:5].)
df_vocabulary = pandas.read_excel(
    '综合类中文词库.xlsx', sheet_name='Sheet1', header=None, index_col=0)
dic_words = set(df_vocabulary.index)  # set -> O(1) membership tests

# "Expert" unigram probabilities: a handful of common words get large
# hand-picked probabilities; every other dictionary word gets a tiny one.
word_prob = {"北京": 0.03, "的": 0.08, "天": 0.005, "气": 0.005,
             "天气": 0.06, "真": 0.04, "好": 0.03}
word_prob.update({w: 0.00001 for w in dic_words if w not in word_prob})

# Edge weights for the segmentation DAG: negative log-probability, so the
# most probable segmentation becomes the minimum-cost path.
word_score = {w: -math.log(p) for w, p in word_prob.items()}
# -------------------------------------------------------------------------------------
from collections import OrderedDict
def word_segment_viterbi(input_str, scores=None):
    """Segment ``input_str`` with the Viterbi algorithm over a word DAG.

    Step 1 builds a directed acyclic graph whose nodes are the character
    positions ``0..len(input_str)`` and whose edge ``j -> i`` carries the
    negative log-probability of the word ``input_str[j:i]``.  Step 2 finds
    the cheapest path from 0 to the end by dynamic programming; step 3
    converts that path back into words.

    Args:
        input_str: Sentence to segment.
        scores: Optional mapping word -> -log(probability).  Defaults to the
            module-level ``word_score`` table built from ``word_prob``.

    Returns:
        The most probable segmentation as a list of words; ``[]`` for an
        empty input.
    """
    if scores is None:
        scores = word_score  # module-level table built above
    if not input_str:
        return []  # max() over an empty path would raise otherwise

    # Step 1: build the DAG. graph[i][j] = cost of the word input_str[j:i]
    # (i.e. the word *ending* at position i and *starting* at j).
    def get_graph(s):
        graph = collections.OrderedDict()
        for i in range(len(s) + 1):
            graph[i] = {}
            for j in range(i):  # look backwards for every word ending at i
                word = s[j:i]
                if word in scores:
                    graph[i][j] = scores[word]
        return graph

    # Step 2: Viterbi / DP. score[i] is the cheapest total cost of
    # segmenting s[:i]; path[i] records where the last word starts, so
    # path.values() are exactly the cut positions.
    def get_best_path(graph):
        score = {0: 0}
        path = {}
        for i in graph:
            for j in graph[i]:
                # Guard `j in score`: a predecessor may itself be
                # unreachable (the original code raised KeyError here).
                if j in score and (i not in score
                                   or score[i] > graph[i][j] + score[j]):
                    score[i] = graph[i][j] + score[j]
                    path[i] = j
        return path

    # Step 3: walk the predecessor chain backwards from the end of the
    # sentence, prepending each word so the result reads front to back.
    def path_to_segment(path, s):
        segment = []
        i = max(path.keys())
        while i > 0:
            segment = [s[path[i]:i]] + segment
            i = path[i]
        return segment

    graph = get_graph(input_str)
    path = get_best_path(graph)
    # The original version forgot this return, so callers printed None.
    return path_to_segment(path, input_str)
小总结:
片段一:
# 从后往前 (walk the predecessor chain backwards)
segment = []   # 注意:必须是 list 而不是 {},因为下面用 list 的 + 做前插
i = max(path.keys())
while i > 0:
    segment = [input_str[path[i]:i]] + segment  # 和最初的 i in range(len(input_str)+1) 呼应
    i = path[i]