Viterbi — Solution: DP (dynamic programming)
中文分词 (Chinese word segmentation)
**Part 1 分词:** 把句子所有可能的划分方式枚举出来
方法一:基于枚举(递归)的方式
# recursive segmentation
def full_segmentation(input_str, words):
    """Enumerate every way to split ``input_str`` into dictionary words.

    Args:
        input_str: The string to segment.
        words: The dictionary of valid words (any container supporting ``in``).

    Returns:
        A list of segmentations; each segmentation is a list of words whose
        concatenation equals ``input_str``.  Returns ``[[]]`` for the empty
        string (exactly one way: the empty segmentation) and ``[]`` when no
        segmentation exists.
    """
    if input_str == '':
        return [[]]
    result = []
    # Try every non-empty prefix that is a dictionary word, then recurse on
    # the remaining suffix.  Starting at 1 skips the useless empty prefix
    # (the original started at 0, which also risks infinite recursion if the
    # dictionary ever contains '').
    for i in range(1, len(input_str) + 1):
        prefix = input_str[:i]
        if prefix in words:
            for tail in full_segmentation(input_str[i:], words):
                # prefix is a str, tail is a list -> wrap prefix in a list.
                result.append([prefix] + tail)
    return result
# Example 1: a small dictionary gives exactly one segmentation.
words = ['我们', '学习', '人工', '智能']
input_str = "我们学习人工智能"

# Example 2: overlapping entries ('人工智能' vs '人工' + '智能') yield
# several alternative segmentations — note the difference from example 1.
# These assignments deliberately replace the example-1 values above.
words = ['我们', '学习', '人工', '智能', '人工智能', '未来', '是']
input_str = "我们学习人工智能人工智能是未来"

result = full_segmentation(input_str, words)
print(result)
**经典** 方法二:基于 Viterbi
创建带权重的有向图 (Directed Graph):根据输入句子和 word_prob 构建
有向图的每一条边:对应一个单词的概率(取负对数作为边的权重)
import collections
import math
import pandas
# Load the dictionary: the words sit in the first column, used as the index.
# (Inspect with df_vocabulary.index[:5].)
df_vocabulary = pandas.read_excel(
    '综合类中文词库.xlsx', sheet_name='Sheet1', header=None, index_col=0)
dic_words = set(df_vocabulary.index)  # set -> O(1) membership tests

# "Expert" unigram probabilities: a handful of common words get large
# hand-picked probabilities; every other dictionary word gets a tiny one.
word_prob = {"北京": 0.03, "的": 0.08, "天": 0.005, "气": 0.005,
             "天气": 0.06, "真": 0.04, "好": 0.03}
word_prob.update({w: 0.00001 for w in dic_words if w not in word_prob})

# Edge weights for the segmentation DAG: negative log-probability, so the
# most probable segmentation becomes the minimum-cost path.
word_score = {w: -math.log(p) for w, p in word_prob.items()}
# -------------------------------------------------------------------------------------
from collections import OrderedDict
def word_segment_viterbi(input_str, scores=None):
    """Segment ``input_str`` with the Viterbi algorithm over a word DAG.

    Step 1 builds a directed acyclic graph whose nodes are the character
    positions ``0..len(input_str)`` and whose edge ``j -> i`` carries the
    negative log-probability of the word ``input_str[j:i]``.  Step 2 finds
    the cheapest path from 0 to the end by dynamic programming; step 3
    converts that path back into words.

    Args:
        input_str: Sentence to segment.
        scores: Optional mapping word -> -log(probability).  Defaults to the
            module-level ``word_score`` table built from ``word_prob``.

    Returns:
        The most probable segmentation as a list of words; ``[]`` for an
        empty input.
    """
    if scores is None:
        scores = word_score  # module-level table built above
    if not input_str:
        return []  # max() over an empty path would raise otherwise

    # Step 1: build the DAG. graph[i][j] = cost of the word input_str[j:i]
    # (i.e. the word *ending* at position i and *starting* at j).
    def get_graph(s):
        graph = collections.OrderedDict()
        for i in range(len(s) + 1):
            graph[i] = {}
            for j in range(i):  # look backwards for every word ending at i
                word = s[j:i]
                if word in scores:
                    graph[i][j] = scores[word]
        return graph

    # Step 2: Viterbi / DP. score[i] is the cheapest total cost of
    # segmenting s[:i]; path[i] records where the last word starts, so
    # path.values() are exactly the cut positions.
    def get_best_path(graph):
        score = {0: 0}
        path = {}
        for i in graph:
            for j in graph[i]:
                # Guard `j in score`: a predecessor may itself be
                # unreachable (the original code raised KeyError here).
                if j in score and (i not in score
                                   or score[i] > graph[i][j] + score[j]):
                    score[i] = graph[i][j] + score[j]
                    path[i] = j
        return path

    # Step 3: walk the predecessor chain backwards from the end of the
    # sentence, prepending each word so the result reads front to back.
    def path_to_segment(path, s):
        segment = []
        i = max(path.keys())
        while i > 0:
            segment = [s[path[i]:i]] + segment
            i = path[i]
        return segment

    graph = get_graph(input_str)
    path = get_best_path(graph)
    # The original version forgot this return, so callers printed None.
    return path_to_segment(path, input_str)
小总结:
片段一:
# 从后往前 (walk the predecessor chain backwards)
segment = []   # 注意:必须是 list 而不是 {},因为下面用 list 的 + 做前插
i = max(path.keys())
while i > 0:
    segment = [input_str[path[i]:i]] + segment  # 和最初的 i in range(len(input_str)+1) 呼应
    i = path[i]