hmmcut

最新推荐文章于 2022-11-25 11:35:23 发布

yongquanfengjie

最新推荐文章于 2022-11-25 11:35:23 发布

阅读量161

点赞数

本文链接：https://blog.csdn.net/yongquanfengjie/article/details/107193426

版权

#!/usr/bin/env python3
# coding: utf-8
# File: hmm_cut.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-3-26

class HmmCut:
    """Word segmenter that tags each character with an HMM state via Viterbi.

    Every character of the input receives one of the four tags ``B``, ``M``,
    ``E`` or ``S``; ``cut`` closes a word after each ``E`` or ``S`` tag and
    keeps accumulating characters on ``B`` and ``M``.

    Three tables are loaded at construction time:

    * ``prob_start[state]`` -- start score of ``state``.
    * ``prob_trans[prev_state][state]`` -- transition score.
    * ``prob_emit[state][char]`` -- emission score of ``char`` in ``state``.
    """

    def __init__(self,
                 trans_path='./model/prob_trans.model',
                 emit_path='./model/prob_emit.model',
                 start_path='./model/prob_start.model'):
        """Load the transition, emission and start tables.

        Args:
            trans_path: Path of the transition table file.
            emit_path: Path of the emission table file.
            start_path: Path of the start table file.

        The defaults are relative paths, so they are resolved against the
        current working directory of the process.
        """
        self.prob_trans = self.load_model(trans_path)
        self.prob_emit = self.load_model(emit_path)
        self.prob_start = self.load_model(start_path)

    def load_model(self, model_path):
        """Load one model table from a UTF-8 text file.

        Args:
            model_path: Path of a file whose whole content is a single Python
                expression (the callers in this class index the result like a
                dict).

        Returns:
            The object produced by evaluating the file content.
        """
        # The context manager closes the file even if reading fails.
        with open(model_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # SECURITY: eval() executes whatever expression the file contains, so
        # only load model files from a trusted source. It is kept so existing
        # model files keep loading unchanged; ast.literal_eval would be a safer
        # choice if the files are known to contain plain literals only.
        return eval(content)

    def viterbi(self, obs, states, start_p, trans_p, emit_p):
        """Find the highest-scoring state sequence for ``obs``.

        Args:
            obs: Sequence of observations (here: the characters of a sentence).
            states: Iterable of state names. Must contain ``'S'``, which is
                used as the fallback predecessor when no predecessor has a
                positive score.
            start_p: Mapping ``state -> start score``.
            trans_p: Mapping ``prev_state -> {state: transition score}``;
                missing transitions count as 0.
            emit_p: Mapping ``state -> {observation: emission score}``;
                unseen observations count as 0.

        Returns:
            A ``(score, path)`` tuple where ``path`` is a list with one state
            per observation. For empty ``obs`` the result is ``(0.0, [])``.

        NOTE: scores are plain products of the supplied values. If those are
        probabilities below 1, very long inputs may underflow to 0.0, in which
        case the fallback branch below decides the path.
        """
        if len(obs) == 0:
            # Nothing to decode; avoids an IndexError on obs[0].
            return (0.0, [])

        # scores[y]: best score of any state sequence ending in y at the
        # current position; path[y]: that sequence.
        scores = {}
        path = {}
        for y in states:
            scores[y] = start_p[y] * emit_p[y].get(obs[0], 0)
            path[y] = [y]

        for t in range(1, len(obs)):
            new_scores = {}
            newpath = {}
            for y in states:
                # Only predecessors with a positive score are considered.
                candidates = [
                    (scores[y0] * trans_p[y0].get(y, 0) * emit_p[y].get(obs[t], 0), y0)
                    for y0 in states
                    if scores[y0] > 0
                ]
                if candidates:
                    # Ties on the score are broken by the larger state name.
                    prob, state = max(candidates)
                else:
                    # Every predecessor scored 0: continue from 'S'.
                    prob, state = 0.0, 'S'
                new_scores[y] = prob
                newpath[y] = path[state] + [y]
            scores, path = new_scores, newpath

        # Best final state over all states (not restricted to word-final tags).
        prob, state = max((scores[y], y) for y in states)
        return (prob, path[state])

    def cut(self, sent):
        """Segment ``sent`` into a list of words.

        Args:
            sent: The sentence to segment.

        Returns:
            A list of strings whose concatenation equals ``sent``. An empty
            input yields an empty list.
        """
        if not sent:
            return []

        _, pos_list = self.viterbi(sent, ('B', 'M', 'E', 'S'),
                                   self.prob_start, self.prob_trans, self.prob_emit)
        seglist = []
        word = []
        for char, tag in zip(sent, pos_list):
            word.append(char)
            if tag in ('E', 'S'):
                # 'E' and 'S' both close the word being built.
                seglist.append(''.join(word))
                word = []
        if word:
            # The decoded path may end in 'B' or 'M'; emit the unfinished word
            # instead of silently dropping its characters.
            seglist.append(''.join(word))
        return seglist

    # Test / demo. The original `def test(self):` wrapper is commented out,
    # so the statements below run at module level.
# Demo input. The first assignment is immediately superseded by the second;
# the commented-out lines are alternative sample sentences.
sent = '维特比算法viterbi的简单实现 python版'
sent = '目前在自然语言处理技术中，中文处理技术比西文处理技术要落后很大一段距离，许多西文的处理方法中文不能直接采用，就是因为中文必需有分词这道工序。中文分词是其他中文信息处理的基础，搜索引擎只是中文分词的一个应用。'
# sent = '北京大学学生前来应聘'
# sent = '新华网驻东京记者报道'
# sent = '我们在野生动物园玩'

# Build the segmenter (this loads the model files), segment the sentence and
# print the resulting list of words.
cuter = HmmCut()
seglist = cuter.cut(sent)
print(seglist)


# HmmCut()

yongquanfengjie

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hmmcut

#!/usr/bin/env python3# coding: utf-8# File: hmm_cut.py# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io># Date: 18-3-26class HmmCut: def __init__(self): trans_path = './model/prob_trans.model' emit_path = './mod
复制链接

扫一扫