#!/usr/bin/env python3
# coding: utf-8
# File: hmm_cut.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-3-26
class HmmCut:
    """Character-level HMM word segmenter decoded with the Viterbi algorithm.

    Every character of the input is tagged with one of four states,
    ``'B'``, ``'M'``, ``'E'`` or ``'S'``; :meth:`cut` closes a word after each
    ``'E'`` or ``'S'`` tag and keeps accumulating characters on ``'B'``/``'M'``.

    Attributes:
        prob_trans: ``{state: {next_state: probability}}`` transition table.
        prob_emit: ``{state: {character: probability}}`` emission table.
        prob_start: ``{state: probability}`` initial-state table.
    """

    def __init__(self):
        """Load the three probability tables from the ``./model`` directory.

        The paths are relative, so they are resolved against the current
        working directory of the process.
        """
        trans_path = './model/prob_trans.model'
        emit_path = './model/prob_emit.model'
        start_path = './model/prob_start.model'
        self.prob_trans = self.load_model(trans_path)
        self.prob_emit = self.load_model(emit_path)
        self.prob_start = self.load_model(start_path)

    def load_model(self, model_path):
        """Read a model file and return the Python literal it contains.

        The file is expected to hold a single Python literal (in practice a
        ``dict``) encoded as UTF-8.

        Args:
            model_path: Path of the model file to read.

        Returns:
            The parsed literal.

        Raises:
            OSError: If the file cannot be opened or read.
            ValueError, SyntaxError: If the content is not a plain literal.
        """
        # Local import: the file has no import section of its own.
        import ast

        # ``with`` guarantees the handle is closed even if reading/parsing fails.
        with open(model_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # SECURITY: this used to be ``eval(content)``, which executes arbitrary
        # code stored in the model file. ``ast.literal_eval`` only accepts
        # literals (dicts, lists, strings, numbers, ...), which is all a
        # probability table needs.
        return ast.literal_eval(content.strip())

    def viterbi(self, obs, states, start_p, trans_p, emit_p):
        """Find the most probable state sequence for ``obs``.

        A missing transition or emission entry counts as probability 0
        (looked up with ``.get(key, 0)``), so unseen characters get zero
        probability in every state.

        Args:
            obs: Sequence of observations (the characters of a sentence).
            states: Iterable of state names; must contain ``'S'``, which is
                used as the fallback predecessor when no path has non-zero
                probability.
            start_p: ``{state: probability}`` for the first position.
            trans_p: ``{state: {next_state: probability}}``.
            emit_p: ``{state: {observation: probability}}``.

        Returns:
            A ``(prob, path)`` tuple: the probability of the best path and the
            list of states, one per observation. Empty ``obs`` yields
            ``(0.0, [])``.
        """
        # Guard: the initialisation below reads obs[0].
        if not obs:
            return (0.0, [])

        # prev_prob[y]: best probability of any path ending in state y at the
        # previous position; path[y]: the state sequence achieving it.
        prev_prob = {}
        path = {}
        for y in states:
            prev_prob[y] = start_p[y] * emit_p[y].get(obs[0], 0)
            path[y] = [y]

        # NOTE(review): probabilities are multiplied directly, so on long
        # inputs they can underflow to 0.0, after which the fallback branch
        # below is taken for every state. Switching to log-space would avoid
        # this but would change the returned ``prob`` value.
        for t in range(1, len(obs)):
            cur_prob = {}
            newpath = {}
            for y in states:
                emit = emit_p[y].get(obs[t], 0)
                # Only predecessors that are still reachable are considered.
                candidates = [
                    (prev_prob[y0] * trans_p[y0].get(y, 0) * emit, y0)
                    for y0 in states
                    if prev_prob[y0] > 0
                ]
                if candidates:
                    # Tuples compare by probability first, then state name.
                    prob, state = max(candidates)
                else:
                    # No reachable predecessor: extend the 'S' path with
                    # probability 0 so that decoding can continue.
                    prob, state = 0.0, 'S'
                cur_prob[y] = prob
                newpath[y] = path[state] + [y]
            prev_prob = cur_prob
            path = newpath  # keep only the best path per ending state

        # Best path overall: highest probability at the last position.
        prob, state = max((prev_prob[y], y) for y in states)
        return (prob, path[state])

    def cut(self, sent):
        """Segment ``sent`` into a list of words.

        Args:
            sent: The text to segment.

        Returns:
            List of word strings whose concatenation equals ``sent``; an empty
            list for empty input.
        """
        if not sent:
            return []

        _, pos_list = self.viterbi(
            sent, ('B', 'M', 'E', 'S'),
            self.prob_start, self.prob_trans, self.prob_emit)

        seglist = []
        word = []
        for char, tag in zip(sent, pos_list):
            word.append(char)
            # 'S' and 'E' both terminate the word being built.
            if tag in ('S', 'E'):
                seglist.append(''.join(word))
                word = []
        # The best path may end in 'B' or 'M'; flush the pending characters
        # instead of silently dropping them.
        if word:
            seglist.append(''.join(word))
        return seglist
# Demo: segment a sample sentence and print the resulting word list.
# These statements run at module level, so they also execute on import:
# HmmCut() reads the three model files under ./model.
# def test(self):
# NOTE: this first sample is overwritten by the assignment on the next line.
sent = '维特比算法viterbi的简单实现 python版'
sent = '''目前在自然语言处理技术中,中文处理技术比西文处理技术要落后很大一段距离,许多西文的处理方法中文不能直接采用,就是因为中文必需有分词这道工序。中文分词是其他中文信息处理的基础,搜索引擎只是中文分词的一个应用。'''
# Alternative sample sentences (uncomment one to try it):
# sent = '北京大学学生前来应聘'
# sent = '新华网驻东京记者报道'
# sent = '我们在野生动物园玩'
cuter = HmmCut()
seglist = cuter.cut(sent)
print(seglist)
# HmmCut()
# hmmcut
# Latest recommended article published on 2022-11-25 11:35:23