(1) Algorithm Introduction
A Hidden Markov Model (HMM) is a probabilistic model for estimating the hidden events that lie behind observable ones. It generally consists of an observation sequence, a hidden state sequence, a transition probability distribution, an emission probability distribution, and an initial state distribution.
The Viterbi algorithm is the decoding algorithm for HMMs: it takes the highest-probability state at the final time step and traces backwards through the best predecessor stored for each partial optimal path, thereby recovering the single most likely state sequence as a whole.
(2) Algorithm Principle
Assume the observation sequence has length T and there are N hidden states. Let Viterbi[s,t] denote the maximum probability over all paths that end in state s at time t, let a_{i,j} denote the transition probability from hidden state i to hidden state j, and let b_s(O_t) denote the probability that hidden state s emits the observation O_t at time t.
1. Create two tables with N+2 rows and T columns: Viterbi[N+2, T] to record the probabilities and backpointer[N+2, T] to record the back pointers (the two extra rows are for the start and end pseudo-states).
2. Initialize the first column over the N real states: Viterbi[s,1] = a_{0,s} * b_s(O_1), i.e. at time t = 1 compute, for each of the N states, the probability of starting in that state and producing the first observation; set backpointer[s,1] = 0.
3. Recursion: for t = 2, 3, …, T and every state s, compute
Viterbi[s,t] = max_{i=1,…,N} Viterbi[i,t-1] * a_{i,s} * b_s(O_t)
backpointer[s,t] = argmax_{i=1,…,N} Viterbi[i,t-1] * a_{i,s}
The first formula fixes t and s and lets i = 1, 2, 3, …, N range over all possible states at time t-1; it takes the highest probability of reaching state s at time t from any of them, and multiplying by the emission probability gives the highest probability that state s produces observation O_t at time t. The argmax records which state at time t-1 most probably transitions into state s at time t. Applying these two formulas column by column gives the most likely state at every time step.
4. Take the most probable state at time T, then follow the back pointers to recover the most probable states at t = T-1, T-2, …, 2, 1.
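Before turning to the corpus-specific code below, here is a minimal, self-contained sketch of the recursion just described. Everything in it (the function name viterbi_decode and the toy A, B, pi values) is illustrative and assumed for the example only, not taken from the corpus code that follows; it drops the extra start/end rows, works directly with N states, and uses log probabilities.
import numpy as np

def viterbi_decode(obs, A, B, pi):
    """Toy Viterbi decoder: A[i, j] = P(j | i), B[s, o] = P(o | s), pi[s] = P(s at t=1)."""
    N, T = A.shape[0], len(obs)
    viterbi = np.zeros((N, T))            # viterbi[s, t]: best log-prob of any path ending in state s at time t
    backpointer = np.zeros((N, T), dtype=int)
    viterbi[:, 0] = np.log(pi) + np.log(B[:, obs[0]])          # step 2: initialization
    for t in range(1, T):                                       # step 3: recursion
        for s in range(N):
            scores = viterbi[:, t - 1] + np.log(A[:, s])        # all ways of reaching state s at time t
            backpointer[s, t] = np.argmax(scores)               # remember the best predecessor
            viterbi[s, t] = scores.max() + np.log(B[s, obs[t]])
    path = [int(np.argmax(viterbi[:, T - 1]))]                  # step 4: backtrack from the best final state
    for t in range(T - 1, 0, -1):
        path.append(int(backpointer[path[-1], t]))
    return list(reversed(path))

# Illustrative 2-state, 3-observation example (all values are made up):
A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])
pi = np.array([0.6, 0.4])
print(viterbi_decode([0, 1, 2], A, B, pi))  # e.g. [0, 0, 1]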
Corpus link: link, password: 885p
Data preprocessing:
# Data preprocessing: the timestamp at the start of every news article is redundant,
# and keeping it would bias the learned transition matrix towards that pattern.
data_file = './dependency/199801.txt'
fw = open('./dependency/new199801.txt', 'w', encoding='utf-8')
with open(data_file, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip() != '':
            line = line.strip().split(' ')
            temp_line = []
            for word in line[1:]:  # drop the first token, i.e. the People's Daily timestamp
                if word.strip() != '':
                    temp_line.append(word)
            sentence = ' '.join(temp_line)
            print(sentence)
            fw.write(sentence + '\n')
fw.close()
Obtaining the set of POS tags:
def get_set_tag():
    data_file = './dependency/new199801.txt'
    all_pos = ['hos', 'eos']  # sentence-start and sentence-end pseudo tags
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            for w_p in word_pos:
                w_p = w_p.strip()
                if w_p == '':
                    continue
                w_p = w_p.replace('[', '')
                p = w_p.split('/')[1]
                if ']' in w_p:  # a ']' marks the end of a compound word, e.g. 大会堂/n]ns yields both n and ns
                    p1 = p.split(']')[0]
                    p2 = p.split(']')[1]
                    if p1 not in all_pos:
                        all_pos.append(p1)
                    if p2 not in all_pos:
                        all_pos.append(p2)
                else:
                    if p not in all_pos:  # only keep each tag once, since this is the tag set
                        all_pos.append(p)
    with open('./dependency/pos_tags.txt', 'w', encoding='utf-8') as f:
        for pos in all_pos:
            f.write(pos + '\n')
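If useful, a quick sanity check on the generated tag file might look like the sketch below (assuming get_set_tag() and the file paths above; the exact tag count depends on the corpus):
get_set_tag()
with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
    tags = [line.strip() for line in fr if line.strip()]
print(len(tags), tags[:10])             # number of distinct tags and a small sample
assert len(tags) == len(set(tags))      # the tag list should contain no duplicates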
Obtaining the transition matrix:
import pandas as pd
import pickle


def a_process():
    data_file = './dependency/new199801.txt'
    pos = []
    A_dict = {}
    with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            pos.append(line.strip())
    A_matrix = [[0 for i in range(len(pos))] for j in range(len(pos))]
    print(len(A_matrix))  # should equal the number of tags
    # Initialize every (tag, tag) pair with a zero count
    for p1 in pos:
        for p2 in pos:
            A_dict[p1, p2] = 0
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            for index in range(len(word_pos)):
                w_p = word_pos[index].strip()
                next_index = 1
                if w_p != '' and w_p != 'EOS/eos':  # the final EOS token has no successor, so it is skipped
                    next_w_p = word_pos[index + next_index].strip()
                    while next_w_p == '':
                        next_index += 1
                        next_w_p = word_pos[index + next_index].strip()
                    p = w_p.split('/')[1]
                    next_p = next_w_p.split('/')[1]
                    if '[' in w_p and ']' in next_p:  # compound words such as [人民/n 大会堂/n]ns: strip the closing bracket from the next tag
                        next_p1 = next_p.split(']')[0]
                        A_dict[next_p1, p] += 1
                    elif '[' in w_p and ']' not in next_p:
                        A_dict[next_p, p] += 1
                    elif ']' in w_p:  # current token ends a compound word, so count both of its tags
                        p1 = p.split(']')[0]
                        p2 = p.split(']')[1]
                        A_dict[next_p, p1] += 1
                        A_dict[next_p, p2] += 1
                    elif ']' in next_w_p:
                        next_p1 = next_p.split(']')[0]
                        A_dict[next_p1, p] += 1
                    else:
                        A_dict[next_p, p] += 1
    for key, value in A_dict.items():
        next_tag, cur_tag = key
        A_matrix[pos.index(cur_tag)][pos.index(next_tag)] = value  # rows are the current tag, columns the next tag
    result_to_csv = []
    for p, a in zip(pos, A_matrix):
        temp_a = [str(n) for n in a]
        result_to_csv.append([p] + temp_a)
    result_to_csv = pd.DataFrame(result_to_csv)
    result_to_csv.to_csv('./dependency/A.csv', index=None, header=['0'] + pos)  # optional: CSV export for inspection
    with open('./dependency/A.pickle', 'wb') as fb:
        pickle.dump(A_matrix, fb)
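As a rough check that the counts look sensible, one might load the pickle back and inspect the row for the sentence-start tag; this sketch only assumes the files written by a_process() above:
a_process()
with open('./dependency/A.pickle', 'rb') as fb:
    A_matrix = pickle.load(fb)
with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
    pos = [line.strip() for line in fr if line.strip()]
assert len(A_matrix) == len(pos)            # one row (and one column) per tag
hos_row = A_matrix[pos.index('hos')]        # counts of tags that begin a sentence
print(pos[hos_row.index(max(hos_row))])     # the most common sentence-initial tag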
Obtaining the emission matrix:
def B_process():
    data_file = './dependency/new199801.txt'
    pos = []
    B = {}
    with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            pos.append(line.strip())
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            temp_word = ''
            for w_p in word_pos:
                w_p = w_p.strip()
                if w_p == '':
                    continue
                starts_compound = '[' in w_p  # detect compound-word starts such as [人民/n before the bracket is stripped
                w_p = w_p.replace('[', '')
                w = w_p.split('/')[0]
                p = w_p.split('/')[1]
                if starts_compound:
                    # Start of a compound word like [人民/n 大会堂/n]ns: begin accumulating its surface form
                    temp_word += w
                    if w not in B.keys():
                        B[w] = [0 for i in range(len(pos))]  # initialize the emission-count row for this word
                    B[w][pos.index(p)] += 1
                elif ']' in w_p:
                    # End of a compound word: record both 大会堂/n and 人民大会堂/ns
                    p1 = p.split(']')[0]
                    p2 = p.split(']')[1]
                    temp_word += w
                    if temp_word not in B.keys():
                        B[temp_word] = [0 for i in range(len(pos))]
                    B[temp_word][pos.index(p2)] += 1
                    if w not in B.keys():
                        B[w] = [0 for i in range(len(pos))]
                    B[w][pos.index(p1)] += 1
                    temp_word = ''
                else:
                    if temp_word:  # a word inside a compound: keep accumulating the surface form
                        temp_word += w
                    if w not in B.keys():
                        B[w] = [0 for i in range(len(pos))]
                    B[w][pos.index(p)] += 1
    result_to_csv = []
    for key, value in B.items():
        result_to_csv.append([key] + value)  # a word followed by its emission-count row
    result_to_csv = pd.DataFrame(result_to_csv)
    result_to_csv.to_csv('./dependency/B.csv', index=None, header=['0'] + pos)  # optional: CSV export for inspection
    with open('./dependency/B.pickle', 'wb') as fb:
        pickle.dump(B, fb)
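Similarly, the emission counts can be spot-checked by loading B.pickle and looking up a single word; the word '人民' here is only an assumed example and may or may not be present depending on the corpus:
B_process()
with open('./dependency/B.pickle', 'rb') as fb:
    B = pickle.load(fb)
with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
    pos = [line.strip() for line in fr if line.strip()]
counts = B.get('人民')                      # emission-count row for one word, if present
if counts is not None:
    print(pos[counts.index(max(counts))])   # its most frequent tag in the corpus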
HMM + Viterbi algorithm:
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import pickle
import math
import numpy as np
import PMWS


class HMM(object):
    def __init__(self):
        self.A_matrix = None  # transition count matrix
        self.B_matrix = None  # emission count matrix
        self.pos = []         # list of POS tags
        self.A = []           # transition log-probability matrix
        self.B = {}           # emission log-probability matrix
        self.initialize()

    def read_matrix(self):
        with open('./dependency/A.pickle', 'rb') as fw:
            self.A_matrix = pickle.load(fw)  # A is the transition count matrix
        with open('./dependency/B.pickle', 'rb') as fw:
            self.B_matrix = pickle.load(fw)  # B is the emission count matrix

    def read_pos_tags(self):
        with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
            for line in fr:
                self.pos.append(line.strip())

    def get_probability_matrix(self):
        # Add-one smoothing keeps unseen (zero-count) entries from producing log(0)
        for a in self.A_matrix:
            total = sum(a) + len(a)
            self.A.append([math.log((num + 1) / total) for num in a])
        for key, value in self.B_matrix.items():
            total = sum(value) + len(value)
            self.B[key] = [math.log((v + 1) / total) for v in value]

    def initialize(self):
        self.read_matrix()
        self.read_pos_tags()
        self.get_probability_matrix()
    def get_result(self, cws):
        cws = cws + ' EOS'
        first_pos = ''
        all_word_pro = {}
        viterbi_dict = {}
        result_1 = ''
        result_2 = ''
        observe_seq = cws.split(' ')
        for index in range(len(observe_seq)):
            temp_pro = []
            word = observe_seq[index]
            if index == 0:
                b = self.B.get(word)
                if not b:
                    b = np.zeros((len(self.pos)))
                else:
                    b = np.array(b)
                pos_index = self.pos.index('hos')  # 'hos' is the sentence-start tag, so the initial value is P(tag | hos)
                pos_parse_num = self.A_matrix[pos_index]
                # initial log-probabilities, add-one smoothed to avoid log(0)
                PI = [math.log((p + 1) / (sum(pos_parse_num) + len(pos_parse_num))) for p in pos_parse_num]
                assert len(b) == len(PI)
                for w, p in zip(b, PI):
                    temp_pro.append(w + p)
                word_pos = self.pos[int(np.argmax(temp_pro, axis=0))]
                all_word_pro[word, index] = temp_pro
                first_pos = word_pos
                result_1 += word + '/' + word_pos + ' '
            else:
                last_v = all_word_pro[observe_seq[index - 1], index - 1]
                b = self.B.get(word)
                if not b:
                    b = np.zeros((len(self.pos)))
                else:
                    b = np.array(b)
                last_v = np.array(last_v)
                word_pos = ''
                pos_pro = None
                count_num = 0  # index of the previous word's POS tag
                for a in self.A:
                    a = np.array(a)
                    temp_pos_pro = last_v + a + b
                    temp_pro.append(max(temp_pos_pro))
                    temp_pos_index = np.argmax(temp_pos_pro, axis=0)  # index of the current word's POS tag
                    if pos_pro is None or max(temp_pos_pro) > pos_pro:
                        pos_pro = max(temp_pos_pro)
                        word_pos = self.pos[int(temp_pos_index)]
                    # The current word sits at position index; given the previous tag index count_num,
                    # the tag index with the largest accumulated probability is temp_pos_index (used for backtracking)
                    viterbi_dict[word, index, self.pos[count_num]] = self.pos[int(temp_pos_index)]
                    count_num += 1
                all_word_pro[word, index] = temp_pro
                if word != 'EOS':
                    result_1 += word + '/' + word_pos + ' '
        # The best tag at the last (EOS) position, used as the starting point for backtracking
        final_word_pos = None
        for key, value in all_word_pro.items():
            if key[1] == len(observe_seq) - 1:
                final_word_pos = self.pos[int(np.argmax(value))]
        result_pos = []
        last_pos = final_word_pos
        # Backtracking
        for index in reversed(range(1, len(observe_seq) - 1)):
            last_pos = viterbi_dict[observe_seq[index], index, last_pos]
            result_pos.append(observe_seq[index] + '/' + last_pos)
        result_pos.append(observe_seq[0] + '/' + first_pos)
        result_pos.reverse()
        result_2 = ' '.join(result_pos)
        return result_1.strip(), result_2.strip()
if __name__ == '__main__':
    sentence = '通过输入不同的测试样例,输出算法实现结果,并做截图演示和文字介绍。'
    pmws = PMWS.Pwms()  # maximum-probability word segmentation
    cws_result = pmws.get_result(sentence)
    print('Segmentation result:', cws_result)
    hmm = HMM()
    pos_result = hmm.get_result(cws_result)
    print('Per-step best POS tagging result:', pos_result[0])
    print('Viterbi POS tagging result:', pos_result[1])
The maximum-probability word segmentation algorithm (PMWS) used here comes from another article of mine.
Thoughts: this time I wanted to implement the algorithm with simple, if somewhat verbose, statements, working through the principle while implementing it, so the code ended up fairly complicated; the corpus format also plays a part in that. If there are any mistakes, please point them out, thank you.