Implementing the hidden Markov model and the Viterbi algorithm in Python for Chinese POS tagging

(1) Algorithm overview
The hidden Markov model (HMM) is a probabilistic model for estimating the hidden events that lie behind observable ones. It generally consists of an observation sequence, a hidden-state sequence, a transition probability distribution, an emission probability distribution, and an initial state distribution.
The Viterbi algorithm is a decoding algorithm for HMMs: it starts from the highest-probability state at the final time step and traces backward through the recorded best partial paths, one step at a time, to recover the single most likely state sequence.
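To make these components concrete, here is a toy example in Python. Every name and number below is made up for illustration; none of it comes from the corpus used later:

# A toy HMM with two hidden states (POS tags) and three observable words.
states = ['n', 'v']                          # hidden states: noun, verb
observations = ['希望', '充满', '世纪']      # observable events (words)
pi = {'n': 0.6, 'v': 0.4}                    # initial state distribution
A = {('n', 'n'): 0.3, ('n', 'v'): 0.7,       # transition probabilities P(next tag | current tag)
     ('v', 'n'): 0.8, ('v', 'v'): 0.2}
B = {('n', '希望'): 0.5, ('n', '充满'): 0.1, ('n', '世纪'): 0.4,   # emission probabilities P(word | tag)
     ('v', '希望'): 0.2, ('v', '充满'): 0.7, ('v', '世纪'): 0.1}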

(2) Algorithm principle
Assume the observation sequence has length T and there are N hidden states. Let Viterbi[s,t] denote the maximum probability of any state path that ends in state s at time t, let $a_{i,j}$ denote the transition probability from hidden state i to hidden state j, and let $b_s(O_t)$ denote the emission probability of observation $O_t$ given hidden state s.
1. Create two tables with N+2 rows (the N states plus start and end pseudo-states) and T columns: Viterbi[N+2, T] records probabilities and backpointer[N+2, T] records best predecessors.
2. Initialize the first N rows of the first column: $Viterbi[s,1] = a_{0,s} \cdot b_s(O_1)$, i.e. at time t = 1 compute, for each of the N states, the probability of producing the first observation, and set backpointer[s,1] = 0.
3. For t = 2, 3, …, T, fill both tables column by column using the recurrences
$$Viterbi[s,t] = \max_{i=1}^{N} Viterbi[i,t-1] \cdot a_{i,s} \cdot b_s(O_t)$$
$$backpointer[s,t] = \mathop{\arg\max}_{i=1}^{N} Viterbi[i,t-1] \cdot a_{i,s}$$
In the first formula, with t and s fixed, i = 1, 2, …, N ranges over all possible states at time t-1; taking the maximum over the transitions into state s and multiplying by the emission probability gives the highest probability that state s produces observation $O_t$ at time t. The argmax records which state at time t-1 most plausibly transitioned into state s at time t. Applying these two formulas column by column yields the most likely predecessor of every state at every time step.
4. Take the most likely state at time T and follow the backpointers to recover the most likely states at t = T-1, T-2, …, 2, 1. A minimal sketch of these four steps follows.
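The sketch below implements the four steps above on the toy model from section (1), expressed as NumPy arrays; the names and probabilities are illustrative only, not the corpus statistics:

import numpy as np

def viterbi(obs_idx, pi, A, B):
    """obs_idx: list of observation indices; pi: (N,) initial probabilities;
    A: (N, N) transitions, A[i, j] = P(j | i); B: (N, V) emissions."""
    N, T = A.shape[0], len(obs_idx)
    V = np.zeros((N, T))           # V[s, t]: best path probability ending in state s at time t
    back = np.zeros((N, T), int)   # back[s, t]: best predecessor of state s at time t
    V[:, 0] = pi * B[:, obs_idx[0]]                  # step 2: initialization
    for t in range(1, T):                            # step 3: recurrence
        for s in range(N):
            scores = V[:, t - 1] * A[:, s]
            back[s, t] = np.argmax(scores)
            V[s, t] = scores[back[s, t]] * B[s, obs_idx[t]]
    path = [int(np.argmax(V[:, -1]))]                # step 4: backtracking
    for t in range(T - 1, 0, -1):
        path.append(int(back[path[-1], t]))
    return list(reversed(path))

pi = np.array([0.6, 0.4])                    # toy values from section (1)
A = np.array([[0.3, 0.7], [0.8, 0.2]])
B = np.array([[0.5, 0.1, 0.4], [0.2, 0.7, 0.1]])
print(viterbi([1, 0, 2], pi, A, B))          # -> [1, 0, 0]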

Corpus link: link, password: 885p
Data processing:

# Data processing. The timestamp ID at the start of each news item is redundant;
# keeping it would bias the learned transition matrix toward that pattern.
data_file = './dependency/199801.txt'
fw = open('./dependency/new199801.txt', 'w', encoding='utf-8')
with open(data_file, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip() != '':
            line = line.strip().split(' ')
            temp_line = []
            for word in line[1:]:  # drop the first token, the People's Daily timestamp ID
                if word.strip() != '':
                    temp_line.append(word)
            sentence = ' '.join(temp_line)
            print(sentence)
            fw.write(sentence + '\n')
fw.close()
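For reference, a raw corpus line looks roughly like the following (quoted from memory of the PKU People's Daily 199801 format, so treat it as approximate); the script above simply drops the leading ID token:

19980101-01-001-001/m  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  ——/w  一九九八年/t  新年/t  讲话/n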

Obtaining the POS tag set:

def get_set_tag():
    data_file = './dependency/new199801.txt'
    all_pos = ['hos', 'eos']  # sentence-start and sentence-end pseudo-tags
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:  # skip blank lines before adding the pseudo-tokens
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            for w_p in word_pos:
                w_p = w_p.strip()
                if w_p == '':
                    continue
                w_p = w_p.replace('[', '')
                p = w_p.split('/')[1]
                if ']' in w_p:  # ']' marks the end of a compound word, e.g. 大会堂/n]ns yields both n and ns
                    p1 = p.split(']')[0]
                    p2 = p.split(']')[1]
                    if p1 not in all_pos:
                        all_pos.append(p1)
                    if p2 not in all_pos:
                        all_pos.append(p2)
                else:
                    if p not in all_pos:  # collect each tag only once; later code indexes into this list
                        all_pos.append(p)
    with open('./dependency/pos_tags.txt', 'w', encoding='utf-8') as f:
        for pos in all_pos:
            f.write(pos + '\n')

Obtaining the transition matrix:

import pandas as pd
import pickle


def a_process():
    data_file = './dependency/new199801.txt'
    pos = []
    A_dict = {}
    with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            pos.append(line.strip())
    A_matrix = [[0 for i in range(len(pos))] for j in range(len(pos))]
    print(len(A_matrix))
    # initialize every tag-pair count to zero
    for p1 in pos:
        for p2 in pos:
            A_dict[p1, p2] = 0
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            for index in range(len(word_pos)):
                w_p = word_pos[index].strip()
                next_index = 1
                if w_p != '' and w_p != 'EOS/eos':  # the EOS token has no successor (index == len-1), so skip it
                    next_w_p = word_pos[index + next_index].strip()
                    while next_w_p == '':
                        next_index += 1
                        next_w_p = word_pos[index + next_index].strip()
                    p = w_p.split('/')[1]
                    next_p = next_w_p.split('/')[1]
                    if '[' in w_p and ']' in next_p:  # two-token compound: this token opens the bracket and the next closes it, so use the inner tag of the next token
                        next_p1 = next_p.split(']')[0]
                        A_dict[next_p1, p] += 1
                    elif '[' in w_p and ']' not in next_p:
                        A_dict[next_p, p] += 1
                    elif ']' in w_p:
                        p1 = p.split(']')[0]
                        p2 = p.split(']')[1]
                        A_dict[next_p, p1] += 1
                        A_dict[next_p, p2] += 1
                    elif ']' in next_w_p:
                        next_p1 = next_p.split(']')[0]
                        A_dict[next_p1, p] += 1
                    else:
                        A_dict[next_p, p] += 1
    for key, value in A_dict.items():
        p1 = key[0]
        p2 = key[1]
        A_matrix[pos.index(p2)][pos.index(p1)] = value  # transition matrix: row = current tag, column = next tag
    result_to_csv = []
    for p, a in zip(pos, A_matrix):
        temp_a = [str(n) for n in a]
        result_to_csv.append([p] + temp_a)
    result_to_csv = pd.DataFrame(result_to_csv)
    result_to_csv.to_csv('./dependency/A.csv', index=None, header=['0'] + pos)  # optional CSV export for inspection
    with open('./dependency/A.pickle', 'wb') as fb:
        pickle.dump(A_matrix, fb)
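A quick way to sanity-check the result (a hypothetical snippet, assuming A.pickle and pos_tags.txt were written as above): load the counts and look at the most frequent successors of a tag:

import pickle

with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as f:
    pos = [line.strip() for line in f]
with open('./dependency/A.pickle', 'rb') as f:
    A_matrix = pickle.load(f)

row = A_matrix[pos.index('v')]                       # counts of tags following 'v'
top = sorted(zip(pos, row), key=lambda x: -x[1])[:5]
print(top)  # the five tags that most often follow a verb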

Obtaining the emission matrix:

def B_process():
    data_file = './dependency/new199801.txt'
    pos = []
    B = {}
    with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            pos.append(line.strip())
    with open(data_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if len(line) == 0:  # skip blank lines before adding the pseudo-tokens
                continue
            line = 'HOS/hos ' + line + ' EOS/eos'
            word_pos = line.split(' ')
            temp_word = ''
            for w_p in word_pos:
                w_p = w_p.strip()
                if w_p == '':
                    continue
                has_open = '[' in w_p  # test before stripping '[', otherwise this case can never be detected
                w_p = w_p.replace('[', '')
                w = w_p.split('/')[0]
                p = w_p.split('/')[1]
                if has_open:  # the corpus marks compounds like [人民/n 大会堂/n]ns; '[' opens one, so start accumulating the full word
                    temp_word += w
                    if w not in B:
                        B[w] = [0 for i in range(len(pos))]  # initialize this word's emission-count row
                    B[w][pos.index(p)] += 1
                elif ']' in w_p:  # ']' closes the compound: count both 大会堂/n and the full 人民大会堂/ns
                    p1 = p.split(']')[0]
                    p2 = p.split(']')[1]
                    temp_word += w
                    if temp_word not in B:
                        B[temp_word] = [0 for i in range(len(pos))]
                    B[temp_word][pos.index(p2)] += 1
                    if w not in B:
                        B[w] = [0 for i in range(len(pos))]
                    B[w][pos.index(p1)] += 1
                    temp_word = ''
                else:
                    if temp_word != '':  # inside a multi-token compound: keep accumulating the full word
                        temp_word += w
                    if w not in B:
                        B[w] = [0 for i in range(len(pos))]
                    B[w][pos.index(p)] += 1
    result_to_csv = []
    for key, value in B.items():
        result_to_csv.append([key] + value)  # one row per word: the word followed by its emission counts
    result_to_csv = pd.DataFrame(result_to_csv)
    result_to_csv.to_csv('./dependency/B.csv', index=None, header=['0'] + pos)  # optional CSV export for inspection

    with open('./dependency/B.pickle', 'wb') as fb:
        pickle.dump(B, fb)
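Similarly, a hypothetical sanity check for the emission counts: look up a word and see which tags it was observed with:

import pickle

with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as f:
    pos = [line.strip() for line in f]
with open('./dependency/B.pickle', 'rb') as f:
    B = pickle.load(f)

word = '希望'  # any word that occurs in the corpus
counts = B.get(word, [0] * len(pos))
print([(t, c) for t, c in zip(pos, counts) if c > 0])  # tags this word was seen with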

HMM + Viterbi algorithm:

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import pickle
import numpy as np
import math
import PMWS


class HMM(object):
    def __init__(self):
        self.A_matrix = None  # transition-count matrix
        self.B_matrix = None  # emission-count dictionary
        self.pos = []  # POS tag list
        self.A = []  # transition log-probability matrix
        self.B = {}  # emission log-probability dictionary
        self.initialize()

    def read_matrix(self):
        with open('./dependency/A.pickle', 'rb') as fw:
            self.A_matrix = pickle.load(fw)  # transition-count matrix
        with open('./dependency/B.pickle', 'rb') as fw:
            self.B_matrix = pickle.load(fw)  # emission-count dictionary

    def read_pos_tags(self):
        with open('./dependency/pos_tags.txt', 'r', encoding='utf-8') as fr:
            for line in fr:
                self.pos.append(line.strip())

    def get_probability_matrix(self):
        # convert counts to log probabilities; add-one smoothing avoids
        # math.log(0) for count entries that are zero in the corpus
        for a in self.A_matrix:
            total = sum(a) + len(a)
            self.A.append([math.log((num + 1) / total) for num in a])

        for key, value in self.B_matrix.items():
            total = sum(value) + len(value)
            self.B[key] = [math.log((v + 1) / total) for v in value]

    def initialize(self):
        self.read_matrix()
        self.read_pos_tags()
        self.get_probability_matrix()

    def get_result(self, cws):
        cws = cws + ' EOS'
        all_word_pro = {}
        viterbi_dict = {}
        result_1 = ''
        result_2 = ''
        observe_seq = cws.split(' ')
        for index in range(len(observe_seq)):
            temp_pro = []
            word = observe_seq[index]
            if index == 0:
                b = self.B.get(word)
                if b is None:
                    b = np.zeros(len(self.pos))  # OOV word: a flat emission score (log 1 = 0) for every tag
                else:
                    b = np.array(b)
                pos_index = self.pos.index('hos')  # 'hos' is the start tag, so the initial scores are P(tag | hos)
                pos_parse_num = self.A_matrix[pos_index]
                total = sum(pos_parse_num) + len(pos_parse_num)
                PI = [math.log((p + 1) / total) for p in pos_parse_num]  # smoothed initial log probabilities
                assert len(b) == len(PI)
                for w, p in zip(b, PI):
                    temp_pro.append(w + p)
                word_pos = self.pos[int(np.argmax(temp_pro, axis=0))]
                all_word_pro[word, index] = temp_pro
                result_1 += word + '/' + word_pos + ' '
            else:
                last_v = np.array(all_word_pro[observe_seq[index - 1], index - 1])
                b = self.B.get(word)
                if b is None:
                    b = np.zeros(len(self.pos))  # OOV word: flat emission score
                else:
                    b = np.array(b)
                word_pos = ''
                pos_pro = None
                # Viterbi recurrence: for each current tag s, maximize over every
                # previous tag i: last_v[i] + A[i][s] + b[s]
                for s in range(len(self.pos)):
                    trans_to_s = np.array([row[s] for row in self.A])  # column s of A: log P(s | i) for each previous tag i
                    temp_pos_pro = last_v + trans_to_s + b[s]
                    best_prev = int(np.argmax(temp_pos_pro))
                    temp_pro.append(float(temp_pos_pro[best_prev]))
                    if pos_pro is None or temp_pos_pro[best_prev] > pos_pro:
                        pos_pro = float(temp_pos_pro[best_prev])
                        word_pos = self.pos[s]
                    # backpointer: if the tag here is pos[s], the best tag at the
                    # previous position is pos[best_prev]; used for backtracking
                    viterbi_dict[word, index, self.pos[s]] = self.pos[best_prev]
                all_word_pro[word, index] = temp_pro

                if word != 'EOS':
                    result_1 += word + '/' + word_pos + ' '
        # the best tag at the final (EOS) position is the starting point for backtracking
        final_word_pos = self.pos[int(np.argmax(all_word_pro[observe_seq[-1], len(observe_seq) - 1]))]

        result_pos = []
        last_pos = final_word_pos
        # backtracking: viterbi_dict[word, t, tag_at_t] holds the best tag at t-1,
        # so each lookup labels the word one position to the left
        for index in reversed(range(1, len(observe_seq))):
            last_pos = viterbi_dict[observe_seq[index], index, last_pos]
            result_pos.append(observe_seq[index - 1] + '/' + last_pos)
        result_pos.reverse()
        result_2 = ' '.join(result_pos)
        return result_1.strip(), result_2.strip()


if __name__ == '__main__':
    sentence = '通过输入不同的测试样例,输出算法实现结果,并做截图演示和文字介绍。'
    pmws = PMWS.Pwms()  # maximum-probability word segmentation
    cws_result = pmws.get_result(sentence)
    print('Segmentation result:', cws_result)
    hmm = HMM()
    pos_result = hmm.get_result(cws_result)
    print('Greedy per-step tagging result:', pos_result[0])
    print('Viterbi tagging result:', pos_result[1])

The PMWS module used here is the maximum-probability word segmentation algorithm from another of my articles.
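If the PMWS module is not available, an already segmented, space-separated sentence can be fed to the tagger directly (a hypothetical call; any segmentation works):

hmm = HMM()
greedy_result, viterbi_result = hmm.get_result('迈向 充满 希望 的 新 世纪')
print(viterbi_result)  # e.g. 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n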

Reflection: this time I set out to implement the algorithm with verbose but straightforward code, understanding the theory while implementing it, so the result is fairly long-winded; it is also closely tied to this particular corpus. If anything is wrong, please point it out. Thank you.
