参数估计代码:
# -*- coding: utf-8 -*-
# 二元隐马尔科夫模型(Bigram HMMs)
# 'trainText.txt_utf8'为人民日报已经人工分词的语料,共29万多条句子
# 下载地址:
# 链接:https://pan.baidu.com/s/1kXosD1P 密码:mq61
import sys
def getList(input_str):
    """Return one B/M/E/S position tag per element of ``input_str``.

    The tag describes where an element sits inside the token:

    * ``'S'`` -- the token consists of exactly one element;
    * ``'B'`` / ``'E'`` -- first / last element of a longer token;
    * ``'M'`` -- every element strictly between the first and the last.

    Args:
        input_str: Any sized sequence (only ``len()`` is applied to it),
            typically a single word.

    Returns:
        A new list of tag strings with exactly ``len(input_str)`` entries.
        An empty input yields an empty list.
    """
    length = len(input_str)
    if length == 0:
        # Without this guard the general branch below would produce
        # ['B', 'E'] for an empty token, i.e. two tags for zero elements.
        return []
    if length == 1:
        return ['S']
    # ['M'] * 0 is [], so a two-element token naturally becomes ['B', 'E'].
    return ['B'] + ['M'] * (length - 2) + ['E']
#state_M = 4
#word_N = 0
# Module-level accumulators and configuration.
# NOTE(review): the roles of A_dic and B_dic are not visible in this part of
# the file -- presumably HMM transition / emission counts; confirm against
# the code that fills them.
A_dic = {}
B_dic = {}
Count_dic = {}  # number of occurrences of every state that has appeared
Pi_dic = {}  # number of times each state appears as a beginning state
word_set = set()
state_list = ['B', 'M', 'E', 'S']  # the four position tags emitted by getList
line_num = -1
INPUT_DATA = "trainText.txt_utf8"
# Raw strings: "\p" is not a recognised escape sequence, so a plain literal
# only keeps the backslash by accident and triggers an invalid-escape warning
# on current Python 3. The runtime values are unchanged.
PROB_START = r"trainHMM\prob_start.py"
PROB_EMIT = r"trainHMM\prob_emit.py"
PROB_TRANS = "trainHMM\prob_trans.