# Every word in the dataset has already been annotated with its tag.
# Build the vocabulary and tag set as bidirectional mappings:
#   word2id / id2word : word <-> integer id
#   tag2id  / id2tag  : tag  <-> integer id
word2id, id2word = {}, {}
tag2id, id2tag = {}, {}

# Each line is split on '/' and the first two fields are taken as
# (word, tag); the tag is stripped of trailing whitespace/newline.
# The context manager guarantees the file handle is closed afterwards.
with open('traindata.txt') as train_file:
    for line in train_file:
        item = line.split('/')
        word, tag = item[0], item[1].rstrip()
        if word not in word2id:
            word2id[word] = len(word2id)
            # Key the reverse map with the id just assigned, so that
            # id2word[word2id[w]] == w. Using len(word2id) *after* the
            # insertion (as before) shifted every key by one.
            id2word[word2id[word]] = word
        if tag not in tag2id:
            tag2id[tag] = len(tag2id)
            id2tag[tag2id[tag]] = tag

M = len(word2id)  # number of distinct words
N = len(tag2id)  # number of distinct tags
# Build pi, A and B.
import numpy as np
# pi: zero-initialised vector of length N (N = number of distinct tags);
# intended to hold the probability of each tag (part of speech)
# appearing at the start of a sentence.
pi = np.zeros(N)
A = np.zeros