词性标注:
Jieba的词性标注模块仍然采用基于HMM模型的Viterbi算法,状态用(词位, 词性)二元对来描述,其中词位取自{B, M, E, S}。状态集合是这四个词位与词性(39类)的笛卡尔积,共有4×39=156种状态。
从文本的起始位置开始逐字扫描:先查出当前字可能的状态集合,再与上一个字的状态经状态转移得到的期待状态集合求交集,交集即为当前字的候选状态。交集为空时,以期待状态集合作为当前字的候选状态;期待状态集合也为空时,以整个状态集合作为其候选状态。
def viterbi(obs, states, start_p, trans_p, emit_p):
    """Return the best-scoring state sequence for ``obs`` (Viterbi decoding).

    Scores are combined by addition, so the tables are expected to hold
    log-domain values.

    Args:
        obs: Non-empty indexable sequence of observations (the text to tag).
        states: Mapping from an observation to the iterable of states it may
            take.  Observations missing from the mapping may take any state
            that appears as a key of ``trans_p``.
        start_p: Mapping ``state -> initial score``.
        trans_p: Mapping ``state -> {next_state: transition score}``.  A
            transition that is not listed is scored with ``MIN_INF``.
        emit_p: Mapping ``state -> {observation: emission score}``.  An
            emission that is not listed is scored with ``MIN_FLOAT``.

    Returns:
        A ``(prob, route)`` tuple: the score of the best path and the list of
        states on that path, one per element of ``obs``.

    Raises:
        IndexError: If ``obs`` is empty.
        ValueError: If no candidate state exists for the first observation.
    """
    all_states = trans_p.keys()
    first = obs[0]

    V = [{}]          # V[t][state] -> best score of a path ending in state
    mem_path = [{}]   # mem_path[t][state] -> predecessor of state on that path
    for y in states.get(first, all_states):
        V[0][y] = start_p[y] + emit_p[y].get(first, MIN_FLOAT)
        mem_path[0][y] = ''

    # range (not xrange) keeps the function usable on Python 2 and Python 3.
    for t in range(1, len(obs)):
        char = obs[t]
        prev_scores = V[t - 1]

        # Predecessors that have at least one outgoing transition.
        prev_states = [x for x in mem_path[t - 1] if len(trans_p[x]) > 0]
        # States those predecessors can move into.
        expected = set(y for x in prev_states for y in trans_p[x])

        # Candidates: what the observation allows, restricted to what the
        # transitions expect; then the expected set; then every state.
        obs_states = set(states.get(char, all_states)) & expected
        if not obs_states:
            obs_states = expected if expected else all_states

        if not prev_states:
            # No predecessor has any outgoing transition.  Score against
            # every state kept at t - 1 instead, so the "all states" fallback
            # above yields a path rather than max() failing on an empty
            # sequence.  Each such transition is scored MIN_INF.
            prev_states = list(mem_path[t - 1])

        scores = {}
        back = {}
        for y in obs_states:
            emit = emit_p[y].get(char, MIN_FLOAT)
            prob, state = max(
                (prev_scores[y0] + trans_p[y0].get(y, MIN_INF) + emit, y0)
                for y0 in prev_states)
            scores[y] = prob
            back[y] = state
        V.append(scores)
        mem_path.append(back)

    # Best final state; ties on score are broken by comparing the states.
    prob, state = max((V[-1][y], y) for y in mem_path[-1])

    # Follow the back-pointers from the last position to the first.
    route = [None] * len(obs)
    for i in range(len(obs) - 1, -1, -1):
        route[i] = state
        state = mem_path[i][state]
    return (prob, route)