# 很多人写的 Viterbi 算法过于依赖 HMM(用于分词、命名实体识别等),反而像这样单纯计算字符串最大概率组合的程序较少:
# Floor score used as the "impossible" value for missing transitions.
MIN_FLOAT = -3.14E100

# Candidate words available at each position of the sequence.
step1 = ["two"]
step2 = ["of", "off", "on"]
step3 = ["the", "thew"]
step4 = ["people"]
status = [step1, step2, step3, step4]  # the full state lattice

# Transition probabilities: trans_probability[prev][cur] = P(cur | prev).
trans_probability = {
    "two": {"on": 0.1, "off": 0.2, "of": 0.6},
    "of": {"the": 0.6, "thew": 0.8},
    "off": {"the": 0.3, "thew": 0.5},
    "on": {"the": 0.2, "thew": 0.52},
    "the": {"people": 0.5},
    "thew": {"people": 0.4},
}

# Initial state probabilities as (word, probability) pairs for step 0.
init_state_p = [("two", 1.0)]
def viterbi(status, init_state_p, trans_p):
    """Return the most probable word sequence through the candidate lattice.

    Plain max-product Viterbi over an explicit lattice (no HMM emission
    probabilities). Bug fix vs. the original: the recurrence multiplied
    nothing — it *added* raw probabilities, which is neither max-product
    nor log-space; path scores are now proper products of probabilities.

    Args:
        status: list of steps; each step is a list of candidate words.
        init_state_p: iterable of (word, probability) pairs for step 0.
        trans_p: nested dict, trans_p[prev][cur] = P(cur | prev).
            A missing transition is treated as probability 0.

    Returns:
        (prob, path): probability of the best path and its word list.
    """
    init_p = dict(init_state_p)
    # V[t][word] = best path probability ending in `word` at step t.
    V = [{}]
    path = {}
    # Seed every step-0 candidate so later lookups never KeyError;
    # words absent from init_state_p get probability 0.
    for y in status[0]:
        V[0][y] = init_p.get(y, 0.0)
        path[y] = [y]
    for t in range(1, len(status)):
        V.append({})
        newpath = {}
        for y in status[t]:
            # Max-product recurrence: multiply the predecessor's path
            # probability by the transition probability (0 if missing).
            (prob, state) = max(
                (V[t - 1][y0] * trans_p[y0].get(y, 0.0), y0)
                for y0 in status[t - 1]
            )
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath
    (prob, state) = max((V[-1][y], y) for y in status[-1])
    for y in status[-1]:
        print("last candidate: ", (V[-1][y], path[y]))
    return (prob, path[state])
# Run the demo lattice and show the best path together with its score.
best_result = viterbi(status=status, init_state_p=init_state_p, trans_p=trans_probability)
print(best_result)