#!/usr/bin/python
# -*- coding:utf-8 -*-
import codecs
import io
import math
import random

import matplotlib.pyplot as plt
import numpy as np
# Log-domain stand-in for log(0): a large finite negative number, so sums and
# differences of log scores stay finite.
infinite = float(-2**31)


def log_normalize(a):
    """Turn a list of non-negative counts into log-probabilities, in place.

    Every entry ``a[i]`` is replaced by ``log(a[i]) - log(sum(a))``; entries
    equal to zero are replaced by the ``infinite`` sentinel instead of taking
    ``log(0)``.

    :param a: mutable sequence of counts; modified in place.
    :return: ``None``. When the counts sum to zero an error line is printed
        and ``a`` is left untouched.
    """
    total = sum(a)
    if total == 0:
        print("Error..from log_normalize.")
        return
    log_total = math.log(total)
    for i, count in enumerate(a):
        if count == 0:
            a[i] = infinite
        else:
            a[i] = math.log(count) - log_total
def log_sum(a):
    """Return ``log(sum(exp(x) for x in a))`` without overflowing.

    The largest element is factored out before exponentiating, so every
    argument passed to ``exp`` is <= 0.

    :param a: sequence of log-domain values.
    :return: log of the summed exponentials, or the module-level ``infinite``
        sentinel when ``a`` is empty.
    """
    if not a:  # a is empty
        return infinite
    m = max(a)
    return m + math.log(sum(math.exp(t - m) for t in a))


def calc_alpha(pi, A, B, o, alpha):
    """Fill ``alpha`` in place with the forward-pass log scores.

    ``alpha[0][i] = pi[i] + B[i][ord(o[0])]`` and, for ``t >= 1``,
    ``alpha[t][i] = log_sum_j(alpha[t-1][j] + A[j][i]) + B[i][ord(o[t])]``.
    The values are added, not multiplied, so ``pi``, ``A`` and ``B`` are
    treated as log-domain quantities.

    :param pi: per-state initial log scores (4 entries are read).
    :param A: 4x4 transition log scores, indexed ``A[from][to]``.
    :param B: emission log scores, indexed ``B[state][ord(char)]``.
    :param o: observation sequence; each element is passed to ``ord``.
    :param alpha: pre-allocated ``len(o)`` x 4 table; overwritten.
    """
    first = ord(o[0])
    for i in range(4):
        alpha[0][i] = pi[i] + B[i][first]
    for t in range(1, len(o)):
        symbol = ord(o[t])
        prev = alpha[t - 1]
        for i in range(4):
            incoming = [prev[j] + A[j][i] for j in range(4)]
            alpha[t][i] = log_sum(incoming) + B[i][symbol]


def calc_beta(pi, A, B, o, beta):
    """Fill ``beta`` in place with the backward-pass log scores.

    The last row is the log of probability 1, and for earlier ``t``
    ``beta[t][i] = log_sum_j(A[i][j] + B[j][ord(o[t+1])] + beta[t+1][j])``.

    :param pi: unused; kept so the signature parallels ``calc_alpha``.
    :param A: 4x4 transition log scores, indexed ``A[from][to]``.
    :param B: emission log scores, indexed ``B[state][ord(char)]``.
    :param o: observation sequence; each element is passed to ``ord``.
    :param beta: pre-allocated ``len(o)`` x 4 table; overwritten.
    """
    T = len(o)
    for i in range(4):
        # Terminal backward probability is 1, i.e. log(1) = 0 in the log
        # domain. (A literal 1 here would shift every beta by a constant.)
        beta[T - 1][i] = 0.0
    for t in range(T - 2, -1, -1):
        symbol = ord(o[t + 1])
        nxt = beta[t + 1]
        for i in range(4):
            outgoing = [A[i][j] + B[j][symbol] + nxt[j] for j in range(4)]
            beta[t][i] = log_sum(outgoing)


def calc_gamma(alpha, beta, gamma):
    """Fill ``gamma`` in place with per-step normalised state log scores.

    For each ``t``, ``gamma[t][i] = alpha[t][i] + beta[t][i]`` and the row is
    then shifted by its ``log_sum`` so that its exponentials sum to 1.

    :param alpha: forward table (``len(alpha)`` rows, 4 columns read).
    :param beta: backward table of the same shape.
    :param gamma: pre-allocated table of the same shape; overwritten.
    """
    for t in range(len(alpha)):
        row = gamma[t]
        for i in range(4):
            row[i] = alpha[t][i] + beta[t][i]
        norm = log_sum(row)
        for i in range(4):
            row[i] -= norm
def calc_ksi(alpha, beta, A, B, o, ksi):
    """Fill ``ksi`` in place with normalised pairwise transition log scores.

    For every ``t < len(alpha) - 1`` and every state pair ``(i, j)``:
    ``ksi[t][i][j] = alpha[t][i] + A[i][j] + B[j][ord(o[t+1])] + beta[t+1][j]``,
    after which the 16 entries for that ``t`` are shifted by their
    ``log_sum`` so that their exponentials sum to 1.

    :param alpha: forward table (``T`` rows, 4 columns read).
    :param beta: backward table of the same shape.
    :param A: 4x4 transition log scores, indexed ``A[from][to]``.
    :param B: emission log scores, indexed ``B[state][ord(char)]``.
    :param o: observation sequence; each element is passed to ``ord``.
    :param ksi: pre-allocated ``(T-1)`` x 4 x 4 table; overwritten.
    """
    T = len(alpha)
    for t in range(T - 1):
        symbol = ord(o[t + 1])
        cell = ksi[t]
        for i in range(4):
            for j in range(4):
                cell[i][j] = alpha[t][i] + A[i][j] + B[j][symbol] + beta[t + 1][j]
        # Flatten in row-major order (i outer, j inner) for the normaliser.
        norm = log_sum([cell[i][j] for i in range(4) for j in range(4)])
        for i in range(4):
            for j in range(4):
                cell[i][j] -= norm
def bw(pi, A, B, alpha, beta, gamma, ksi, o):
    """Re-estimate ``pi``, ``A`` and ``B`` in place from ``gamma`` and ``ksi``.

    * ``pi[i] = gamma[0][i]``
    * ``A[i][j] = log_sum_t(ksi[t][i][j]) - log_sum_t(gamma[t][i])`` over
      ``t < T - 1``
    * ``B[i][k] = log_sum(gamma[t][i] for t with ord(o[t]) == k)
      - log_sum_t(gamma[t][i])`` for ``k`` in ``range(65536)``; symbols that
      never occur in ``o`` get the ``infinite`` sentinel.

    :param pi: initial-state log scores (4 entries); overwritten.
    :param A: 4x4 transition log scores; overwritten.
    :param B: emission log scores, 4 rows of at least 65536 entries;
        the first 65536 entries of each row are overwritten.
    :param alpha: forward table; only its length ``T`` is used.
    :param beta: unused here; kept for the existing call signature.
    :param gamma: ``T`` x 4 state log scores.
    :param ksi: ``(T-1)`` x 4 x 4 transition log scores.
    :param o: observation sequence; each element is passed to ``ord``.
    """
    T = len(alpha)
    for i in range(4):
        pi[i] = gamma[0][i]

    # Transition update. The denominator depends only on the source state,
    # so it is computed once per i rather than once per (i, j).
    for i in range(4):
        occupancy = log_sum([gamma[t][i] for t in range(T - 1)])
        for j in range(4):
            A[i][j] = log_sum([ksi[t][i][j] for t in range(T - 1)]) - occupancy

    # Emission update. Index the positions of each observed symbol once
    # instead of re-scanning the whole sequence for all 65536 symbols.
    positions = {}
    for t in range(T):
        positions.setdefault(ord(o[t]), []).append(t)

    for i in range(4):
        print("bw %d" % i)
        occupancy = log_sum([gamma[t][i] for t in range(T)])
        for k in range(65536):
            if k % 10000 == 0:
                print("bw - k %d" % k)
            hits = positions.get(k)
            if hits is None:
                B[i][k] = infinite
            else:
                B[i][k] = log_sum([gamma[t][i] for t in hits]) - occupancy


def baum_welch(pi, A, B, path=".\\1.txt", iterations=3):
    """Run Baum-Welch re-estimation of ``pi``, ``A``, ``B`` on a text file.

    The file is read as UTF-8 (a leading BOM, if present, is skipped) and its
    characters are used as the observation sequence. Each iteration runs the
    forward pass, backward pass, gamma, ksi and the ``bw`` update, printing
    progress along the way.

    :param pi: initial-state log scores; updated in place.
    :param A: transition log scores; updated in place.
    :param B: emission log scores; updated in place.
    :param path: text file to train on (defaults to the original location).
    :param iterations: number of re-estimation rounds (defaults to 3).
    """
    # utf-8-sig strips the BOM only when it is actually there, instead of
    # blindly dropping the first three bytes.
    with io.open(path, encoding="utf-8-sig") as f:
        sentence = f.read()
    T = len(sentence)
    alpha = [[0 for i in range(4)] for t in range(T)]
    beta = [[0 for i in range(4)] for t in range(T)]
    gamma = [[0 for i in range(4)] for t in range(T)]
    ksi = [[[0 for j in range(4)] for i in range(4)] for t in range(T - 1)]
    for iteration in range(iterations):
        print("calc_alpha")
        # alpha(t,i): given lambda, score of being in state i at time t
        # having observed o(1), o(2) ... o(t)
        calc_alpha(pi, A, B, sentence, alpha)
        print("calc_beta")
        # beta(t,i): given lambda and state i at time t, score of observing
        # o(t+1), o(t+2) ... o(T)
        calc_beta(pi, A, B, sentence, beta)
        print("calc_gamma")
        # gamma(t,i): given lambda and O, score of being in state i at time t
        calc_gamma(alpha, beta, gamma)
        print("calc_ksi")
        # ksi(t,i,j): given lambda and O, score of being in state i at time t
        # and in state j at time t+1
        calc_ksi(alpha, beta, A, B, sentence, ksi)
        print("bw")
        bw(pi, A, B, alpha, beta, gamma, ksi, sentence)
        print("time %d" % iteration)
        print("Pi: %s" % (pi,))
        print("A %s" % (A,))
def mle(path=".\\pku_training.utf8"):  # 0B/1M/2E/3S
    """Estimate ``pi``, ``A``, ``B`` by counting over a segmented corpus.

    The corpus is whitespace-separated words. Each word is tagged
    character by character: a one-character word is S (3); otherwise the
    first character is B (0), the last is E (2) and any in between are M (1).
    State, transition and emission counts are accumulated and then converted
    to log-probabilities with ``log_normalize``.

    :param path: UTF-8 corpus file (defaults to the original location); a
        leading BOM, if present, is skipped.
    :return: ``[pi, a, b]`` where ``pi`` has 4 entries, ``a`` is 4x4 and
        ``b`` is 4 x 65536 (indexed by ``ord`` of the character).
    """
    pi = [0] * 4  # pi[i]: number of characters tagged with state i
    a = [[0] * 4 for _ in range(4)]  # a[i][j]: number of i -> j transitions
    b = [[0] * 65536 for _ in range(4)]  # b[i][c]: times state i emitted c
    with io.open(path, encoding="utf-8-sig") as f:
        data = f.read()
    # Split on any whitespace run so that a word ending one line is not glued
    # to the word starting the next line.
    tokens = data.split()
    total = len(tokens)
    # NOTE: the original carried a commented-out step here that appended the
    # words of 'Englishword.train' to ``tokens``.

    # Start training. The "previous state" starts as E (2), i.e. as if a word
    # had just ended.
    last_q = 2
    old_progress = 0
    print('进度:')
    for k, token in enumerate(tokens):
        progress = float(k) / float(total)
        if progress > old_progress + 0.1:
            print('%.3f%%' % (progress * 100))
            old_progress = progress
        n = len(token)
        if n == 1:
            pi[3] += 1
            a[last_q][3] += 1  # end of previous word (last_q) -> S (3)
            b[3][ord(token[0])] += 1
            last_q = 3
            continue
        # State counts
        pi[0] += 1
        pi[2] += 1
        pi[1] += (n - 2)
        # Transition counts
        a[last_q][0] += 1
        last_q = 2
        if n == 2:
            a[0][2] += 1
        else:
            a[0][1] += 1
            a[1][1] += (n - 3)
            a[1][2] += 1
        # Emission counts
        b[0][ord(token[0])] += 1
        b[2][ord(token[n - 1])] += 1
        for i in range(1, n - 1):
            b[1][ord(token[i])] += 1
    # Normalise counts into log-probabilities
    log_normalize(pi)
    for i in range(4):
        log_normalize(a[i])
        log_normalize(b[i])
    return [pi, a, b]


def list_write(f, v):
    """Write the items of ``v`` to ``f`` as one line.

    Each item is written as ``str(item)`` followed by a single space, and the
    line is terminated with a newline.

    :param f: object with a ``write`` method accepting text.
    :param v: iterable of values to serialise.
    """
    f.write("".join(str(a) + ' ' for a in v) + '\n')


def save_parameter(pi, A, B):
    """Write ``pi``, ``A`` and ``B`` to ``pi.txt``, ``A.txt`` and ``B.txt``.

    ``pi`` is written as a single line; ``A`` and ``B`` are written one row
    per line, all in the ``list_write`` format.

    :param pi: sequence of values.
    :param A: sequence of rows.
    :param B: sequence of rows.
    """
    with open(".\\pi.txt", "w") as f_pi:
        list_write(f_pi, pi)
    with open(".\\A.txt", "w") as f_A:
        for a in A:
            list_write(f_A, a)
    with open(".\\B.txt", "w") as f_B:
        for b in B:
            list_write(f_B, b)


if __name__ == "__main__":
    pi, A, B = mle()
    save_parameter(pi, A, B)
    print("训练完成...")
# Hidden Markov Model (隐马尔科夫模型).
# NOTE: the rest of this line was a truncated, whitespace-collapsed duplicate
# of the top of this file (shebang, imports, `infinite`, start of
# log_normalize); it carried no additional code.