自然语言处理中EM算法抛硬币的实现:
import numpy
import math
def em_single(theta,O):#对于当前的模型求对应的期望值(估算步骤)
# 三个值分别为选择硬币c1的概率、抛c1硬币为正面的概率、抛c2硬币为正面的概率。
pi=theta[0]
h1=theta[1]
h2=theta[2]
new_pi=0
new_h1=0
new_h2=0
PB=[]
head=[]
for o in O:
t=len(o)
cnt = o.sum()#正面的数量
head.append(cnt)#head=[3.0,3,0]
for o in O:
t=len(o)
heads = o.sum()#正面的数量
tails=t-heads#反面的数量
u=(pi*math.pow(h1,heads)*math.pow(1-h1,tails))/(pi*math.pow(h1,heads)*math.pow(1-h1,tails)+(1-pi)*math.pow(h2,heads)*math.pow(1-h2,tails))
PB.append(u)
l=len(PB)
new_pi=sum(PB)/l
for i in range(l):
new_h1+=PB[i]*head[i]/t
new_h2+=(1-PB[i])*head[i]/t
new_h1=new_h1/sum(PB)
new_h2=new_h2/(l-sum(PB))#因为有c1 c2两种可能,总观测数减去c1的即为c2的
return [new_pi,new_h1,new_h2]
def em(theta,Y,tol):#最大化步骤
j=0
while j<1000:
new_theta=em_single(theta,Y)
change=numpy.abs(new_theta[0] - theta[0])
if change<tol:
break
else:
theta = new_theta
j=j+1
print("迭代后的参数为:")
print(new_theta)
return new_theta
if __name__=="__main__":
# 硬币投掷结果
O = numpy.array([[1,1,1],#Y={<HHH>,<TTT>,<HHH>,<TTT>}
[0,0,0],
[1,1,1],
[0,0,0]])
theta=[0.3,0.3,0.6]
t=len(theta)
print("初始参数为:")
print(theta)
theta=em(theta,O,1e-15)
输出:
初始参数为:
[0.3, 0.3, 0.6]
迭代后的参数为:
[0.3737649610410474, 0.0680206318504191, 0.7578245253976398]
迭代后的参数为:
[0.4859367224237726, 0.0004438948678989641, 0.9722232981496766]
迭代后的参数为:
[0.49998864947922056, 8.997364138533166e-11, 0.9999772993837971]
迭代后的参数为:
[0.49999999999999417, 7.2837620757937955e-31, 0.9999999999999885]
迭代后的参数为:
[0.5, 3.864268131526994e-91, 1.0]