话不多说,上代码
【注意】p(x | y)在计算的时候会有零值,需要用贝叶斯估计来解决,这里采用拉普拉斯平滑,Sj为特征取值个数,我尝试了二值和八值,效果差不多。还有一点很重要的是,因为数值太小,相乘的话最后都会变成零,所以用log函数来处理,最后的连乘就变成了累加。因为用的argmax只需要比较相对大小,所以最后的P(X)不用除,否则要计算的话得先将各个p[k]取指数再累加。此外,log函数也不用再变回去,因为它是单调递增的。
#图像设为二值,accuracy=84.27%,time=50.65s
#图像设为八值,accuracy=84.77%,time=52.44s
import numpy as np
import time
def loadData(filename):
    """Load an MNIST-style CSV file into quantized image and label arrays.

    Every non-blank line must have the form ``label,p0,p1,...`` with integer
    fields. Each pixel value is integer-divided by 32, so inputs in 0-255
    collapse to the eight levels 0-7.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays: one row of quantized
        pixel values per sample, and the matching integer labels.

    Raises:
        ValueError: If a field cannot be parsed as an integer.
    """
    image_array = []
    label_array = []
    # The context manager closes the file even when parsing raises.
    with open(filename, 'r') as handle:
        # Iterate lazily instead of materializing every line with readlines().
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate empty lines instead of failing on int('').
                continue
            curline = line.split(',')
            image_array.append([int(item) // 32 for item in curline[1:]])
            label_array.append(int(curline[0]))
    return np.array(image_array), np.array(label_array)
def calculate_probability(train_image_array, train_label_array,
                          class_num=10, value_num=8):
    """Estimate log priors and log conditional probabilities for naive Bayes.

    Both tables use Laplace (add-one) smoothing:

        P(X_j = v | Y = k) = (count(k, j, v) + 1) / (count(k) + value_num)
        P(Y = k)           = (count(k) + 1) / (instance_num + class_num)

    The smoothing denominator adds the number of values a feature can take,
    so that each conditional distribution sums to 1.

    Args:
        train_image_array: 2-D integer array, one row of quantized feature
            values (each in ``0 .. value_num - 1``) per training sample.
        train_label_array: 1-D integer array of class labels
            (each in ``0 .. class_num - 1``), aligned with the image rows.
        class_num: Number of classes. Defaults to 10.
        value_num: Number of distinct values a feature can take. Defaults
            to 8.

    Returns:
        A tuple ``(pixy, py)`` where ``pixy[k, j, v]`` is
        ``log P(X_j = v | Y = k)`` with shape
        ``(class_num, feature_num, value_num)`` and ``py[k]`` is
        ``log P(Y = k)`` with shape ``(class_num,)``.

    Raises:
        IndexError: If a label or feature value is out of range.
    """
    images = np.asarray(train_image_array)
    labels = np.asarray(train_label_array)
    instance_num = images.shape[0]
    # Take the feature count from the data; keep the original 784 only as a
    # fallback when the input is not 2-D (e.g. an empty training set).
    feature_num = images.shape[1] if images.ndim == 2 else 784

    # Raw per-class sample counts, stored as floats like the final table.
    py = np.array([np.sum(labels == k) for k in range(class_num)],
                  dtype=float)

    # pixy[k, j, v] counts the class-k samples whose feature j equals v.
    pixy = np.zeros((class_num, feature_num, value_num))
    if instance_num:
        # np.add.at accumulates repeated index triples, which a plain
        # fancy-indexed "+= 1" would not do.
        np.add.at(pixy,
                  (labels[:, None], np.arange(feature_num), images),
                  1)

    # Work in log space so that multiplying hundreds of small probabilities
    # becomes a sum and cannot underflow to zero.
    pixy = np.log((pixy + 1) / (py[:, None, None] + value_num))
    py = np.log((py + 1) / (instance_num + class_num))
    return pixy, py
def naive_bayes(pixy, py, x):
    """Return the most probable class for one sample.

    By Bayes' rule, P(Y=k | X) is proportional to P(Y=k) * prod_j P(X_j | Y=k).
    The evidence P(X) is the same for every class and log is monotonic, so
    the argmax of the summed log probabilities is the prediction.

    Args:
        pixy: Array of shape ``(class_num, feature_num, value_num)`` holding
            ``log P(X_j = v | Y = k)``.
        py: Array of shape ``(class_num,)`` holding ``log P(Y = k)``.
        x: Sequence of integer feature values for one sample; only the first
            ``feature_num`` entries are used.

    Returns:
        The index of the class with the highest log posterior score.
    """
    pixy = np.asarray(pixy)
    # Take the feature count from the table rather than assuming 784.
    feature_num = pixy.shape[1]
    sample = np.asarray(x)[:feature_num]
    # For every class, pick the log probability of the observed value of
    # each feature and add them up: result has one score per class.
    log_likelihood = pixy[:, np.arange(feature_num), sample].sum(axis=1)
    return np.argmax(np.asarray(py) + log_likelihood)
def test(pixy, py, test_image_array, test_label_array):
    """Measure classification accuracy on a labelled evaluation set.

    Each sample is classified with ``naive_bayes`` and compared against its
    label.

    Args:
        pixy: Log conditional probability table passed to ``naive_bayes``.
        py: Log prior table passed to ``naive_bayes``.
        test_image_array: Sequence of samples to classify.
        test_label_array: Labels, indexed in step with the samples.

    Returns:
        The fraction of samples whose prediction equals the label.

    Raises:
        ZeroDivisionError: If ``test_image_array`` is empty.
    """
    total = len(test_image_array)
    # Count one hit per sample whose predicted class matches its label.
    hits = sum(
        1
        for idx, sample in enumerate(test_image_array)
        if naive_bayes(pixy, py, sample) == test_label_array[idx]
    )
    return hits / total
if __name__ == "__main__":
    # Script entry point: load both MNIST CSV splits, fit the probability
    # tables on the training split, score the test split, and report the
    # accuracy together with the wall-clock time of the whole run.
    t0 = time.time()

    print('start to read data')
    train_x, train_y = loadData('../mnist/mnist_train.csv')
    test_x, test_y = loadData('../mnist/mnist_test.csv')

    print('start to test')
    log_conditionals, log_priors = calculate_probability(train_x, train_y)
    acc = test(log_conditionals, log_priors, test_x, test_y)

    # Stop the clock before printing so reporting is not included.
    elapsed = time.time() - t0
    print('accuracy:%f:' % acc)
    print('time spend: ', elapsed)