一.原理
朴素贝叶斯是一种构建分类器的简单方法。对于某些类型的概率模型,在监督式学习的样本集中能取得非常好的分类效果。朴素贝叶斯方法常用于文本分类,这次我们将使用贝叶斯方法进行图像分类。贝叶斯方法的原理,可以使用一个公式表达:$P(class \mid F) = \dfrac{P(F \mid class)\,P(class)}{P(F)}$,其中 $F$ 表示样本的全部特征。
详细内容可参考:贝叶斯分类器
二.介绍
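这次实验主要是实现手写数字图像的分类。我们把每一个像素都作为图像的特征,$F_{ij}$ 表示像素 (i,j) 的值,把每个类的图像数目与图像总数的比值作为先验概率 $P(class)$,而似然概率为:$P(F_{ij}=f \mid class) = \dfrac{\text{该类训练图像中像素 }(i,j)\text{ 取值为 }f\text{ 的图像数}}{\text{该类训练图像总数}}$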
后验概率(正比于):
P(class) ⋅ P(f1,1 | class) ⋅ P(f1,2 | class) ⋅ ... ⋅ P(f32,32 | class).
为避免大量小概率连乘造成数值下溢,实际比较时取对数:
log P(class) + log P(f1,1 | class) + log P(f1,2 | class) + ... + log P(f32,32 | class).
图像形式:
00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000
数据下载:这里
三.实现
from numpy import *
import os
def img2matrix(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The file is expected to contain at least 32 lines, each with at least
    32 characters that ``int()`` can parse (the sample data uses '0'/'1').
    Pixel (i, j) is stored at column ``32 * i + j`` of the result.

    Args:
        filename: Path of the text image to read.

    Returns:
        A numpy array of shape (1, 1024) holding the pixel values.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If a character is not a valid integer literal.
    """
    m = 32
    mat = zeros((1, m * m))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as im:
        for i in range(m):
            lineStr = im.readline()
            for j in range(m):
                mat[0, m * i + j] = int(lineStr[j])
    return mat
def _load_digit_dir(folder):
    """Load every image file in *folder* into a data matrix and a label list.

    The label of a file is the integer before the first '_' in its name
    (after dropping everything from the first '.'), e.g. '3_12.txt' -> 3.

    Returns:
        (data, labels): ``data`` has one 1024-wide row per file and
        ``labels[k]`` is the label of row ``k``.
    """
    # os.listdir gives no ordering guarantee. Sorting makes the result
    # deterministic and keeps files that share a label prefix contiguous.
    names = sorted(os.listdir(folder))
    data = zeros((len(names), 1024))
    labels = []
    for row, name in enumerate(names):
        stem = name.split('.')[0]
        labels.append(int(stem.split('_')[0]))
        data[row, :] = img2matrix(os.path.join(folder, name))
    return data, labels


def file2data(root='D:\\digits'):
    """Load the training and test digit sets found under *root*.

    Args:
        root: Directory containing the 'trainingDigits' and 'testDigits'
            sub-directories. Defaults to the originally hard-coded path.

    Returns:
        (traindata, trainlabel, testdata, testlabel), where the data items
        are arrays with one 1024-wide row per image and the label items are
        lists of ints aligned with those rows.
    """
    traindata, trainlabel = _load_digit_dir(os.path.join(root, 'trainingDigits'))
    testdata, testlabel = _load_digit_dir(os.path.join(root, 'testDigits'))
    return traindata, trainlabel, testdata, testlabel
def training(traindata, trainlabel):
    """Estimate class priors and per-pixel likelihoods from training data.

    Args:
        traindata: 2-D array with one 1024-wide row of pixel values per
            training sample.
        trainlabel: Sequence of integer class labels (0..9), one per row of
            ``traindata``. Rows may appear in any order.

    Returns:
        (dict1, prior, p) where
        - ``dict1`` maps each label seen to its number of samples,
        - ``prior`` is a (10, 1) array; ``prior[c, 0]`` is the fraction of
          samples with label ``c`` (0 for labels never seen),
        - ``p`` is a (10, 1024) array; ``p[c, j]`` is the number of class-``c``
          samples whose pixel ``j`` equals 0, divided by ``count(c) + 1``.
          Rows for labels never seen stay at 1.
    """
    trainall = len(trainlabel)
    labels = asarray(trainlabel)
    # Only the rows that actually have a label take part in training.
    samples = asarray(traindata)[:trainall]

    dict1 = {}
    for label in trainlabel:
        dict1[label] = dict1.get(label, 0) + 1

    prior = zeros((10, 1))
    p = ones((10, 1024))
    for key, value in dict1.items():
        prior[key, 0] = 1.0 * value / trainall
        # Pick this class's rows by label rather than by position, so the
        # estimate stays correct when samples are not grouped by class.
        class_rows = samples[labels == key]
        zero_counts = (class_rows == 0).sum(axis=0)
        # NOTE(review): only the denominator is smoothed, so p[key, j] is
        # exactly 0 when no sample of the class has pixel j equal to 0.
        # The original formula is kept so existing results do not change.
        p[key, :] = 1.0 * zero_counts / (1 + value)
    return dict1, prior, p
def testing(testdata,prior,p):
predict=zeros((10,1))
tmp=0.0
t=1.0
for i in range(10):
for j in range(1024):
if testdata[j]==0.0:
tmp=p[i,j]
else:
tmp=1-p[i,j]
t=t*tmp
predict[i,0]=t*prior[i,0]
t=1.0
pt=argsort(-predict,axis=0) #descending sort
return pt[0]
def bayerClassifier(testdata, testlabel, prior, p):
    """Classify every test sample, print each result and the overall accuracy.

    Args:
        testdata: 2-D array with one row of pixel values per test sample.
        testlabel: Sequence of true integer labels aligned with the rows.
        prior: Class priors as returned by ``training``.
        p: Per-pixel likelihood table as returned by ``training``.

    Returns:
        None. Results are written to standard output.
    """
    m = len(testdata)
    count = 0
    for i in range(m):
        # testing() returns a 1-element array; take the scalar so the
        # comparison and the %d formatting operate on a plain int.
        c = int(testing(testdata[i, :], prior, p)[0])
        if c == testlabel[i]:
            count += 1
        print('the predict class is: %d, the real class is: %d' % (c, testlabel[i]))
    # Report 0% for an empty test set instead of dividing by zero.
    accuracy = 100.0 * count / m if m else 0.0
    print('The accuracy is: %0.2f%%' % accuracy)
if __name__ == '__main__':
    # Script entry point: load the digit files, fit the model, then report
    # predictions and accuracy on the test set. print() is called with a
    # single string so the output is identical on Python 2 and Python 3.
    print('********Preparing data*****')
    traindata, trainlabel, testdata, testlabel = file2data()
    print('********Training***********')
    dict1, prior, p = training(traindata, trainlabel)
    print('********Testing************')
    bayerClassifier(testdata, testlabel, prior, p)
********Preparing data*****
********Training***********
********Testing************
the predict class is: 0, the real class is: 0
the predict class is: 0, the real class is: 0
...
...
...
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
the predict class is: 9, the real class is: 9
The accuracy is: 92.18%
[Finished in 7.9s]