# Naive Bayes (贝叶斯)
import pandas as pd
import numpy as np
import cv2
import random
import time
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
# Binarization
def binaryzation(img, threshold=50):
    """Map every pixel of *img* to 0 or 1 using an inverted threshold.

    The input is first cast to ``uint8``. Pixels whose value is greater than
    *threshold* become 0 and all remaining pixels become 1, which is the
    ``cv2.THRESH_BINARY_INV`` rule with ``maxval=1``.

    Args:
        img: NumPy array of pixel intensities.
        threshold: Cut-off intensity. Defaults to 50.

    Returns:
        A new ``uint8`` array with the same shape as *img* containing only
        0 and 1. The input array is not modified.
    """
    cv_img = img.astype(np.uint8)
    # Done with a plain NumPy comparison so the result never depends on
    # OpenCV writing in place into a caller-supplied destination buffer.
    return (cv_img <= threshold).astype(np.uint8)
def Train(trainset, train_labels):
    """Estimate the Naive Bayes tables from a labelled training set.

    Reads the module-level ``class_num`` and ``feature_len`` and binarizes
    every image with ``binaryzation`` before counting.

    Args:
        trainset: Indexable collection of images; ``trainset[i]`` is passed to
            ``binaryzation`` and must yield at least ``feature_len`` pixels.
        train_labels: Class label of each image. Each label is used directly
            as an array index, so it must be an integer in
            ``range(class_num)``.

    Returns:
        A tuple ``(prior_probability, conditional_probability)``.

        ``prior_probability`` has shape ``(class_num,)`` and holds the raw
        number of training images per class (counts, not normalized).

        ``conditional_probability`` has shape ``(class_num, feature_len, 2)``.
        Entry ``[c, j, v]`` is the fraction of class-``c`` images whose pixel
        ``j`` equals ``v``, multiplied by 10000 and offset by 1, so every
        entry lies in ``[1, 10001]`` and is never zero. For a class with no
        training images both values of every pixel use the fraction 0.5.
    """
    prior_probability = np.zeros(class_num)  # per-class sample counts
    conditional_probability = np.zeros((class_num, feature_len, 2))
    pixel_index = np.arange(feature_len)

    # Accumulate the per-class counts of each pixel value.
    for i, label in enumerate(train_labels):
        img = binaryzation(trainset[i])
        prior_probability[label] += 1
        # Each pixel position appears exactly once in pixel_index, so the
        # indexed "+= 1" increments one counter per pixel.
        conditional_probability[label, pixel_index, img[:feature_len]] += 1

    # After binarization a pixel is either 0 or 1, so the two counters of a
    # pixel sum to the number of images seen for that class.
    totals = conditional_probability.sum(axis=2, keepdims=True)
    fractions = np.divide(
        conditional_probability,
        totals,
        # Classes without any sample would divide by zero; keep 0.5 there.
        out=np.full_like(conditional_probability, 0.5),
        where=totals > 0,
    )
    # Scale and offset so that later integer products never hit zero.
    conditional_probability = fractions * 10000 + 1
    return prior_probability, conditional_probability
# Compute the (unnormalized) class score
def caculate_probability(img, label, prior=None, conditional=None):
    """Return the unnormalized integer score of *label* for one image.

    The score is ``int(prior[label])`` multiplied by
    ``int(conditional[label][i][img[i]])`` for every pixel ``i``. Plain Python
    integers are used, so the product cannot overflow.

    Args:
        img: Sequence of pixel values; each value is used as an index into
            the last axis of *conditional*.
        label: Class index to score.
        prior: Per-class prior table. When omitted, the module-level
            ``prior_probability`` is used.
        conditional: Per-class, per-pixel, per-value table. When omitted, the
            module-level ``conditional_probability`` is used.

    Returns:
        The score as a Python ``int``.
    """
    # Fall back to the module globals so existing two-argument calls still work.
    if prior is None:
        prior = prior_probability
    if conditional is None:
        conditional = conditional_probability

    class_table = conditional[label]  # hoisted: same row for every pixel
    probability = int(prior[label])
    for i, pixel in enumerate(img):
        probability *= int(class_table[i][pixel])
    return probability


def predict(testset, prior_probability, conditional_probability):
    """Predict a class label for every image in *testset*.

    Each image is binarized with ``binaryzation`` and scored against every
    class with ``caculate_probability`` using the tables passed in here. The
    class with the largest score wins; on a tie the lowest label is kept.

    Args:
        testset: Iterable of images accepted by ``binaryzation``.
        prior_probability: Per-class prior table; its length determines how
            many classes are scored.
        conditional_probability: Per-class, per-pixel, per-value table.

    Returns:
        A NumPy array holding one predicted label per image.
    """
    num_classes = len(prior_probability)
    predictions = []
    for img in testset:
        img = binaryzation(img)  # binarize the image
        max_label = 0
        max_probability = caculate_probability(
            img, 0, prior_probability, conditional_probability)
        for label in range(1, num_classes):
            probability = caculate_probability(
                img, label, prior_probability, conditional_probability)
            # Strict comparison keeps the earliest label when scores tie.
            if max_probability < probability:
                max_label = label
                max_probability = probability
        predictions.append(max_label)
    return np.array(predictions)
# Number of classes; sizes the first axis of the tables allocated in Train.
class_num=10
# Number of features per image; Train counts this many pixels of each
# binarized image.
feature_len=784
if __name__ == '__main__':
    import sys

    # The CSV location may be given as the first command-line argument; the
    # original hard-coded path remains the default.
    csv_path = (sys.argv[1] if len(sys.argv) > 1
                else r"C:\Users\Administrator\Desktop\train.csv")

    print('start read data')
    time_1 = time.time()
    raw_data = pd.read_csv(csv_path, header=0)
    data = raw_data.values
    imgs = data[:, 1:]   # every column after the first holds the features
    labels = data[:, 0]  # the first column holds the label

    # Hold out 33% of the rows as the test set; the rest is used for training.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    time_2 = time.time()
    print('read data cost', time_2 - time_1, 'second', '\n')

    print('start training')
    # These two names are module-level globals that caculate_probability can
    # read, so they must keep exactly these names.
    prior_probability, conditional_probability = Train(
        train_features, train_labels)
    time_3 = time.time()
    print('training cost', time_3 - time_2, 'second', '\n')

    print('start predicting')
    test_predict = predict(
        test_features, prior_probability, conditional_probability)
    time_4 = time.time()
    print('predicting cost', time_4 - time_3, 'second', '\n')

    score = accuracy_score(test_labels, test_predict)
    print('the accuracy score is', score)