实验三 贝叶斯分类
目录
- 实验目的
熟悉贝叶斯分类原理和方法,并对MNIST数据集(在我的上传资源可以找到https://download.csdn.net/download/lagoon_lala/11033126)进行分类。
MNIST数据集相似数据地址:
https://github.com/WenDesi/lihang_book_algorithm/blob/master/data/train.csv
- 实验内容
使用贝叶斯算法对MNIST数据集进行分类,并进行准确率计算。
- 编写程序调试
import numpy as np
from functools import reduce
import operator
# 读取数据集
def read_data(xfile, yfile, test_rate):
    """Load feature/label CSV files, binarize the features, and split train/test.

    :param xfile: path to the CSV file of feature rows
    :param yfile: path to the CSV file of labels (one per row)
    :param test_rate: fraction of rows held out as the test split
    :return: (x_train, y_train, x_test, y_test)
    """
    with open(xfile, "rb") as fx:
        x = np.loadtxt(fx, delimiter=",", skiprows=0)
    with open(yfile, "rb") as fy:
        y = np.loadtxt(fy, delimiter=",", skiprows=0)
    # Binarize: any positive pixel value becomes 1, everything else 0.
    x = np.where(x > 0, 1, 0)
    # The leading (1 - test_rate) portion of rows is the training split.
    split = int(x.shape[0] * (1 - test_rate))
    return x[:split], y[:split], x[split:], y[split:]
if __name__ == '__main__':
    # Hold out one twelfth of the rows for testing; the rest is training data.
    test_fraction = 1 / 12
    x_train, y_train, x_test, y_test = read_data(
        "dataset/newdata.csv", "dataset/newdatalabel.csv", test_fraction)
    # Sanity-check the shapes of the resulting splits.
    print("x_train.shape = {}".format(x_train.shape))
    print("y_train.shape = {}".format(y_train.shape))
    print("x_test.shape = {}".format(x_test.shape))
    print("y_test.shape = {}".format(y_test.shape))
四、撰写实验报告
# 请同学们利用朴素贝叶斯算法进行分类。
# 计算p(y)
# 计算p(x | y)
# 计算p(yi | x)
# 计算训练集-多项式朴素贝叶斯准确率
# 计算测试集-多项式朴素贝叶斯准确率
np.where
np.where(condition, x, y)
逐元素判断:当条件(condition)为真时输出 x 中对应的元素,否则输出 y 中对应的元素。
np.zeros
https://blog.csdn.net/qq_36621927/article/details/79763585
实验代码1
import numpy as np
from functools import reduce
import operator
from sklearn.metrics import accuracy_score
# 读取数据集
def read_data(xfile, yfile, test_rate):
    """Read the MNIST-style CSVs and return binarized train/test splits.

    :param xfile: CSV file with one feature row per sample
    :param yfile: CSV file with one label per sample
    :param test_rate: fraction of the data reserved for testing
    :return: (x_train, y_train, x_test, y_test)
    """
    features = np.loadtxt(open(xfile, "rb"), delimiter=",", skiprows=0)
    labels = np.loadtxt(open(yfile, "rb"), delimiter=",", skiprows=0)
    # Binarize the pixels: positive values map to 1, the rest to 0.
    features = np.where(features > 0, 1, 0)
    n_rows = features.shape[0]
    # Everything before the cut index trains the model; the rest evaluates it.
    cut = int(n_rows * (1 - test_rate))
    x_train, x_test = features[:cut], features[cut:]
    y_train, y_test = labels[:cut], labels[cut:]
    return x_train, y_train, x_test, y_test
def Train(trainset, train_labels, num_classes=None, num_features=None):
    """Estimate naive-Bayes parameters from binarized training images.

    :param trainset: array of shape (n_samples, num_features) with 0/1 pixels
    :param train_labels: n_samples class labels (castable to int)
    :param num_classes: number of classes; defaults to module-level class_num
    :param num_features: pixels per sample; defaults to module-level feature_len
    :return: (prior_probability, conditional_probability) where
             prior_probability[c] is the raw count of samples labeled c, and
             conditional_probability[c][j][v] is the conditional frequency of
             pixel j having value v in class c, rescaled into [1, 1000001]
             so that later products never collapse to zero.
    """
    if num_classes is None:
        num_classes = class_num      # module-level constant
    if num_features is None:
        num_features = feature_len   # module-level constant

    # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the correct dtype here.
    prior_probability = np.zeros((num_classes,), dtype=int)
    conditional_probability = np.zeros((num_classes, num_features, 2))

    # Count label occurrences and, per class, per-pixel value occurrences.
    for i in range(len(train_labels)):
        img = trainset[i]
        label = int(train_labels[i])
        prior_probability[label] += 1
        for j in range(num_features):
            conditional_probability[label][j][img[j]] += 1

    # Rescale counts to probability * 1e6 + 1, i.e. into [1, 1000001], so the
    # truncating integer products used at prediction time stay non-zero.
    for i in range(num_classes):
        for j in range(num_features):
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            total = pix_0 + pix_1
            if total == 0:
                # BUGFIX: class i never occurred in the training set; the
                # original divided by zero here. With no evidence, leave both
                # entries at the neutral floor of 1.
                conditional_probability[i][j][0] = 1
                conditional_probability[i][j][1] = 1
                continue
            conditional_probability[i][j][0] = (float(pix_0) / float(total)) * 1000000 + 1
            conditional_probability[i][j][1] = (float(pix_1) / float(total)) * 1000000 + 1

    return prior_probability, conditional_probability
# Score one image against one class.
def calculate_probability(img, label, prior=None, conditional=None):
    """Compute the unnormalized naive-Bayes score of class `label` for `img`.

    :param img: binarized image, an iterable of 0/1 pixel values
    :param label: class index to score
    :param prior: per-class sample counts; defaults to the module-level
                  prior_probability set up in __main__ (original behavior)
    :param conditional: rescaled conditional table from Train(); defaults to
                        the module-level conditional_probability
    :return: int product prior[label] * prod_j conditional[label][j][img[j]],
             with each factor truncated to int as in the original
    """
    if prior is None:
        prior = prior_probability            # module-level, set in __main__
    if conditional is None:
        conditional = conditional_probability
    probability = int(prior[label])
    for i in range(len(img)):
        # int() truncation keeps the running product an exact Python integer,
        # which cannot overflow even over 784 factors.
        probability *= int(conditional[label][i][img[i]])
    return probability
def Predict(testset, prior_probability, conditional_probability):
    """Classify each binarized image by its highest naive-Bayes score.

    Fixes two defects of the original: the prior/conditional parameters are
    now actually used (the original ignored them and read module globals via
    calculate_probability), and the number of classes is taken from the prior
    vector instead of being hard-coded to 10.

    :param testset: iterable of binarized images (0/1 pixel vectors)
    :param prior_probability: per-class sample counts from Train()
    :param conditional_probability: rescaled conditional table from Train()
    :return: np.array of predicted class indices
    """
    num_classes = len(prior_probability)

    def score(img, label):
        # Same arithmetic as calculate_probability, but on the supplied tables.
        p = int(prior_probability[label])
        for i in range(len(img)):
            p *= int(conditional_probability[label][i][img[i]])
        return p

    predict = []
    for img in testset:
        # Start from class 0 and keep the argmax over all classes.
        max_label = 0
        max_probability = score(img, 0)
        for j in range(1, num_classes):
            probability = score(img, j)
            if probability > max_probability:
                max_label = j
                max_probability = probability
        predict.append(max_label)
    return np.array(predict)
# Number of digit classes (MNIST labels 0-9).
class_num = 10
# Pixels per image (28 x 28, flattened).
feature_len = 784
if __name__ == '__main__':
    # Load the data, holding out 1/12 of the rows as the test split.
    x_train, y_train, x_test, y_test = read_data("dataset/newdata.csv", "dataset/newdatalabel.csv", 1 / 12)
    print("x_train.shape = {}".format(x_train.shape))
    print("y_train.shape = {}".format(y_train.shape))
    print("x_test.shape = {}".format(x_test.shape))
    print("y_test.shape = {}".format(y_test.shape))
    # Estimate p(y) (as class counts) and p(x | y) (rescaled conditional table).
    prior_probability, conditional_probability = Train(x_train, y_train)
    # Score p(y | x) for every test image and take the argmax class.
    print('Start predicting')
    test_predict = Predict(x_test, prior_probability, conditional_probability)
    # Training-set accuracy.
    train_predict = Predict(x_train, prior_probability, conditional_probability)
    score = accuracy_score(y_train, train_predict)
    # BUGFIX: corrected the misspelled message ("accruacy socre").
    print("The train accuracy score is ", score)
    # Test-set accuracy.
    score = accuracy_score(y_test, test_predict)
    print("The test accuracy score is ", score)
代码版本2
import numpy as np
from functools import reduce
import operator
import collections
from sklearn.naive_bayes import GaussianNB
# 读取数据集
def read_data(xfile, yfile, test_rate):
    """Load the feature and label CSVs, binarize, and split into train/test.

    :param xfile: CSV of feature rows
    :param yfile: CSV of labels, one per row
    :param test_rate: fraction of rows kept back as the test set
    :return: (x_train, y_train, x_test, y_test)
    """
    x = np.loadtxt(open(xfile, "rb"), delimiter=",", skiprows=0)
    y = np.loadtxt(open(yfile, "rb"), delimiter=",", skiprows=0)
    # Collapse grayscale pixels to 0/1: positive means "ink present".
    x = np.where(x > 0, 1, 0)
    # Split at the row index that leaves test_rate of the data at the end.
    train_count = int(x.shape[0] * (1 - test_rate))
    train_x, test_x = np.split(x, [train_count])
    train_y, test_y = np.split(y, [train_count])
    return train_x, train_y, test_x, test_y
def predict(A, record, result, y_num_l):
    """Score one sample against every class and check the prediction.

    :param A: A[k][j] = estimated P(pixel j == 1 | class k)
    :param record: one binarized sample (iterable of 0/1 values)
    :param result: the true label of the sample
    :param y_num_l: dict mapping class label -> training count (acts as the
                    unnormalized prior P(y))
    :return: 1 if the argmax class equals the true label, else 0
    """
    best_class = -1
    best_score = 0
    for k in range(len(y_num_l)):
        # p(x | y=k) under the naive independence assumption ...
        likelihood = 1
        for j, pixel in enumerate(record):
            likelihood *= A[k][j] if pixel else (1 - A[k][j])
        # ... times the (unnormalized) prior count of class k gives p(y=k | x)
        # up to a constant factor.
        likelihood *= y_num_l[k]
        # Track the class with the largest posterior score.
        if likelihood > best_score:
            best_score = likelihood
            best_class = k
    return 1 if best_class == int(result) else 0
if __name__ == '__main__':
    # Load the data, keeping 1/12 of the rows as the test split.
    x_train, y_train, x_test, y_test = read_data("newdata.csv", "newdatalabel.csv", 1 / 12)
    print("x_train.shape = {}".format(x_train.shape))
    print("y_train.shape = {}".format(y_train.shape))
    print("x_test.shape = {}".format(x_test.shape))
    print("y_test.shape = {}".format(y_test.shape))
    print(y_train)
    # Estimate P(y): count how often each label occurs.
    y_num = collections.Counter(y_train)
    print(y_num)
    for i in y_num:
        print("y={}的个数为{},P(y={})={}".format(i, y_num[i], i, y_num[i] / len(y_train)))
    # Plain dict of label -> count. NOTE: labels loaded by np.loadtxt are
    # floats, but int keys hash-match them (0 == 0.0), so y_num_l[k] with an
    # int class index below still works.
    y_num_l = dict(y_num)
    print(y_num_l)
    # Estimate p(x | y): A[c][j] accumulates how often pixel j is 1 in class c.
    n_features = len(x_train[0])
    A = [[0] * n_features for _ in range(len(y_num))]
    for i in range(len(x_train)):
        for j in range(n_features):
            A[int(y_train[i])][j] += x_train[i][j]
    print(A)
    print(len(A))
    # Turn counts into conditional probabilities by dividing by the class size.
    # BUGFIX: the original iterated range(len(x_train[i])) here, reusing the
    # sample-row index variable as a class index; len(A[i]) (the feature
    # count) is what was intended.
    for i in range(len(y_num)):
        for j in range(len(A[i])):
            A[i][j] = A[i][j] / y_num_l[i]
    print(A)
    # Accuracy: fraction of samples whose argmax class matches the label.
    prenum = 0
    for i in range(len(x_train)):
        prenum += predict(A, x_train[i], y_train[i], y_num_l)
    print("训练集的准确率为{}".format(prenum / len(x_train)))
    prenum = 0
    for i in range(len(x_test)):
        prenum += predict(A, x_test[i], y_test[i], y_num_l)
    print("测试集的准确率为{}".format(prenum / len(x_test)))