朴素贝叶斯生成新的样本数据,主要依靠训练时计算出的先验概率 p(y) 和似然 p(x|y)。首先按先验概率 p(y) 的大小随机抽取一个类别 c,然后按该类别的似然概率 p(x_j|y=c) 逐维生成数据,最后就得到了新的样本。数据集加载方法 load_mnist 可以自行修改;这里用到的 mnist-original.mat 文件需要先下载,再放入当前路径的 datasets/mldata 目录下。
from sklearn.datasets import fetch_mldata
from collections import Counter
import numpy as np
#加载MNIST数据集
def load_mnist():
    """Load MNIST, binarise the pixels and split into train/test parts.

    The data is fetched with ``fetch_mldata('MNIST original')`` using
    ``./datasets`` as the data home.  Every pixel value greater than 0 is
    replaced by 1; zeros stay as they are.  The first 60000 rows form the
    training part and the remaining rows the test part.

    NOTE(review): ``fetch_mldata`` was deprecated in scikit-learn 0.20 and
    removed in 0.22, so this loader needs an older scikit-learn release.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``.
    """
    mnist = fetch_mldata('MNIST original', data_home='./datasets')
    pixels = mnist["data"]
    labels = mnist["target"]

    # Binarise: any non-zero intensity counts as an "on" pixel.
    pixels = np.where(pixels > 0, 1, pixels)

    n_train = 60000
    return pixels[:n_train], labels[:n_train], pixels[n_train:], labels[n_train:]
# Naive Bayes classifier; fit() estimates, per class, the probability that each feature equals 1.
class NaiveBayes():
prob_c = None #prior probability p(y=c) of each class; assigned by fit()
prob_cj = None #per class c and feature j, the smoothed probability that feature j equals 1; assigned by fit()
y_pred = None #predicted class index for each test sample; assigned by predict()
y_pred_prob = None #per-sample list of unnormalised per-class scores (prior times likelihood); assigned by predict()
#训练模型,计算先验概率和似然概率
def fit(self, X_train, y_train):
    """Estimate class priors and per-feature Bernoulli likelihoods.

    Classes are indexed by their position in ascending label order, so
    index ``i`` in every returned list refers to the ``i``-th smallest
    label found in ``y_train``.  This keeps the class index stable no
    matter in which order the labels appear in the training data.

    Args:
        X_train: 2-D array-like of shape (n_samples, n_features).  An
            entry counts as "on" only when it equals 1.
        y_train: 1-D array-like holding one class label per sample.

    Returns:
        A tuple ``(prob_c, prob_cj)`` where ``prob_c[i]`` is the prior
        p(y = class i) and ``prob_cj[i][j]`` is the add-one smoothed
        probability p(x_j = 1 | y = class i).  Both are plain lists and
        are also stored on ``self``; the sorted labels are stored as
        ``self.classes_``.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # np.unique yields the labels in ascending order together with their
    # frequencies, which fixes the class-index <-> label correspondence.
    classes, counts = np.unique(y_train, return_counts=True)
    prob_c = (counts / len(y_train)).tolist()

    prob_cj = []
    for c, n_c in zip(classes, counts):
        # Select the samples of this class once, not once per feature.
        c_train = X_train[y_train == c]
        ones_per_feature = (c_train == 1).sum(axis=0)
        # Add-one (Laplace) smoothing: each feature has two possible
        # values (1 or not 1), hence "+ 2" in the denominator.
        prob_cj.append(((ones_per_feature + 1) / (n_c + 2)).tolist())

    self.classes_ = classes.tolist()
    self.prob_c = prob_c
    self.prob_cj = prob_cj
    return prob_c, prob_cj
#预测
def predict(self, X_test):
    """Predict the most probable class index for every test sample.

    The score of class ``i`` for a sample ``x`` is the Bernoulli joint
    probability ``p(y=i) * prod_j p_ij^[x_j == 1] * (1 - p_ij)^[x_j != 1]``
    where ``p_ij = self.prob_cj[i][j]``.  The factor for "off" features
    is part of the likelihood; leaving it out favours classes whose
    features are frequently on.  Scores are accumulated in log space
    because a product of hundreds of probabilities underflows to 0.0.

    Args:
        X_test: 2-D array-like of shape (n_samples, n_features).  An
            entry counts as "on" only when it equals 1.

    Returns:
        1-D integer array with the index (into ``self.prob_c``) of the
        highest-scoring class per sample; empty when ``X_test`` is empty.
        The same array is stored as ``self.y_pred``.  The per-sample,
        per-class joint probabilities are stored as a list of lists in
        ``self.y_pred_prob`` (entries may underflow to 0.0; the decision
        itself is made on the log scores).
    """
    X_test = np.asarray(X_test)
    prior = np.asarray(self.prob_c, dtype=float)
    p_on = np.asarray(self.prob_cj, dtype=float)  # (n_classes, n_features)

    # log(0) = -inf is a valid "impossible" score, so silence the warning.
    with np.errstate(divide='ignore'):
        log_prior = np.log(prior)
        log_on = np.log(p_on)
        log_off = np.log1p(-p_on)

    log_joint = np.empty((len(X_test), len(prior)))
    for row, x in enumerate(X_test):
        # Pick log p for on-features and log(1 - p) for the rest; selecting
        # (instead of multiplying by 0/1) avoids 0 * -inf = nan.
        log_joint[row] = np.where(x == 1, log_on, log_off).sum(axis=1) + log_prior

    # The index of the largest score is the predicted class.
    y_pred = np.argmax(log_joint, axis=1)
    self.y_pred_prob = np.exp(log_joint).tolist()
    self.y_pred = y_pred
    return y_pred
def lossFun(self, y_pred, y_test):
    """Return the fraction of predictions that equal the true labels.

    Despite its name this is an accuracy score: 1.0 means every entry of
    ``y_pred`` matches the entry of ``y_test`` at the same position.

    Args:
        y_pred: Sequence of predicted labels.
        y_test: Sequence of true labels, indexed in parallel with ``y_pred``.

    Returns:
        Number of matching positions divided by ``len(y_pred)``.
    """
    total = len(y_pred)
    hits = sum(1 for idx, guess in enumerate(y_pred) if guess == y_test[idx])
    return hits / total
# 计算累加概率,可以参考轮盘赌算法
def sum_prob(self, prob_list):
    """Return the cumulative sums of ``prob_list``.

    Element ``i`` of the result is ``prob_list[0] + ... + prob_list[i]``,
    added left to right.  Computed with a single running total, so the
    cost is linear in the length of the input.

    Args:
        prob_list: Iterable of numbers (e.g. class probabilities).

    Returns:
        A list of the same length holding the running totals; an empty
        list for empty input.
    """
    cumulative = []
    running_total = 0
    for prob in prob_list:
        # Rebind instead of "+=" so array-like entries are never mutated
        # in place and each stored total is an independent value.
        running_total = running_total + prob
        cumulative.append(running_total)
    return cumulative
def generateData(self):
# 计算p(y)的累加概率
sum_prob_c = self.sum_prob(self.prob_c)
# 随机生成[0, 1]之间的随机数
rand_prob_c = np.random