Machine Learning From Scratch (1): Bayes Classifiers
- This post shares the code for two Bayes classifiers: naive_bayes.py implements a naive Bayes classifier, normal_bayes.py a normal (Gaussian) Bayes classifier, and utils.py the tools for visualizing the results.
- For the dataset, see part 0 of this series.
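Both models apply the Bayes decision rule: choose the class that maximizes the posterior, which by Bayes' theorem is proportional to the class prior times the likelihood. Under the conditional-independence assumption used in naive_bayes.py this becomes

$$\hat{c} = \arg\max_c P(c)\prod_{i=1}^{n} P(x_i \mid c),$$

where discrete features contribute smoothed frequency estimates of $P(x_i \mid c)$ and continuous features a univariate normal density; normal_bayes.py instead models the joint likelihood $P(x \mid c)$ with one multivariate normal per class.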
naive_bayes.py
import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization


class NaiveBayesClassifier:
    def __init__(self, n_class):
        self.pc = None                # prior probability of each class
        self.dis_i = []               # indices of the discrete features
        self.con_i = []               # indices of the continuous features
        self.n_feats = 0
        self.dis_feat_count = None    # per-class value counts of the discrete features
        self.con_feat_count = None    # per-class (mean, std) of the continuous features
        self.n_class = n_class

    def fit(self, data_all, targets, dis_i=[]):  # dis_i: indices of the discrete features
        cla_count = np.bincount(targets)
        self.pc = cla_count / np.sum(cla_count)  # probability of each class in the training set
        self.dis_i = dis_i
        self.con_i = list(range(data_all.shape[1]))
        for i in dis_i:
            self.con_i.remove(i)  # drop the discrete indices; what remains are the continuous ones
        self.n_feats = data_all.shape[1]
        all_dis_feat_count = []  # per-class statistics of the discrete features
        all_con_feat_count = []  # per-class statistics of the continuous features
        max_nv = 0  # number of distinct values a discrete feature can take
        for i in dis_i:
            max_nv = max(max_nv, np.max(np.unique(data_all[:, i])))
        max_nv = int(max_nv + 1)
        for cla in np.unique(targets):
            data = data_all[targets == cla]  # the samples of one class
            dis_feat_count = []
            con_feat_count = []
            for i in range(data.shape[1]):
                data_i = data[:, i]
                if i in dis_i:  # discrete feature: count each value
                    data_i = data_i.astype(np.int32)
                    # Laplace smoothing: minlength pads values unseen in this class,
                    # so every bin gets +1 and all arrays share one length
                    count = np.bincount(data_i, minlength=max_nv) + 1
                    dis_feat_count.append(count)
                else:  # continuous feature: record mean and standard deviation
                    mu = np.mean(data_i)
                    sigma = np.std(data_i, ddof=1)
                    con_feat_count.append([mu, sigma])
            all_dis_feat_count.append(np.array(dis_feat_count))
            all_con_feat_count.append(np.array(con_feat_count))
        self.dis_feat_count = np.array(all_dis_feat_count)   # shape (n_class, n_discrete, max_nv)
        self.con_feat_count = np.array(all_con_feat_count)   # shape (n_class, n_continuous, 2)

    def _P(self, x, i):  # probability of feature i under each class
        if i in self.dis_i:
            index = self.dis_i.index(i)
            x = x.astype(np.int32)
            # discrete feature: smoothed count / total count per class
            p = self.dis_feat_count[:, index, x].T / np.sum(self.dis_feat_count[:, index], axis=1)
        else:
            index = self.con_i.index(i)
            t = self.con_feat_count[:, index]
            mu = t[:, 0]
            sigma = t[:, 1]
            # continuous feature: approximate the probability with the normal pdf value
            p = 1 / (np.sqrt(2 * np.pi) * sigma) * np.exp(-(x.reshape((x.shape[0], 1)) - mu) ** 2 / (2 * sigma ** 2))
        return p  # shape (n_samples, n_class)

    def predict(self, data):
        p = np.tile(self.pc, (data.shape[0], 1))  # start from the class priors, applied exactly once
        for i in range(data.shape[1]):
            feat = data[:, i]
            # naive assumption: the joint likelihood is the product of per-feature probabilities
            # (for many features, accumulating log-probabilities would be numerically safer)
            p = p * self._P(feat, i)
        cla = np.argmax(p, axis=1)
        return cla


if __name__ == '__main__':
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=2021, test_size=0.3)
    nb = NaiveBayesClassifier(n_class=2)
    nb.fit(X_train, y_train, dis_i=[11])
    res = nb.predict(X_test)
    score = np.sum(res == y_test) / len(y_test)
    print('Acc:', score)
    vis = Visualization(colors=['red', 'blue'])
    vis.fit(X_test)
    vis.savefig(y_test, res, './rf_diy_res_in_testset.png')
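Before running it on the wine dataset, a minimal sanity check on synthetic data can be useful (the distributions, sample counts, and test points below are made up for illustration; column 0 is continuous, column 1 discrete):

    import numpy as np
    # assumes NaiveBayesClassifier from naive_bayes.py above is in scope
    rng = np.random.default_rng(0)
    n = 100
    X0 = np.column_stack([rng.normal(0.0, 1.0, n), rng.integers(0, 2, n)])  # class 0: discrete values {0, 1}
    X1 = np.column_stack([rng.normal(3.0, 1.0, n), rng.integers(1, 3, n)])  # class 1: discrete values {1, 2}
    X = np.concatenate([X0, X1])
    y = np.concatenate([np.zeros(n, dtype=np.int64), np.ones(n, dtype=np.int64)])
    nb = NaiveBayesClassifier(n_class=2)
    nb.fit(X, y, dis_i=[1])  # treat feature 1 as discrete
    print(nb.predict(np.array([[0.1, 0.0], [2.9, 2.0]])))  # expected: [0 1]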
normal_bayes.py
import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization


class NormalBayesClassifier:
    def __init__(self, n_class):
        self.pc = None       # prior probability of each class
        self.n_class = n_class
        self.sigma = []      # covariance matrix of each class's data
        self.mu = []         # mean vector of each class's data

    def fit(self, data_all, targets):
        cla_count = np.bincount(targets)
        self.pc = cla_count / np.sum(cla_count)
        for i in range(cla_count.shape[0]):
            data = data_all[targets == i]
            mu = np.mean(data, axis=0)
            self.mu.append(mu)
            sigma = np.cov(data, rowvar=False)
            self.sigma.append(sigma)

    def _P(self, x):
        n = x.shape[1]
        p = []
        for i in range(self.n_class):  # multivariate normal density of the samples under each class
            a = 1 / ((2 * np.pi) ** (n / 2) * np.linalg.det(self.sigma[i]) ** (1 / 2))
            t = (x - self.mu[i]) @ np.linalg.pinv(self.sigma[i])
            b = np.exp(-(1 / 2) * np.sum(t * (x - self.mu[i]), axis=1))
            p.append(a * b * self.pc[i])
        return np.array(p)  # shape (n_class, n_samples)

    def predict(self, data):
        p = self._P(data)
        cla = np.argmax(p, axis=0)
        return cla


if __name__ == '__main__':
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=2021, test_size=0.3)
    nb = NormalBayesClassifier(n_class=2)
    nb.fit(X_train, y_train)
    res = nb.predict(X_test)
    score = np.sum(res == y_test) / len(y_test)
    print('Acc:', score)
    vis = Visualization(colors=['red', 'blue'])
    vis.fit(X_test)
    vis.savefig(y_test, res, './nor.png')
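For reference, the per-class quantity computed in _P is the multivariate normal density weighted by the class prior,

$$P(c)\,p(x \mid c) = \frac{P(c)}{(2\pi)^{n/2}\,\lvert\Sigma_c\rvert^{1/2}} \exp\left(-\frac{1}{2}(x-\mu_c)^{\top}\Sigma_c^{-1}(x-\mu_c)\right),$$

with $\mu_c$ and $\Sigma_c$ the mean vector and covariance matrix estimated in fit; predict returns the argmax over classes. Using np.linalg.pinv rather than a plain inverse means a singular covariance matrix (e.g. from collinear features) degrades gracefully instead of raising an error.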
utils.py
from datasets.dataset import DataSet
import matplotlib.pyplot as plt
import numpy as np
from sklearn import manifold


class Visualization:
    def __init__(self, colors):
        self.colors = colors  # one color per class, indexed by the predicted label
        self.x_norm = None

    def fit(self, data):
        # project the data to 2-D with t-SNE and normalize the coordinates to [0, 1]
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=2021)
        X_tsne = tsne.fit_transform(data)
        x_min, x_max = X_tsne.min(0), X_tsne.max(0)
        self.x_norm = (X_tsne - x_min) / (x_max - x_min)

    def show(self, gt, pred):
        # draw each sample as its ground-truth label, colored by the predicted class
        plt.figure(figsize=(8, 8))
        for i in range(self.x_norm.shape[0]):
            plt.text(self.x_norm[i, 0], self.x_norm[i, 1], str(gt[i]), color=self.colors[pred[i]],
                     fontdict={'weight': 'bold', 'size': 8})
        plt.xticks([])
        plt.yticks([])
        plt.show()

    def savefig(self, gt, pred, file_name):
        # same plot as show(), but written to file_name instead of displayed
        plt.figure(figsize=(8, 8))
        for i in range(self.x_norm.shape[0]):
            plt.text(self.x_norm[i, 0], self.x_norm[i, 1], str(gt[i]), color=self.colors[pred[i]],
                     fontdict={'weight': 'bold', 'size': 8})
        plt.xticks([])
        plt.yticks([])
        plt.savefig(file_name)


class Metrics:
    def __init__(self, n_classes=2):
        self.n_classes = n_classes
        # rows are ground-truth labels, columns are predictions
        self.confusion_matrix = [[0 for _ in range(n_classes)] for _ in range(n_classes)]
        self.labels = []
        self.preds = []
        self.ALL = None
        self.TN = None
        self.FP = None
        self.FN = None
        self.TP = None

    def empty(self):
        self.confusion_matrix = [[0 for _ in range(self.n_classes)] for _ in range(self.n_classes)]
        self.labels = []
        self.preds = []

    def update(self, preds, labels):
        for i in range(len(labels)):
            self.confusion_matrix[labels[i]][preds[i]] += 1

    def count(self):
        # read TP/TN/FP/FN off the binary confusion matrix
        confusion_matrix = np.array(self.confusion_matrix)
        self.ALL = np.sum(confusion_matrix)
        self.TN = confusion_matrix[0, 0]
        self.FP = confusion_matrix[0, 1]
        self.FN = confusion_matrix[1, 0]
        self.TP = confusion_matrix[1, 1]

    def accuracy(self):
        return (self.TP + self.TN) / (self.TP + self.TN + self.FP + self.FN)

    def recall(self):
        return self.TP / (self.TP + self.FN)

    def precision(self):
        return self.TP / (self.TP + self.FP)


if __name__ == '__main__':
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()
    # take 200 samples from each end of the dataset (one chunk per class)
    X_train = np.concatenate([data[:200], data[-200:]])
    Y_train = np.concatenate([target[:200], target[-200:]])
    # min-max normalize before t-SNE
    X_train = (X_train - np.min(X_train, axis=0)) / (np.max(X_train, axis=0) - np.min(X_train, axis=0))
    vis = Visualization(['red', 'blue'])
    vis.fit(X_train)
    vis.show(Y_train, Y_train)  # color the labels by themselves as a sanity check
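The Metrics class is never exercised by the demos above; a minimal sketch of how it slots in, with made-up toy predictions:

    m = Metrics(n_classes=2)
    m.update(preds=[1, 0, 1, 1], labels=[1, 0, 0, 1])  # TP=2, TN=1, FP=1, FN=0
    m.count()
    print(m.accuracy(), m.recall(), m.precision())  # -> 0.75, 1.0, ~0.667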