机器学习手撕代码(1)贝叶斯分类器

机器学习手撕代码(1)贝叶斯分类器

  • 本篇分享一下贝叶斯分类器的代码,naive_bayes.py为朴素贝叶斯分类器,normal_bayes.py中为正态贝叶斯分类器。utils.py中为可视化结果的工具。
  • dataset见本系列第0篇。

naive_bayes.py

import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization

class NaiveBayesClassifier:
    """Naive Bayes classifier for a mix of discrete and continuous features.

    Discrete features are modelled with Laplace-smoothed frequency counts;
    continuous features with a per-class univariate Gaussian (mean/std).
    """

    def __init__(self, n_class):
        self.pc = None               # class priors, shape (n_class,), set by fit()
        self.dis_i = []              # indices of discrete features
        self.con_i = []              # indices of continuous features
        self.n_feats = 0             # total number of features seen in fit()
        self.dis_feat_count = None   # (n_class, n_dis, max_nv) smoothed value counts
        self.con_feat_count = None   # (n_class, n_con, 2) per-class [mean, std]
        self.n_class = n_class

    def fit(self, data_all, targets, dis_i=None):
        """Estimate class priors and per-feature statistics.

        Parameters
        ----------
        data_all : ndarray, shape (n_samples, n_features)
        targets : ndarray of int class labels in [0, n_class)
        dis_i : list of int, optional
            Indices of the discrete (categorical) features; every remaining
            feature is treated as continuous.
        """
        # Avoid the mutable-default-argument pitfall of the original `dis_i=[]`.
        dis_i = [] if dis_i is None else list(dis_i)
        cla_count = np.bincount(targets)
        self.pc = cla_count / np.sum(cla_count)  # empirical class priors
        self.dis_i = dis_i
        # Continuous feature indices = everything not listed as discrete.
        self.con_i = [i for i in range(data_all.shape[1]) if i not in dis_i]
        self.n_feats = data_all.shape[1]
        # Number of distinct values a discrete feature can take (max value + 1),
        # shared across features so the count arrays have a common length.
        max_nv = 0
        for i in dis_i:
            max_nv = max(max_nv, np.max(np.unique(data_all[:, i])))
        max_nv = int(max_nv + 1)
        all_dis_feat_count = []  # per-class discrete value counts
        all_con_feat_count = []  # per-class continuous [mean, std]
        for cla in np.unique(targets):
            data = data_all[targets == cla]  # rows belonging to this class
            dis_feat_count = []
            con_feat_count = []
            for i in range(data.shape[1]):
                data_i = data[:, i]
                if i in dis_i:  # discrete feature
                    data_i = data_i.astype(np.int32)
                    # minlength=max_nv BEFORE the +1 gives every possible value
                    # a count of at least 1 (true Laplace smoothing).  The
                    # original appended zeros after the +1, so values unseen in
                    # a class got probability 0 — exactly what smoothing is
                    # supposed to prevent.
                    count = np.bincount(data_i, minlength=max_nv) + 1
                    dis_feat_count.append(count.astype(np.float64))
                else:  # continuous feature: fit a Gaussian
                    mu = np.mean(data_i)
                    sigma = np.std(data_i, ddof=1)  # sample std
                    con_feat_count.append([mu, sigma])
            all_dis_feat_count.append(np.array(dis_feat_count))
            all_con_feat_count.append(np.array(con_feat_count))
        self.dis_feat_count = np.array(all_dis_feat_count)
        self.con_feat_count = np.array(all_con_feat_count)

    def _P(self, x, i):
        """Return the likelihood P(x | c) of feature i, shape (n_samples, n_class).

        NOTE: unlike the original, the class prior is NOT multiplied in here.
        Doing it per feature raised the prior to the n_feats-th power and
        skewed the decision; `predict` now applies the prior exactly once.
        """
        if i in self.dis_i:
            index = self.dis_i.index(i)
            x = x.astype(np.int32)
            counts = self.dis_feat_count[:, index, x]            # (n_class, n_samples)
            totals = np.sum(self.dis_feat_count[:, index], axis=1, keepdims=True)
            # Transpose — NOT reshape.  The original reshape((n_samples, n_class))
            # scrambled class/sample entries whenever n_samples != n_class.
            p = (counts / totals).T
        else:
            index = self.con_i.index(i)
            t = self.con_feat_count[:, index]
            mu = t[:, 0]
            sigma = t[:, 1]
            # Gaussian pdf value used as the likelihood of a continuous feature.
            p = (1 / (np.sqrt(2 * np.pi) * sigma)
                 * np.exp(-(x.reshape((-1, 1)) - mu) ** 2 / (2 * sigma ** 2)))
        return p

    def predict(self, data):
        """Predict class labels for `data`, returning shape (n_samples,)."""
        # Start from the prior, then multiply in each feature's likelihood
        # (the naive conditional-independence assumption).
        p = np.ones((data.shape[0], self.n_class)) * self.pc
        for i in range(data.shape[1]):
            p = p * self._P(data[:, i], i)
        return np.argmax(p, axis=1)


if __name__ == '__main__':
    # Raw string for the Windows path: the original non-raw literal relied on
    # invalid escape sequences (\P, \m, \d, \w), a SyntaxWarning since
    # Python 3.12 and a silent bug if the path ever contained a real escape.
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=2021, test_size=0.3)
    nb = NaiveBayesClassifier(n_class=2)
    # Feature 11 of the wine dataset is categorical; the rest are continuous.
    nb.fit(X_train, y_train, dis_i=[11])
    res = nb.predict(X_test)
    score = np.sum(res == y_test) / len(y_test)  # plain accuracy
    print('Acc:', score)
    vis = Visualization(colors=['red', 'blue'])
    vis.fit(X_test)
    vis.savefig(y_test, res, './rf_diy_res_in_testset.png')


normal_bayes.py

import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization

class NormalBayesClassifier:
    """Bayes classifier modelling each class as a full multivariate Gaussian."""

    def __init__(self, n_class):
        self.pc = None      # class priors, shape (n_class,), set by fit()
        self.n_class = n_class
        self.sigma = []     # per-class covariance matrices
        self.mu = []        # per-class mean vectors

    def fit(self, data_all, targets):
        """Estimate priors and per-class mean/covariance.

        The stored statistics are reset first: the original appended to
        `self.mu`/`self.sigma` without clearing them, so a second `fit`
        silently corrupted the model.
        """
        cla_count = np.bincount(targets)
        self.pc = cla_count / np.sum(cla_count)
        self.mu = []
        self.sigma = []
        for i in range(cla_count.shape[0]):
            data = data_all[targets == i]
            self.mu.append(np.mean(data, axis=0))
            self.sigma.append(np.cov(data, rowvar=False))

    def _P(self, x):
        """Return per-class posterior scores for samples `x`, shape (n_class, n_samples)."""
        n = x.shape[1]
        p = []
        for i in range(self.n_class):
            # Multivariate normal density:
            #   (2*pi)^(-n/2) * |Sigma|^(-1/2) * exp(-1/2 * d^T Sigma^-1 d)
            a = 1 / ((2 * np.pi) ** (n / 2) * np.linalg.det(self.sigma[i]) ** 0.5)
            diff = x - self.mu[i]                        # (n_samples, n)
            # pinv (rather than inv) tolerates a singular covariance matrix;
            # plain ndarray ops replace the deprecated np.mat arithmetic.
            t = diff @ np.linalg.pinv(self.sigma[i])     # (n_samples, n)
            b = np.exp(-0.5 * np.sum(t * diff, axis=1))  # (n_samples,)
            p.append(a * b * self.pc[i])
        return np.array(p)

    def predict(self, data):
        """Predict class labels for `data`, returning shape (n_samples,)."""
        p = self._P(data)
        cla = np.argmax(p, axis=0)
        return cla.reshape((data.shape[0],))


if __name__ == '__main__':
    # Raw string for the Windows path: the original non-raw literal relied on
    # invalid escape sequences (\P, \m, \d, \w), a SyntaxWarning since
    # Python 3.12 and a silent bug if the path ever contained a real escape.
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=2021, test_size=0.3)
    nb = NormalBayesClassifier(n_class=2)
    nb.fit(X_train, y_train)
    res = nb.predict(X_test)
    score = np.sum(res == y_test) / len(y_test)  # plain accuracy
    print(score)
    vis = Visualization(colors=['red', 'blue'])
    vis.fit(X_test)
    vis.savefig(y_test, res, './nor.png')

utils.py

from datasets.dataset import DataSet
import matplotlib.pyplot as plt
import numpy as np
from sklearn import manifold

class Visualization:
    """t-SNE based 2-D visualisation of labelled data.

    Call `fit` once to embed the data, then `show` or `savefig` to render
    each point's true label as text coloured by its predicted class.
    """

    def __init__(self, colors):
        self.colors = colors   # one colour per class, indexed by the prediction
        self.x_norm = None     # min-max normalised 2-D embedding, set by fit()

    def fit(self, data):
        """Embed `data` into 2-D with t-SNE and min-max normalise to [0, 1]."""
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=2021)
        X_tsne = tsne.fit_transform(data)
        x_min, x_max = X_tsne.min(0), X_tsne.max(0)
        self.x_norm = (X_tsne - x_min) / (x_max - x_min)

    def _draw(self, gt, pred):
        """Shared plotting body for show/savefig (deduplicated from the
        two identical copies in the original)."""
        plt.figure(figsize=(8, 8))
        for i in range(self.x_norm.shape[0]):
            # Text = ground-truth label, colour = predicted class.
            plt.text(self.x_norm[i, 0], self.x_norm[i, 1], str(gt[i]),
                     color=self.colors[pred[i]],
                     fontdict={'weight': 'bold', 'size': 8})
        plt.xticks([])
        plt.yticks([])

    def show(self, gt, pred):
        """Render the embedding in an interactive window."""
        self._draw(gt, pred)
        plt.show()

    def savefig(self, gt, pred, file_name):
        """Render the embedding and write it to `file_name`."""
        self._draw(gt, pred)
        plt.savefig(file_name)


class Metrics:
    """Binary-classification metrics accumulated through a confusion matrix.

    Usage: call `update` one or more times with predictions and labels,
    then `count` to derive TP/TN/FP/FN, then the metric methods.
    """

    def __init__(self, n_classes=2):
        self.n_classes = n_classes
        # confusion_matrix[label][prediction] — rows are ground truth.
        self.confusion_matrix = [[0] * n_classes for _ in range(n_classes)]
        self.labels = []
        self.preds = []
        self.ALL = None   # total number of counted samples
        self.TN = None
        self.FP = None
        self.FN = None
        self.TP = None

    def empty(self):
        """Reset every accumulator back to its initial state."""
        self.confusion_matrix = [[0] * self.n_classes for _ in range(self.n_classes)]
        self.labels = []
        self.preds = []

    def update(self, preds, labels):
        """Fold one batch into the confusion matrix."""
        for pred, label in zip(preds, labels):
            self.confusion_matrix[label][pred] += 1

    def count(self):
        """Derive TP/TN/FP/FN from the (binary) confusion matrix."""
        cm = np.array(self.confusion_matrix)
        self.ALL = np.sum(cm)
        self.TN, self.FP = cm[0, 0], cm[0, 1]
        self.FN, self.TP = cm[1, 0], cm[1, 1]

    def accuracy(self):
        """(TP + TN) / all samples."""
        return (self.TP + self.TN) / (self.TP + self.TN + self.FP + self.FN)

    def recall(self):
        """TP / (TP + FN) — fraction of real positives recovered."""
        return self.TP / (self.TP + self.FN)

    def precision(self):
        """TP / (TP + FP) — fraction of predicted positives that are real."""
        return self.TP / (self.TP + self.FP)



if __name__ == '__main__':
    # Raw string for the Windows path: the original non-raw literal relied on
    # invalid escape sequences (\P, \m, \d, \w), a SyntaxWarning since
    # Python 3.12 and a silent bug if the path ever contained a real escape.
    dataset = DataSet(r'F:\PycharmProjects\machine_leatning\datasets\winequalityN.csv')
    data, target, target_head, data_head = dataset.get_data()

    # Sanity-check the visualiser on the first and last 200 samples.
    X_train = np.concatenate([data[:200], data[-200:]])
    Y_train = np.concatenate([target[:200], target[-200:]])
    # Min-max normalise each feature to [0, 1] before embedding.
    X_train = (X_train - np.min(X_train, axis=0)) / (np.max(X_train, axis=0) - np.min(X_train, axis=0))
    vis = Visualization(['red', 'blue'])
    vis.fit(X_train)
    vis.show(Y_train, Y_train)

  • 0
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值