Naive Bayes Classifier

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import datasets

Naive Bayes Classifier

  1. Suitable for very high-dimensional datasets.
  2. L: label
  3. F: feature
$$P(L \mid F) = \frac{P(F \mid L)\,P(L)}{P(F)}$$

Classification between two labels L1 and L2:

$$\frac{P(L_1 \mid F)}{P(L_2 \mid F)} = \frac{P(F \mid L_1)\,P(L_1)}{P(F \mid L_2)\,P(L_2)}$$
If P(L1|F) > P(L2|F), we classify the sample as L1; if P(L1|F) < P(L2|F), we classify it as L2. The evidence P(F) appears in both posteriors, so it cancels in the ratio.

Bayes formula

$$P(A, B) = P(A \mid B)\,P(B) = P(B \mid A)\,P(A)$$
$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$$

Gaussian Naive Bayes

The likelihood of features is assumed to be Gaussian:
$$P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma_y^2}}\, e^{-\frac{(x_i - \mu_y)^2}{2\sigma_y^2}}$$

import seaborn as sns; sns.set()
from sklearn.naive_bayes import GaussianNB

# Generate a 2-class, 2-feature toy dataset
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap='RdBu')
plt.show()

model = GaussianNB()
model.fit(X,y)

# Sample 2000 points uniformly over the region [-6, 8] x [-14, 4]
rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.3)
plt.axis(lim)
plt.show()
yprob = model.predict_proba(Xnew)
print(yprob[0:10].round(2))

[Figure: scatter plot of the two training blobs, colored by label]

[Figure: training points overlaid with the 2000 new points colored by predicted label, showing the curved decision boundary]

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Gaussian Naieve Bayes Classifier:%.0f %%"%(100*gnb.score(X_test,y_test)))
45 sample, number of error: 0
ACC of Gaussian Naieve Bayes Classifier:100 %

Bernoulli Naive Bayes

The likelihood of features is assumed to be Bernoulli:
$$P(X_i = k) = p^k (1-p)^{1-k}, \qquad k \in \{0, 1\}$$

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Bernoulli Naive Bayes Classifier:%.0f %%"%(100*bnb.score(X_test,y_test)))
45 sample, number of error: 34
ACC of Bernoulli Naieve Bayes Classifier:24 %

Improvement: binarize the features

# Midpoint of each feature's range (note: midpoint of max and min, not the median)
number_sample, number_feature = X.shape
feature_midpoint = [0.0] * number_feature
for i in range(number_feature):
    feature_midpoint[i] = (max(X[:, i]) + min(X[:, i])) / 2
print(feature_midpoint)
X_binarize = X.copy()   # copy, so the original feature matrix is not overwritten
for i in range(number_sample):
    for j in range(number_feature):
        if X_binarize[i, j] > feature_midpoint[j]:
            X_binarize[i, j] = 1
        else:
            X_binarize[i, j] = 0
X_binarize
[6.1, 3.2, 3.95, 1.3]
X_train, X_test, y_train, y_test = train_test_split(X_binarize, y, test_size=0.3, random_state=0)

bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Bernoulli Naive Bayes Classifier:%.0f %%"%(100*bnb.score(X_test,y_test)))
45 sample, number of error: 11
ACC of Bernoulli Naieve Bayes Classifier:76 %

Multinomial Naive Bayes

The likelihood of features is assumed to be multinomial:
$$P(X_1 = m_1, X_2 = m_2, \ldots, X_n = m_n) = \frac{N!}{m_1!\, m_2! \cdots m_n!}\, p_1^{m_1} p_2^{m_2} \cdots p_n^{m_n}, \qquad \sum_{i=1}^{n} m_i = N$$

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Multinomial Naive Bayes Classifier:%.0f %%"%(100*mnb.score(X_test,y_test)))
45 sample, number of error: 18
ACC of Multinomial Naieve Bayes Classifier:60 %

Example: classification of the breast cancer dataset with a Gaussian Naive Bayes model

import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.naive_bayes import GaussianNB


# Load the breast cancer dataset
data_cancer = datasets.load_breast_cancer()
X = data_cancer.data
y = data_cancer.target
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Instantiate and fit a Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# print("Number of misclassified samples out of %d test samples: %d" % (X_test.shape[0], (y_test != y_pred).sum()))
# Evaluate the fitted model on the test set with score
print("Test accuracy of the Gaussian Naive Bayes classifier: %.0f %%" % (100 * gnb.score(X_test, y_test)))
title = r"Learning Curves (Naive Bayes)"


# Learning-curve plotting function
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 20)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.9, 1.01), cv=cv, n_jobs=1)
plt.show()

Test accuracy of the Gaussian Naive Bayes classifier: 92 %

[Figure: learning curves (training score vs. cross-validation score) for Gaussian Naive Bayes on the breast cancer dataset]
