%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import datasets
Naive Bayes Classifier
- suitable for very high-dimensional datasets
- L: label
- F: feature
P(L|F)=\frac{P(F|L)P(L)}{P(F)}
Classification between two labels L1 and L2 reduces to comparing the posterior ratio:
\frac{P(L_1|F)}{P(L_2|F)}=\frac{P(F|L_1)P(L_1)}{P(F|L_2)P(L_2)}
if P(L1|F) > P(L2|F), we classify the sample as L1;
if P(L1|F) < P(L2|F), we classify the sample as L2.
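A toy numeric check of this decision rule (a sketch with made-up priors and likelihoods, not values from any dataset):
# Hypothetical priors and class-conditional likelihoods for one observed F
p_L1, p_L2 = 0.6, 0.4                    # P(L1), P(L2)
p_F_given_L1, p_F_given_L2 = 0.2, 0.5    # P(F|L1), P(F|L2)
posterior_ratio = (p_F_given_L1 * p_L1) / (p_F_given_L2 * p_L2)
print("classify as", "L1" if posterior_ratio > 1 else "L2")   # L2 here (ratio = 0.6)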
Bayes' formula
P(A,B)=P(A|B)P(B),\quad P(A,B)=P(B|A)P(A)
P(A|B)=\frac{P(B|A)P(A)}{P(B)}
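For a quick sanity check, the formula can be evaluated with assumed probabilities (the numbers below are illustrative only):
# Illustrative values: P(B|A), P(A), P(B) are assumptions, not data
p_B_given_A, p_A, p_B = 0.9, 0.1, 0.25
p_A_given_B = p_B_given_A * p_A / p_B
print(p_A_given_B)   # 0.36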
Gaussian Naive Bayes
The likelihood of features is assumed to be Gaussian:
P(x_i|y)=\frac{1}{\sqrt{2\pi\sigma_y^2}}\,e^{-\frac{(x_i-\mu_y)^2}{2\sigma_y^2}}
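As a minimal sketch, this likelihood can be written directly in NumPy and checked against scipy.stats.norm (assuming SciPy is available):
from scipy.stats import norm

def gaussian_likelihood(x, mu, sigma2):
    # P(x_i|y) with class mean mu and class variance sigma2
    return np.exp(-(x - mu) ** 2 / (2 * sigma2)) / np.sqrt(2 * np.pi * sigma2)

print(gaussian_likelihood(1.0, 0.0, 2.0))            # manual formula
print(norm.pdf(1.0, loc=0.0, scale=np.sqrt(2.0)))    # same value from SciPy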
import seaborn as sns; sns.set()
from sklearn.naive_bayes import GaussianNB
# Generate a two-class toy dataset and plot it
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap='RdBu')
plt.show()
# Fit a Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X, y)
# Sample new points spanning the plane to visualize the decision boundary
rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.3)
plt.axis(lim)
plt.show()
# Posterior class probabilities for the first ten new points
yprob = model.predict_proba(Xnew)
print(yprob[0:10].round(2))
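These probabilities can be reproduced from the fitted parameters; a sketch assuming scikit-learn >= 1.0, where the per-class means and variances are exposed as theta_ and var_ (older versions name the variances sigma_):
from scipy.stats import norm

def manual_posterior(model, x):
    # prior * product of per-feature Gaussian likelihoods, then normalize
    joint = model.class_prior_ * np.prod(
        norm.pdf(x, loc=model.theta_, scale=np.sqrt(model.var_)), axis=1)
    return joint / joint.sum()

print(manual_posterior(model, Xnew[0]).round(2))   # should match yprob[0]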
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Gaussian Naieve Bayes Classifier:%.0f %%"%(100*gnb.score(X_test,y_test)))
45 sample, number of error: 0
ACC of Gaussian Naieve Bayes Classifier:100 %
Bernoulli Naive Bayes
The likelihood of features is assumed to be Bernoulli:
P\{X_i=k\}=p^k(1-p)^{1-k}
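A one-line sketch of this pmf (k must be 0 or 1):
def bernoulli_pmf(k, p):
    return p ** k * (1 - p) ** (1 - k)

print(bernoulli_pmf(1, 0.3), bernoulli_pmf(0, 0.3))   # 0.3 0.7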
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Bernoulli Naive Bayes Classifier:%.0f %%"%(100*bnb.score(X_test,y_test)))
45 sample, number of error: 34
ACC of Bernoulli Naieve Bayes Classifier:24 %
Improvement: binarize the features
# Threshold each feature at the midpoint of its range
feature_midpoint = [0, 0, 0, 0]
number_sample, number_feature = X.shape
for i in range(number_feature):
    feature_midpoint[i] = (max(X[:, i]) + min(X[:, i])) / 2
print(feature_midpoint)
X_binarize = X.copy()   # copy so the original features stay intact
for i in range(number_sample):
    for j in range(number_feature):
        if X_binarize[i, j] > feature_midpoint[j]:
            X_binarize[i, j] = 1
        else:
            X_binarize[i, j] = 0
X_binarize
[6.1, 3.2, 3.95, 1.3]
X_train, X_test, y_train, y_test = train_test_split(X_binarize, y, test_size=0.3, random_state=0)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Bernoulli Naive Bayes Classifier:%.0f %%"%(100*bnb.score(X_test,y_test)))
45 sample, number of error: 11
ACC of Bernoulli Naieve Bayes Classifier:76 %
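The same binarization can be done without explicit loops; a vectorized sketch (note that BernoulliNB's own binarize parameter applies one scalar threshold to all features, so per-feature midpoints still need this preprocessing step):
midpoints = (X.max(axis=0) + X.min(axis=0)) / 2    # per-feature midrange
X_binarize_vec = (X > midpoints).astype(float)     # same result as the loops above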
Multinomial Naive Bayes
The likelihood of features is assumed to be multinomial:
P\{X_1=m_1,X_2=m_2,\dots,X_n=m_n\}=\frac{N!}{m_1!m_2!\cdots m_n!}p_1^{m_1}p_2^{m_2}\cdots p_n^{m_n}
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
print("%d sample, number of error: %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("ACC of Multinomial Naive Bayes Classifier:%.0f %%"%(100*mnb.score(X_test,y_test)))
45 sample, number of error: 18
ACC of Multinomial Naieve Bayes Classifier:60 %
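The weak score is expected: MultinomialNB models count features (e.g. word counts), which the continuous iris measurements are not. A sketch on synthetic count data with assumed event probabilities:
rng = np.random.RandomState(0)
# 100 samples per class, each a draw of 50 events over 3 categories
X_counts = np.vstack([rng.multinomial(50, [0.6, 0.3, 0.1], size=100),
                      rng.multinomial(50, [0.1, 0.3, 0.6], size=100)])
y_counts = np.array([0] * 100 + [1] * 100)
mnb_counts = MultinomialNB().fit(X_counts, y_counts)
print(mnb_counts.score(X_counts, y_counts))   # near-perfect on this toy data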
Example: classification of the breast cancer dataset with a Naive Bayes model
import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.naive_bayes import GaussianNB
# Load the breast cancer dataset
data_cancer = datasets.load_breast_cancer()
X = data_cancer.data
y = data_cancer.target
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Build and fit a Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# print("Number of misclassified samples out of %d test samples: %d" % (X_test.shape[0], (y_test != y_pred).sum()))
# Evaluate with score on the test set
print("Test accuracy of the Gaussian Naive Bayes classifier: %.0f %%" % (100 * gnb.score(X_test, y_test)))
title = r"Learning Curves (Naive Bayes)"
# 绘图函数
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 20)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.9, 1.01), cv=cv, n_jobs=1)
plt.show()
Test accuracy of the Gaussian Naive Bayes classifier: 92 %