官方相关文档:Sklearn - 1.9. Naive Bayes
https://scikit-learn.org/stable/modules/naive_bayes.html
载入数据
from sklearn import datasets
# Load the iris dataset, then pull out the feature matrix and the class
# labels in one step.
iris = datasets.load_iris()
iris_feature, iris_target = iris.data, iris.target
1、为连续数据训练分类器
from sklearn.naive_bayes import GaussianNB
# A single new observation (four feature values) to classify.
new_ob = [[4, 4, 4, 0.4]]

# Gaussian naive Bayes with the estimator's default priors.
clf = GaussianNB()
model = clf.fit(X=iris_feature, y=iris_target)
model.predict(X=new_ob)  # recorded output: array([1])

# Same classifier, but with an explicit prior probability for each of the
# three classes.
clf = GaussianNB(priors=[0.25, 0.25, 0.5])
model = clf.fit(X=iris_feature, y=iris_target)
model.predict(X=new_ob)  # recorded output: array([1])
2、为离散数据和计数数据训练分类器
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Three tiny documents used as the training corpus.
text_data = np.array([
    'I love Brazil.',
    'Brazil is better',
    'France beats both',
])

# Build the bag-of-words representation (one column per vocabulary token).
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Dense feature matrix and the class label of each document.
features = bag_of_words.toarray()
target = np.array([0, 0, 1])

# Multinomial naive Bayes with an explicit prior for each of the two classes.
# The priors must form a probability distribution (sum to 1). The previous
# value [0.25, 0.25] summed to 0.5; [0.5, 0.5] keeps the same equal weighting
# between the classes, so predictions are unchanged.
clf = MultinomialNB(class_prior=[0.5, 0.5])

# Train the model.
model = clf.fit(features, target)

# New observation expressed as token counts over the fitted vocabulary
# (seven columns, matching `features`).
new_ob = [[0, 0, 0, 1, 0, 1, 0]]
model.predict(new_ob)  # recorded output: array([0])
3、为具有二元特征的数据,训练朴素贝叶斯分类器
from sklearn.naive_bayes import BernoulliNB
# Random binary feature matrix: 100 observations, 3 features.
features = np.random.randint(2, size=(100, 3))

# Random binary target vector, flattened to shape (100,).
target = np.random.randint(2, size=(100, 1)).ravel()
target
# Example output from one run (the RNG is not seeded, so values differ
# between runs):
# array([1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
#        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
#        1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
#        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
#        1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1])

# Bernoulli naive Bayes with an explicit prior for each of the two classes.
# The priors must sum to 1; the previous value [0.25, 0.5] summed to 0.75.
# [1/3, 2/3] is the same 1:2 ratio normalised into a valid distribution.
clf = BernoulliNB(class_prior=[1 / 3, 2 / 3])
model = clf.fit(features, target)

# For a uniform prior instead, leave class_prior unset and pass
# fit_prior=False. This estimator is only constructed here, not fitted.
model_uniform_prior = BernoulliNB(class_prior=None, fit_prior=False)
4、校准预测概率
from sklearn.calibration import CalibratedClassifierCV
# Uncalibrated base estimator.
clf = GaussianNB()

# Wrap the base estimator in a 2-fold cross-validated calibrator that uses
# sigmoid calibration, then fit the calibrator on the iris data.
clf_sigmoid = CalibratedClassifierCV(clf, method="sigmoid", cv=2)
clf_sigmoid.fit(X=iris_feature, y=iris_target)
# recorded repr: CalibratedClassifierCV(base_estimator=GaussianNB(), cv=2)

# Observation whose class probabilities are compared below.
new_ob = [[2.6, 2.6, 2.6, 0.4]]

# Calibrated class probabilities.
clf_sigmoid.predict_proba(X=new_ob)
# recorded output: array([[0.31859969, 0.63663466, 0.04476565]])

# Fit the plain Gaussian naive Bayes classifier and look at its raw
# probabilities for the same observation.
clf.fit(X=iris_feature, y=iris_target)
clf.predict_proba(X=new_ob)
# recorded output: array([[2.31548432e-04, 9.99768128e-01, 3.23532277e-07]])

# Calibrated probabilities once more, for side-by-side comparison.
clf_sigmoid.predict_proba(X=new_ob)
# recorded output: array([[0.31859969, 0.63663466, 0.04476565]])
2023-03-29