1. 概率
-
内涵:概率是一个事件发生的可能性大小的数字化度量!!!
-
归一性:[0, 1]
-
离散数据:
- 数个数
- 数据集中,某一类样本的先验概率
-
连续数据:
-
独立
- 在朴素贝叶斯中,**假设**特征之间互相条件独立(这是一个简化假设,现实数据中未必成立)!!!
-
条件概率
-
A和B任意情况:
-
求解方法:切分数据集!!!
-
A和B互相独立:
-
-
2. 朴素贝叶斯的前提假设
- 连续型变量的概率 →→ 概率密度函数的值
- 一个未知的连续型分布 →→ 看作高斯分布
- 把每个特征看作条件独立的
- 代码实现:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
class MYGaussianNB(object):
    """Hand-rolled Gaussian Naive Bayes classifier.

    Assumes features are conditionally independent given the class and
    that each feature follows a per-class Gaussian distribution.
    """

    def __init__(self):
        # All parameters are estimated in fit(); nothing to configure here.
        pass

    def fit(self, X, y):
        """Estimate class priors and per-class, per-feature Gaussian parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature matrix.
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X is not 2-D, y is not 1-D, or their lengths differ.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1 or X.shape[0] != y.shape[0]:
            raise ValueError("数据格式有误")
        self.n_features = X.shape[1]
        # np.unique gives a deterministic (sorted) label order, unlike
        # iterating over a Python set, so predictions are reproducible.
        self.labels = list(np.unique(y))
        self.y_probs = {}
        self.y_params = {}
        for label in self.labels:
            mask = y == label
            # Class prior: fraction of training samples carrying this label.
            self.y_probs[label] = mask.mean()
            X_label = X[mask]
            mus = X_label.mean(axis=0)
            sigmas = X_label.std(axis=0)
            # Guard against zero variance (a feature constant within a class),
            # which would otherwise divide by zero in the density.
            sigmas = np.where(sigmas > 0.0, sigmas, 1e-9)
            self.y_params[label] = {"mu": mus, "sigma": sigmas}
        return self

    def _log_gaussian(self, x, mu, sigma):
        """Element-wise log of the Gaussian pdf N(mu, sigma^2) at x."""
        return (
            -0.5 * np.log(2.0 * np.pi)
            - np.log(sigma)
            - (x - mu) ** 2 / (2.0 * sigma**2)
        )

    def predict(self, X):
        """Predict the most probable class for each row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels.

        Raises
        ------
        ValueError
            If X is not 2-D or its feature count differs from training.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError("数据格式有误")
        results = []
        for x in X:
            # Work in log space: multiplying many small densities underflows
            # to 0.0; summing logs is stable and preserves the argmax.
            scores = []
            for label in self.labels:
                log_prob = np.log(self.y_probs[label])
                log_prob += self._log_gaussian(
                    x,
                    self.y_params[label]["mu"],
                    self.y_params[label]["sigma"],
                ).sum()
                scores.append(log_prob)
            results.append(self.labels[int(np.argmax(scores))])
        return np.array(results)
# Demo: train and evaluate the hand-rolled classifier on the iris dataset.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

my_gnb = MYGaussianNB()
my_gnb.fit(X=X_train, y=y_train)
y_pred = my_gnb.predict(X=X_test)

# A bare `acc` expression only displays in a notebook cell; in a plain
# script it is a no-op, so report the accuracy explicitly.
acc = (y_pred == y_test).mean()
print(f"accuracy: {acc:.4f}")