1. 朴素贝叶斯法
朴素贝叶斯法是基于贝叶斯定理与特征条件独立假设的分类方法。简单来说,朴素贝叶斯分类器假设样本每个特征与其他特征都不相关。
朴素贝叶斯的一般过程
收集数据:可以使用任何方式
准备数据:需要数值型或是布尔型数据
分析数据:有大量特征时,绘制特征作用不大,此时使用直方图效果更好
训练算法:计算不同的独立特征的条件概率
测试算法:计算错误率
使用算法:文档分类
原理
主要是运用贝叶斯定理
2. 举例
使用python编写朴素贝叶斯算法
要求
– 算法可以使用训练集进行训练模型
– 算法可以对预测集进行预测
– 得到预测结果的评估结果, 使用accuracy、recall以及precision等指标来进行衡量
– 数据集使用西瓜书上的西瓜数据
– 算法的实现不能直接调库,可以使用numpy库来辅助实现
from sklearn.model_selection import train_test_split
import numpy as np
class NaiveBayes:
    """Categorical naive Bayes classifier for the binary watermelon task.

    Class labels are hard-coded as "是" (positive) / "否" (negative) to match
    the watermelon dataset. Both the class priors and the per-feature
    conditional probabilities use Laplace smoothing ("拉普拉斯修正") so that
    feature values unseen in one class do not zero out the whole product.
    """

    def fit(self, train, targets, tests):
        """Estimate probabilities from (train, targets) and classify tests.

        Args:
            train: 2-D array-like of categorical feature values, one row per
                training sample, one column per feature.
            targets: iterable of class labels ("是"/"否") aligned with the
                rows of train.
            tests: iterable of feature rows to classify.

        Returns:
            List of predicted labels ("是"/"否"), one per row of tests.
        """
        train = np.asarray(train)
        targets = list(targets)
        # Laplace-smoothed class priors: (count + 1) / (N + number_of_classes).
        p_y = {
            label: (targets.count(label) + 1) / (len(targets) + 2)
            for label in set(targets)
        }
        # Group the training rows by class.
        yes_rows = train[[i for i, t in enumerate(targets) if t == "是"]]
        no_rows = train[[i for i, t in enumerate(targets) if t == "否"]]

        result = []
        for test in tests:
            # BUGFIX: the likelihood products must restart at the prior for
            # every test sample; previously they accumulated across samples.
            p_yx = p_y["是"]
            p_nx = p_y["否"]
            for j in range(len(test)):
                # Laplace denominator: number of distinct values of feature j
                # taken from COLUMN j (the old code used row j by mistake).
                n_vals = len(set(train[:, j]))
                # BUGFIX: compare feature j against column j only; the old
                # code tested membership in the entire row, so a value from
                # one feature could spuriously match another feature.
                yes_count = sum(1 for row in yes_rows if row[j] == test[j])
                no_count = sum(1 for row in no_rows if row[j] == test[j])
                p_yx *= (yes_count + 1) / (len(yes_rows) + n_vals)
                p_nx *= (no_count + 1) / (len(no_rows) + n_vals)
            result.append("是" if p_yx > p_nx else "否")
        return result
def splitDataSet(train, targets):
    """Split the training samples into positive ("是") and negative ("否") sets.

    Args:
        train: numpy array of training rows (must support fancy indexing).
        targets: iterable of class labels aligned with the rows of train.

    Returns:
        Tuple (yes_train, no_train) of the rows labelled "是" and "否".
    """
    # BUGFIX: the original wrote `index[t] == index[t].append(i)` — a
    # comparison against None that only worked via the append side effect.
    index = {label: [] for label in set(targets)}
    for i, label in enumerate(targets):
        index[label].append(i)
    yes_train = train[index["是"]]
    no_train = train[index["否"]]
    return yes_train, no_train
def datatrans(data):
    """Parse a comma-separated text file object into label/data/target arrays.

    The first line is treated as the header; the last column holds the class
    label. Layout of the returned dict:
        "label":  header fields except the last (feature names),
        "data":   feature values, one row per sample,
        "target": class label of each sample.

    Args:
        data: an open text-mode file-like object (anything with readlines()).

    Returns:
        dict with keys "label", "data", "target" (numpy arrays of strings).

    Raises:
        ValueError: if the file contains no non-empty lines.
    """
    # BUGFIX: the loop variable no longer shadows the `data` parameter, and
    # rows are stacked directly instead of the column-wise np.c_ + transpose
    # dance; an empty file now raises ValueError instead of UnboundLocalError.
    rows = []
    for line in data.readlines():
        line = line.strip("\n")
        if not line:
            continue  # skip blank lines (e.g. trailing newline)
        rows.append(line.split(","))
    if not rows:
        raise ValueError("datatrans: input file is empty")
    arr = np.array(rows)
    return {"label": arr[0, :-1], "data": arr[1:, :-1], "target": arr[1:, -1]}
def anaylse(test_target, predict_result):
    """Count confusion-matrix cells for the binary labels "是"/"否".

    Args:
        test_target: ground-truth labels.
        predict_result: predicted labels, aligned with test_target.

    Returns:
        Tuple (TP, TN, FP, FN) where "是" is the positive class.
    """
    tp = tn = fp = fn = 0
    for i, actual in enumerate(test_target):
        predicted = predict_result[i]
        if actual == "是":
            if predicted == "是":
                tp += 1
            else:
                fn += 1
        elif predicted == "是":
            fp += 1
        else:
            tn += 1
    return tp, tn, fp, fn
if __name__ == "__main__":
    # Load the watermelon dataset; `with` guarantees the handle is closed
    # (the original leaked the open file object).
    with open("watermelon", "r", encoding="utf-8") as f:
        dic = datatrans(f)
    train_feature, test_feature, train_target, test_target = train_test_split(
        dic["data"], dic["target"], test_size=0.3
    )
    nb = NaiveBayes()
    result = nb.fit(train_feature, train_target, test_feature)
    print(result)
    print(test_target)
    TP, TN, FP, FN = anaylse(test_target, result)

    total = TP + TN + FP + FN
    acc_value = (TP + TN) / total if total else 0
    print("acc_value:", acc_value)
    # BUGFIX: recall and precision were swapped in the original.
    # recall    = TP / (TP + FN): fraction of true positives found.
    # precision = TP / (TP + FP): fraction of positive predictions correct.
    recall_value = TP / (TP + FN) if TP + FN else 0
    print("recall_value:", recall_value)
    precision_value = TP / (TP + FP) if TP + FP else 0
    print("precision_value:", precision_value)
其中,为了避免其他属性携带的信息被训练集中未出现的属性值“抹去”,在估计概率值时通常进行“平滑”,常用“拉普拉斯修正”。