import pandas as pd
def naive_bayes(train, test):
    """Classify one observation with a naive Bayes model for discrete features.

    Parameters
    ----------
    train : pd.DataFrame
        Training data: one column per discrete feature plus a 'class'
        column holding each row's label.
    test : pd.DataFrame
        A single-row DataFrame whose columns are the feature columns of
        *train* (the observation to classify).

    Returns
    -------
    dict
        Maps each class label c to its posterior probability
        P(c | x) ∝ P(c) * Π_i P(x_i | c), normalized to sum to 1.

    Raises
    ------
    ValueError
        If every class has zero likelihood for the observation, so the
        posterior cannot be normalized.
    """
    # Prior P(c) for every class as a relative frequency. normalize=True
    # avoids the fragile value_counts()/DataFrame(columns=['pro']) round-trip
    # of the original, which breaks on modern pandas ('count'/'proportion'
    # Series names).
    priors = train['class'].value_counts(normalize=True)
    features = list(test.columns)

    scores = {}
    for c, prior in priors.items():
        subset = train[train['class'] == c]
        # Likelihood Π_i P(x_i | c), each factor estimated as the per-class
        # relative frequency of the observed feature value.
        likelihood = 1.0
        for col in features:
            likelihood *= (subset[col] == test[col].values[0]).sum() / len(subset)
        # BUG FIX: the prior is applied exactly once per class. The original
        # multiplied P(c) into every feature factor, scoring P(c)^k * Π P(x_i|c)
        # instead of P(c) * Π P(x_i|c), which skews the posteriors.
        scores[c] = prior * likelihood

    total = sum(scores.values())
    if total == 0:
        # Degenerate case: the observation has zero estimated probability
        # under every class; normalizing would divide by zero.
        raise ValueError('all classes have zero likelihood for this observation')
    # Normalize the joint scores into posterior probabilities.
    return {c: s / total for c, s in scores.items()}
if __name__ == '__main__':
    # Worked example: 9 training rows with three discrete features (x, y, z)
    # and a class label in {0, 1, 2}; classify one unseen observation.
    train = pd.DataFrame(
        [[1, 1, 2, 0], [1, 1, 3, 1], [2, 1, 1, 1], [1, 2, 3, 2], [1, 1, 0, 2],
         [1, 2, 2, 1], [1, 3, 2, 2], [1, 0, 1, 1], [1, 2, 1, 0]],
        columns=['x', 'y', 'z', 'class'])
    test = pd.DataFrame([[1, 1, 3]], columns=['x', 'y', 'z'])
    # FIX: the original computed the posterior and silently discarded it;
    # show the result so the example actually demonstrates something.
    print(naive_bayes(train, test))
"""
{1: 0.6666666666666666, 2: 0.3333333333333333, 0: 0.0}
"""
所有变量都是离散变量情形的朴素贝叶斯模型简单代码实现。