朴素贝叶斯分类器(连续值)
某样本如下:
问题:身高170,体重130,鞋码42,请问是男是女?
当特征为连续值时,直接求条件概率就比较困难。假设各特征均服从正态分布,即身高、体重、鞋码均服从正态分布,其均值和标准差由样本算出,再根据正态分布的概率密度函数,算出某一特征取某个具体值时的概率密度。
实现求正态分布中某一值的概率密度,如下:
from pandas import DataFrame
from scipy import stats
# Step 1: build the training sample. Columns are height ('身高'),
# weight ('体重'), shoe size ('鞋码') and the gender label ('性别').
data = DataFrame(
    {
        '身高': [183, 182, 178, 175, 160, 165, 163, 168],
        '体重': [164, 170, 160, 140, 88, 100, 110, 120],
        '鞋码': [45, 44, 43, 40, 35, 37, 38, 39],
        '性别': ['男', '男', '男', '男', '女', '女', '女', '女'],
    }
)

# Mean and standard deviation of height over the rows labelled male.
male_heights = data.loc[data['性别'] == '男', '身高']
male_height_mean = male_heights.mean()
male_height_std = male_heights.std()

# Density of a normal distribution with those parameters at height 170.
x = 170
prob = stats.norm.pdf(x, male_height_mean, male_height_std)
print(prob)
求:身高170,体重130,鞋码42,请问是男是女?
#step1:设A1为身高170,A2为体重130,A3为鞋码42;B1为男,B2为女。导入数据
from pandas import DataFrame
from scipy import stats
# Gaussian naive Bayes on a tiny sample: decide whether a person with the
# query height / weight / shoe size below is more likely male or female.

# Step 1: build the training sample. Columns are height ('身高'),
# weight ('体重'), shoe size ('鞋码') and the gender label ('性别').
data = DataFrame({'身高': [183, 182, 178, 175, 160, 165, 163, 168],
                  '体重': [164, 170, 160, 140, 88, 100, 110, 120],
                  '鞋码': [45, 44, 43, 40, 35, 37, 38, 39],
                  '性别': ['男', '男', '男', '男', '女', '女', '女', '女']
                  })

# Feature values of the person to classify.
query_height = 170
query_weight = 130
query_shoesize = 42

# Split the sample by label once, then take the per-class mean and standard
# deviation of every feature (pandas .std() is the sample std, ddof=1).
male_rows = data[data['性别'] == '男']
female_rows = data[data['性别'] == '女']

male_height_mean = male_rows['身高'].mean()
male_height_std = male_rows['身高'].std()
famale_height_mean = female_rows['身高'].mean()
famale_height_std = female_rows['身高'].std()
male_weight_mean = male_rows['体重'].mean()
male_weight_std = male_rows['体重'].std()
# Fixed: the female weight statistics were previously computed from the
# male rows, which made p_a2_b2 identical to p_a2_b1.
famale_weight_mean = female_rows['体重'].mean()
famale_weight_std = female_rows['体重'].std()
male_shoesize_mean = male_rows['鞋码'].mean()
male_shoesize_std = male_rows['鞋码'].std()
famale_shoesize_mean = female_rows['鞋码'].mean()
famale_shoesize_std = female_rows['鞋码'].std()

# Step 2: class priors and class-conditional densities of each feature.
# stats.norm.pdf returns the normal density; loc is the mean, scale the
# standard deviation. A1/A2/A3 = height/weight/shoe size, B1 = male,
# B2 = female.
p_b1 = 1/2
p_b2 = 1/2
p_a1_b1 = stats.norm.pdf(x=query_height, loc=male_height_mean, scale=male_height_std)
p_a2_b1 = stats.norm.pdf(x=query_weight, loc=male_weight_mean, scale=male_weight_std)
p_a3_b1 = stats.norm.pdf(x=query_shoesize, loc=male_shoesize_mean, scale=male_shoesize_std)
p_a1_b2 = stats.norm.pdf(x=query_height, loc=famale_height_mean, scale=famale_height_std)
p_a2_b2 = stats.norm.pdf(x=query_weight, loc=famale_weight_mean, scale=famale_weight_std)
p_a3_b2 = stats.norm.pdf(x=query_shoesize, loc=famale_shoesize_mean, scale=famale_shoesize_std)

# Step 3: compare the unnormalised posteriors (prior times the product of
# the per-feature densities, i.e. the naive independence assumption).
p1 = p_a1_b1 * p_a2_b1 * p_a3_b1 * p_b1
p2 = p_a1_b2 * p_a2_b2 * p_a3_b2 * p_b2

# The message is built from the actual query values; the previous text
# described a different (discrete high/medium/medium) example.
query_desc = '当身高为{},体重为{},鞋码为{}时'.format(
    query_height, query_weight, query_shoesize)
if p1 > p2:
    print(query_desc + ',性别为{}'.format('男'), p1)
elif p1 == p2:
    print(query_desc + ',男生女生概率一样大', p1)
else:
    print(query_desc + ',性别为{}'.format('女'), p2)