朴素贝叶斯也称为笨蛋贝叶斯,其思想简单,容易理解
简单理解就是假设在类别已知的条件下,数据的各个属性相互独立(属性条件独立性假设)
例如属性颜色有青绿,乌黑,浅白三种取值,则分别求出好瓜和坏瓜中每种颜色出现的概率。对于离散属性可以直接用频率估计其概率;对于连续属性,可以假设它在每一类中服从正态分布,先求出该类的均值和标准差,再根据取值落在均值附近的哪个标准差区间给出对应的概率,从而将其转化为离散值。
这样对所有的属性概率求过以后,对于一个任意样本,例:
A:(青绿,蜷缩,浊响,清晰,凹陷,硬滑,密度,含糖率)
有P(好|A) ∝ P(好)*P(青绿|好)*P(蜷缩|好)*P(浊响|好)*P(清晰|好)*P(凹陷|好)*P(硬滑|好)*P(密度|好)*P(含糖率|好)
P(坏|A) ∝ P(坏)*P(青绿|坏)*P(蜷缩|坏)*P(浊响|坏)*P(清晰|坏)*P(凹陷|坏)*P(硬滑|坏)*P(密度|坏)*P(含糖率|坏)
其中P(好),P(坏)是训练集中好瓜,坏瓜所占的比例(类先验概率),P(青绿|好)表示好瓜中颜色为青绿的概率,其余类推。
最后比较好瓜和坏瓜的概率,谁大就归为哪一类
代码如下:
# Watermelon data set. Columns 0-4 are categorical attributes with three
# possible values each, column 5 is a categorical attribute with two values,
# column 6 is density, column 7 is sugar content and column 8 is the label
# (1 = good melon, 0 = bad melon).
data = [
    ["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", 0.697, 0.460, 1],
    ["乌黑", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", 0.774, 0.376, 1],
    ["乌黑", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", 0.634, 0.264, 1],
    ["青绿", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", 0.608, 0.318, 1],
    ["浅白", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", 0.556, 0.215, 1],
    ["青绿", "稍蜷", "浊响", "清晰", "稍凹", "软粘", 0.403, 0.237, 1],
    ["乌黑", "稍蜷", "浊响", "稍糊", "稍凹", "软粘", 0.481, 0.149, 1],
    ["乌黑", "蜷缩", "浊响", "清晰", "稍凹", "硬滑", 0.437, 0.211, 1],
    ["乌黑", "稍蜷", "沉闷", "稍糊", "稍凹", "硬滑", 0.666, 0.091, 0],
    ["青绿", "硬挺", "清脆", "清晰", "平坦", "软粘", 0.243, 0.267, 0],
    ["浅白", "硬挺", "清脆", "模糊", "平坦", "硬滑", 0.245, 0.057, 0],
    ["浅白", "蜷缩", "浊响", "稍糊", "平坦", "软粘", 0.343, 0.099, 0],
    ["青绿", "稍蜷", "浊响", "稍糊", "凹陷", "硬滑", 0.639, 0.161, 0],
    ["浅白", "稍蜷", "沉闷", "稍糊", "凹陷", "硬滑", 0.657, 0.198, 0],
    ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘", 0.360, 0.370, 0],
    ["浅白", "蜷缩", "浊响", "稍糊", "平坦", "硬滑", 0.593, 0.042, 0],
    ["青绿", "蜷缩", "沉闷", "稍糊", "稍凹", "硬滑", 0.719, 0.103, 0],
]
# Split the data set: every third row (indices 0, 3, 6, ...) is held out for
# testing, all remaining rows are used for training.
data_test = [row for index, row in enumerate(data) if index % 3 == 0]
data_tr = [row for index, row in enumerate(data) if index % 3 != 0]
# Class counts on the training set: rows labelled 1 are good melons, every
# other row is counted as a bad melon.
gua = len(data_tr)
hao_gua = sum(1 for row in data_tr if row[8] == 1)
huai_gua = gua - hao_gua
def num_gua(list, shuxing, a, b, c):
    """Count rows per (attribute value, label) pair for a three-valued attribute.

    Args:
        list: Rows to scan. Each row is read at index ``shuxing`` (attribute
            value) and at index 8 (label, 1 = good melon, 0 = bad melon).
        shuxing: Column index of the attribute being counted.
        a: First attribute value.
        b: Second attribute value.
        c: Third attribute value.

    Returns:
        The tuple ``(na1, nb1, nc1, na0, nb0, nc0)``: counts of rows with
        value a/b/c and label 1, followed by value a/b/c and label 0. A row
        that matches none of the first five combinations is counted in
        ``nc0``.
    """
    na1 = nb1 = nc1 = 0
    na0 = nb0 = nc0 = 0
    for row in list:
        if row[shuxing] == a and row[8] == 1:
            na1 += 1
        elif row[shuxing] == b and row[8] == 1:
            nb1 += 1
        elif row[shuxing] == c and row[8] == 1:
            nc1 += 1
        elif row[shuxing] == a and row[8] == 0:
            na0 += 1
        # This branch used to test `a` a second time, which made it
        # unreachable: value-b bad rows were silently added to nc0.
        elif row[shuxing] == b and row[8] == 0:
            nb0 += 1
        else:
            nc0 += 1
    return na1, nb1, nc1, na0, nb0, nc0
def num_two(list, shuxing, a, b):
    """Count rows per (attribute value, label) pair for a two-valued attribute.

    Each row is read at index ``shuxing`` (attribute value) and index 8
    (label). Returns the tuple ``(na1, nb1, na0, nb0)``: value a / label 1,
    value b / label 1, value a / label 0, and finally every remaining row.
    """
    tally = [0, 0, 0, 0]
    for row in list:
        if row[shuxing] == a and row[8] == 1:
            slot = 0
        elif row[shuxing] == b and row[8] == 1:
            slot = 1
        elif row[shuxing] == a and row[8] == 0:
            slot = 2
        else:
            # Catch-all bucket, exactly like the trailing else of the chain.
            slot = 3
        tally[slot] += 1
    return tuple(tally)
def gailv(list, n_hao=None, n_huai=None):
    """Turn per-class value counts into class-conditional probabilities.

    ``list`` holds the good-melon counts in its first half and the bad-melon
    counts in its second half (the layout returned by ``num_gua`` and
    ``num_two``). The previous implementation always treated only the first
    two entries as good-melon counts, so for the six-element output of
    ``num_gua`` the third good-melon count was divided by the bad-melon total.

    Args:
        list: Sequence of counts, good-melon half first.
        n_hao: Number of good melons; defaults to the module-level
            ``hao_gua``.
        n_huai: Number of bad melons; defaults to the module-level
            ``huai_gua``.

    Returns:
        A list of the same length where each count is divided by the size of
        its own class.

    Raises:
        ZeroDivisionError: If a class used as denominator has no samples.
    """
    if n_hao is None:
        n_hao = hao_gua
    if n_huai is None:
        n_huai = huai_gua
    half = len(list) // 2
    good = [count / n_hao for count in list[:half]]
    bad = [count / n_huai for count in list[half:]]
    return good + bad
def gaosi(list):
    """Return ``(mean, standard deviation)`` of the numbers in ``list``.

    The standard deviation is the population form: the squared deviations
    are divided by the number of values, not by one less.
    """
    count = len(list)

    # Plain left-to-right accumulation for the mean.
    total = 0
    for value in list:
        total += value
    mean = total / count

    # Accumulate the squared deviations from the mean the same way.
    squared_dev = 0
    for value in list:
        squared_dev += (value - mean) ** 2
    variance = squared_dev / count

    return mean, variance ** 0.5
def proba(u, satand, x):
    """Map a continuous value to a coarse probability by its distance from the mean.

    Args:
        u: Mean of the attribute within one class.
        satand: Standard deviation of the attribute within that class.
        x: The value to score.

    Returns:
        0.682689 when ``x`` lies within one standard deviation of ``u``,
        0.271711 when it lies between one and two standard deviations away,
        and 0.0456 otherwise. These constants correspond to the share of a
        normal distribution found in each of those bands.
    """
    inner_low, inner_high = u - satand, u + satand
    if inner_low <= x <= inner_high:
        return 0.682689

    outer_low, outer_high = u - 2*satand, u + 2*satand
    in_lower_band = outer_low <= x < inner_low
    in_upper_band = inner_high <= x <= outer_high
    if in_lower_band or in_upper_band:
        return 0.271711

    return 0.0456
def product(list):
    """Return the product of all items in ``list`` (1 for an empty sequence)."""
    running = 1
    for factor in list:
        running *= factor
    return running
# For each three-valued attribute: count the training rows per
# (value, label) pair, then convert the counts to probabilities.

# Colour (column 0)
color = num_gua(data_tr, 0, "青绿", "乌黑", "浅白")
P_color = gailv(color)

# Curl (column 1)
square = num_gua(data_tr, 1, "蜷缩", "稍蜷", "硬挺")
P_quan = gailv(square)

# Sound (column 2)
voice = num_gua(data_tr, 2, "浊响", "沉闷", "清脆")
P_voice = gailv(voice)

# Pattern (column 3)
flower = num_gua(data_tr, 3, "清晰", "稍糊", "模糊")
P_flower = gailv(flower)

# Melon base (column 4)
base = num_gua(data_tr, 4, "凹陷", "稍凹", "平坦")
P_base = gailv(base)
# Touch (column 5), the only two-valued attribute.
# feel is (na1, nb1, na0, nb0): the first two counts belong to good melons
# and the last two to bad melons. The earlier loop divided only feel[0] by
# hao_gua, so the good-melon count for "软粘" was normalised by the number of
# bad melons.
feel = num_two(data_tr, 5, "硬滑", "软粘")
P_feel = [count / hao_gua for count in feel[:2]] + [count / huai_gua for count in feel[2:]]
# Density (column 6): collect the training values per class, then estimate
# the mean and standard deviation of each class.
midu_hao = [row[6] for row in data_tr if row[8] == 1]
midu_huai = [row[6] for row in data_tr if row[8] != 1]
mi_u_hao, mi_satand_hao = gaosi(midu_hao)
mi_u_huai, mi_satand_huai = gaosi(midu_huai)
# Sugar content (column 7): collect the training values per class, then
# estimate the mean and standard deviation of each class.
sugur_hao = [row[7] for row in data_tr if row[8] == 1]
sugur_huai = [row[7] for row in data_tr if row[8] != 1]
han_u_hao, han_satand_hao = gaosi(sugur_hao)
han_u_huai, han_satand_huai = gaosi(sugur_huai)
def train(list):
    """Predict the label of one sample: 1 (good melon) or 0 (bad melon).

    Despite its name, this function does not fit anything. It scores the
    sample with the module-level probability tables (``P_color``, ``P_quan``,
    ``P_voice``, ``P_flower``, ``P_base``, ``P_feel``) and the per-class
    mean / standard deviation of density and sugar content.

    Each class score is the class prior (``hao_gua / gua`` or
    ``huai_gua / gua``) multiplied by the probability of every attribute
    value given that class. The prior was previously left out of the product.

    Args:
        list: One sample in the column layout of ``data``: six categorical
            attributes (columns 0-5), density (column 6) and sugar content
            (column 7). Further columns are not read.

    Returns:
        1 if the good-melon score is at least the bad-melon score, else 0.
    """
    # (column, known values in table order, probability table). Every table
    # stores the good-melon probabilities first and the bad-melon
    # probabilities after them, in the same value order.
    categorical = (
        (0, ("青绿", "乌黑", "浅白"), P_color),
        (1, ("蜷缩", "稍蜷", "硬挺"), P_quan),
        (2, ("浊响", "沉闷", "清脆"), P_voice),
        (3, ("清晰", "稍糊", "模糊"), P_flower),
        (4, ("凹陷", "稍凹", "平坦"), P_base),
        (5, ("硬滑", "软粘"), P_feel),
    )

    # Start from the class priors so the scores are proportional to the
    # posterior P(class | sample) rather than to the likelihood alone.
    data_hao = [hao_gua / gua]
    data_huai = [huai_gua / gua]

    for column, values, table in categorical:
        n_values = len(values)
        value = list[column]
        # A value outside the known set falls back to the last category,
        # matching the trailing else of the earlier if/elif ladders.
        k = values.index(value) if value in values else n_values - 1
        data_hao.append(table[k])
        data_huai.append(table[n_values + k])

    # Continuous attributes: density, then sugar content.
    data_hao.append(proba(mi_u_hao, mi_satand_hao, list[6]))
    data_huai.append(proba(mi_u_huai, mi_satand_huai, list[6]))
    data_hao.append(proba(han_u_hao, han_satand_hao, list[7]))
    data_huai.append(proba(han_u_huai, han_satand_huai, list[7]))

    hao = product(data_hao)
    huai = product(data_huai)
    # Ties go to the good-melon class; a single expression guarantees an int
    # is always returned.
    return 1 if hao >= huai else 0
def xunlian(list):
    """Classify one sample with ``train`` and print it with its verdict."""
    verdict = " 好瓜" if train(list) == 1 else " 坏瓜"
    print(list, verdict)


# Classify and print every held-out test sample.
for i in data_test:
    xunlian(i)