# 西瓜书课后题——第七章（贝叶斯分类器）

import numpy as np
import pandas as pd
import math

def loadData(path='./WaterMelon_3.0.xlsx'):
    """Load the spreadsheet and return the samples plus per-attribute value counts.

    NOTE(review): the ``def`` line of this routine was missing from the file
    (only its body and ``return`` survived), so the name ``loadData`` is a
    reconstruction -- confirm against the original script.

    Args:
        path: location of the Excel workbook to read.

    Returns:
        A ``(dataList, attrNum)`` tuple where ``dataList`` is a list with one
        dict per row (attribute name -> cell value) and ``attrNum`` maps every
        attribute name to the number of distinct values found in its column.
    """
    # The former ``encoding='gbk'`` keyword is not passed any more: recent
    # pandas releases reject it for read_excel.
    frame = pd.read_excel(path)
    Attributes = frame.columns[1:]          # attribute names; the first column is skipped
    values = np.array(frame)[:, 1:]         # same columns as Attributes

    # One dict per sample, keyed by attribute name.
    dataList = [dict(zip(Attributes, row)) for row in values]

    # A set keeps only distinct values, so its size is the number of values
    # the attribute can take.
    attrNum = {name: len(set(values[:, col]))
               for col, name in enumerate(Attributes)}
    return dataList, attrNum


# main() reads these two module-level names.
dataset, attrNum = loadData()

def getClassPrior(classname, classvalue, dataset, attrNum):
    """Return the Laplace-smoothed prior probability of a class value.

    Args:
        classname: key of the class attribute in each sample dict.
        classvalue: class label whose prior is requested.
        dataset: sequence of samples, each a mapping attribute name -> value.
        attrNum: mapping attribute name -> number of distinct values.

    Returns:
        ``(count + 1) / (len(dataset) + attrNum[classname])`` where ``count``
        is the number of samples whose ``classname`` equals ``classvalue``.
    """
    count = sum(1 for sample in dataset if sample[classname] == classvalue)
    return (count + 1) / (len(dataset) + attrNum[classname])

def getClassCondition(classname, classvalue, classCondname, classCondvalue,
                      dataset, attrNum, continuousAttrs=('密度', '含糖率')):
    """Return the class-conditional probability (or density) of an attribute value.

    Only the samples whose ``classCondname`` equals ``classCondvalue`` are used.

    Args:
        classname: name of the attribute being evaluated.
        classvalue: value of that attribute in the test sample.
        classCondname: key of the class attribute in each sample dict.
        classCondvalue: class label being conditioned on.
        dataset: sequence of samples, each a mapping attribute name -> value.
        attrNum: mapping attribute name -> number of distinct values
            (used only for discrete attributes).
        continuousAttrs: attribute names treated as continuous; the default
            keeps the two names that used to be hard-coded.

    Returns:
        For a continuous attribute, the Gaussian density of ``classvalue``
        using the mean and standard deviation of the in-class samples.
        Otherwise the Laplace-smoothed frequency
        ``(count + 1) / (in_class_count + attrNum[classname])``.
    """
    # Samples that belong to the conditioning class.
    inClass = [sample for sample in dataset
               if sample[classCondname] == classCondvalue]

    if classname in continuousAttrs:
        # Continuous attribute: evaluate a normal probability density.
        value = [sample[classname] for sample in inClass]
        mean = np.mean(value)
        delt = np.std(value)    # numpy default (ddof=0), i.e. population std
        # NOTE(review): with no in-class samples, or when all of them share one
        # value, ``delt`` is nan/0 and the result is not a usable density.
        return (1 / (math.sqrt(2 * math.pi) * delt)) * math.exp(
            -(classvalue - mean) ** 2 / (2 * delt ** 2))

    # Discrete attribute: relative frequency with Laplace smoothing.
    count = sum(1 for sample in inClass if sample[classname] == classvalue)
    return (count + 1) / (len(inClass) + attrNum[classname])

def main():
    """Score one test sample under every class label and print the scores.

    Reads the module-level ``dataset`` (list of sample dicts) and ``attrNum``
    (attribute name -> number of distinct values).  For each label found in
    the '好瓜' column the score is the class prior multiplied by the
    class-conditional probability of every attribute of the test sample.

    Returns:
        dict mapping each class label to its score, in order of first
        appearance in ``dataset``.
    """
    test1 = {'色泽':'青绿','根蒂':'蜷缩','敲声':'浊响','纹理':'清晰','脐部':'凹陷','触感':'硬滑',
             '密度':0.697,'含糖率':0.460}
    classname = '好瓜'

    # Every class label has to be scored, otherwise no decision can be made;
    # dict.fromkeys removes duplicates while keeping first-seen order.
    labels = list(dict.fromkeys(sample[classname] for sample in dataset))

    scores = {}
    for label in labels:
        score = getClassPrior(classname, label, dataset, attrNum)
        for attr, value in test1.items():
            score *= getClassCondition(attr, value, classname, label, dataset, attrNum)
        scores[label] = score

    # Previously the result was computed and then thrown away.
    print(*scores.values())
    return scores


if __name__ == '__main__':
    main()


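# Output recorded by the author for the script above (presumably the good-melon and bad-melon scores): 0.0218012464059     4.91583402142e-05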

import numpy as np
import pandas as pd

def loadData(path='./WaterMelon_3.0.xlsx'):
    """Load the spreadsheet, keeping a subset of its columns.

    The first column and the two columns just before the last one are
    dropped; the last column is kept.
    NOTE(review): the two dropped columns are presumably the continuous
    attributes -- confirm against the workbook.

    NOTE(review): the ``def`` line of this routine was missing from the file
    (only its body and ``return`` survived), so the name ``loadData`` is a
    reconstruction -- confirm against the original script.

    Args:
        path: location of the Excel workbook to read.

    Returns:
        A ``(dataList, attrNum)`` tuple where ``dataList`` is a list with one
        dict per row (attribute name -> cell value) and ``attrNum`` maps every
        kept attribute name to the number of distinct values in its column.
    """
    # The former ``encoding='gbk'`` keyword is not passed any more: recent
    # pandas releases reject it for read_excel.
    frame = pd.read_excel(path)
    columns = frame.columns
    Attributes = list(columns[1:-3]) + [columns[-1]]    # kept attribute names

    raw = np.array(frame)
    # raw[:, -1:] keeps the last column two-dimensional so it can be stacked.
    values = np.hstack((raw[:, 1:-3], raw[:, -1:]))

    # One dict per sample, keyed by attribute name.
    dataList = [dict(zip(Attributes, row)) for row in values]

    # A set keeps only distinct values, so its size is the number of values
    # the attribute can take.
    attrNum = {name: len(set(values[:, col]))
               for col, name in enumerate(Attributes)}
    return dataList, attrNum


# main() reads these two module-level names.
dataset, attrNum = loadData()

def getClassPrior(classname1, classvalue1, classname2, classvalue2, dataset, attrNum):
    """Return the Laplace-smoothed joint probability of two attribute values.

    Args:
        classname1: name of the first attribute (the caller passes the class).
        classvalue1: required value of the first attribute.
        classname2: name of the second attribute.
        classvalue2: required value of the second attribute.
        dataset: sequence of samples, each a mapping attribute name -> value.
        attrNum: mapping attribute name -> number of distinct values.

    Returns:
        ``(count + 1) / (len(dataset) + attrNum[classname1] * attrNum[classname2])``
        where ``count`` is the number of samples matching both values.
    """
    count = sum(
        1 for sample in dataset
        if sample[classname1] == classvalue1 and sample[classname2] == classvalue2
    )
    return (count + 1) / (len(dataset) + attrNum[classname1] * attrNum[classname2])

def getClassCondition(classname1, classvalue1, classname2, classvalue2,
                      classname, classvalue, dataset, attrNum):
    """Return the Laplace-smoothed probability of a value given two other values.

    Args:
        classname1: name of the first conditioning attribute.
        classvalue1: required value of the first conditioning attribute.
        classname2: name of the second conditioning attribute.
        classvalue2: required value of the second conditioning attribute.
        classname: name of the attribute being evaluated.
        classvalue: value of that attribute in the test sample.
        dataset: sequence of samples, each a mapping attribute name -> value.
        attrNum: mapping attribute name -> number of distinct values.

    Returns:
        ``(count + 1) / (count_ + attrNum[classname])`` where ``count_`` is the
        number of samples matching both conditioning values and ``count`` is
        how many of those also have ``classname`` equal to ``classvalue``.
    """
    # Samples that satisfy both conditioning values.
    matched = [
        sample for sample in dataset
        if sample[classname1] == classvalue1 and sample[classname2] == classvalue2
    ]
    count = sum(1 for sample in matched if sample[classname] == classvalue)
    return (count + 1) / (len(matched) + attrNum[classname])

def main():
    """Score one test sample under every class label and print the scores.

    Reads the module-level ``dataset`` (list of sample dicts) and ``attrNum``
    (attribute name -> number of distinct values).  For each class label the
    score is a sum with one term per attribute of the test sample: that
    attribute's joint probability with the class, multiplied by the
    probability of every test attribute given the class and that attribute.
    NOTE(review): this looks like the AODE (averaged one-dependent estimator)
    formulation -- confirm.

    Returns:
        dict mapping each class label to its score, in order of first
        appearance in ``dataset``.
    """
    test1 = {'色泽':'青绿','根蒂':'蜷缩','敲声':'浊响','纹理':'清晰','脐部':'凹陷','触感':'硬滑'}
    classname = '好瓜'

    # Every class label has to be scored, otherwise no decision can be made;
    # dict.fromkeys removes duplicates while keeping first-seen order.
    labels = list(dict.fromkeys(sample[classname] for sample in dataset))

    scores = {}
    for label in labels:
        total = 0
        for parent, parentValue in test1.items():
            term = getClassPrior(classname, label, parent, parentValue, dataset, attrNum)
            for attr, value in test1.items():
                term *= getClassCondition(parent, parentValue, classname, label,
                                          attr, value, dataset, attrNum)
            total += term
        scores[label] = total

    # Previously the accumulated sum was computed and then thrown away.
    print(*scores.values())
    return scores


if __name__ == '__main__':
    main()


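# Output recorded by the author for the script above (presumably the good-melon and bad-melon scores): 0.01867093430879439     0.00040009144947416545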

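# NOTE(review): the entries below look like scraped web-page residue (date/count pairs), not program code.
# 10-17 2454
#
# 04-14 5052
# 07-06 94
# 06-03 850
# 06-15 271
# 06-13 361
# 02-22 3732
# 10-13 2086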