一、实验目的
1. 加深对贝叶斯原理的理解
2. 熟悉 Python 的集成开发环境
3. 掌握贝叶斯分类器的实现——西瓜判别
二、实验环境
Windows + Python 3 及以上版本
一台装有集成开发环境(IDE)PyCharm 的计算机
三、实验内容
1.训练数据集
四、代码填写
#encoding=utf-8
import numpy as np
from math import sqrt
import csv
# Number of distinct values each discrete attribute can take, in column order.
# cond_attr_problity adds attr_num[i] to the denominator as the Laplace
# smoothing term for column i, and uses len(attr_num) as the number of
# leading columns that are treated as discrete attributes.
attr_num=[3,3,3,3,3,2]
"""
--------获取数据--------
"""
def loadCsv(filename):
    """Load a numeric CSV file into a 2-D NumPy array.

    The first row is treated as a header and skipped. Every remaining
    non-blank row is converted to floats, and the first column of each row
    is sliced off before the data is returned.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A float ``numpy.ndarray`` containing columns 1..end of every data
        row (one array row per CSV data row).

    Raises:
        ValueError: If a data cell cannot be converted to ``float``.
    """
    # ``with`` guarantees the file handle is closed; newline="" is the mode
    # the csv module expects so that it can handle line endings itself.
    with open(filename, "r", newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        # csv.reader yields [] for blank lines; drop them so rows stay
        # rectangular.
        rows = [[float(cell) for cell in row] for row in reader if row]
    result = np.array(rows)
    return result[:, 1:]
"""
--------计算先验--------
按拉普拉斯修正公式计算好瓜和坏瓜的先验概率:P(c) = (|D_c| + 1) / (|D| + N),其中 N = 2 为类别数
"""
def per_problity(datasets):
    """Return the Laplace-smoothed class priors.

    Each prior is ``(count of the class + 1) / (number of samples + 2)``,
    where 2 is the number of classes.

    Args:
        datasets: 2-D array of samples whose last column is the class
            label (1.0 for the positive class, 0.0 for the negative class).

    Returns:
        ``[pos_prob, neg_prob]``: the smoothed prior of the positive class
        followed by that of the negative class.
    """
    labels = datasets[:, -1]
    n_classes = 2
    # The divisor is the number of rows plus the number of classes.
    denominator = np.shape(datasets)[0] + n_classes
    pos_prob = 1.0 * (np.sum(labels == 1.0) + 1) / denominator
    neg_prob = 1.0 * (np.sum(labels == 0.0) + 1) / denominator
    return [pos_prob, neg_prob]


# Alias with the "pre" (prior) spelling so that either name can be called.
pre_problity = per_problity
"""
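--------根据下面的公式计算 P(x_i|c)--------
离散属性(拉普拉斯修正):P(x_i|c) = (|D_{c,x_i}| + 1) / (|D_c| + N_i),N_i 为第 i 个属性的取值数
连续属性:P(x_i|c) 取类别 c 上该属性的高斯概率密度(均值与标准差由该类样本估计)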
"""
def cond_attr_problity(datasets, testdata):
    """Compute the class-conditional likelihood of each test attribute.

    The first ``len(attr_num)`` columns are treated as discrete attributes
    and estimated by Laplace-smoothed frequency counts. All remaining
    attribute columns are treated as continuous and evaluated with a
    Gaussian density whose mean and standard deviation come from the
    training samples of each class.

    Args:
        datasets: 2-D array of training samples. The last column is the
            class label (1.0 positive, 0.0 negative); every other column
            is an attribute.
        testdata: 2-D array; only row 0 is read, as the sample to score.

    Returns:
        Array of shape ``(n_attributes, 2)``. ``[i, 0]`` is the likelihood
        of the sample's i-th attribute given the positive class and
        ``[i, 1]`` the likelihood given the negative class.
    """
    n_attrs = np.shape(datasets)[1] - 1
    cond_result = np.zeros([n_attrs, 2])

    # Split the training set by class label once and reuse both halves.
    pos_data = datasets[datasets[:, -1] == 1.0, :]
    neg_data = datasets[datasets[:, -1] == 0.0, :]
    class_subsets = (pos_data, neg_data)

    # Discrete attributes: (matches in class + 1) / (class size + N_i),
    # where N_i = attr_num[i] is the number of values attribute i can take.
    n_discrete = len(attr_num)
    for i in range(n_discrete):
        for col, class_data in enumerate(class_subsets):
            matches = np.sum(class_data[:, i] == testdata[0, i])
            cond_result[i, col] = (
                1.0 * (matches + 1) / (np.shape(class_data)[0] + attr_num[i])
            )

    # Continuous attributes: every attribute column after the discrete ones.
    for j in range(n_discrete, n_attrs):
        for col, class_data in enumerate(class_subsets):
            mean = np.mean(class_data[:, j])
            std = np.std(class_data[:, j])  # population std (ddof=0)
            cond_result[j, col] = 1.0 / (sqrt(2 * np.pi) * std) * np.exp(
                -1 * (testdata[0, j] - mean) ** 2 / (2 * std ** 2))
    return cond_result
"""
--------判断函数
"""
def classify_data(cond_result, pre_result):
    """Pick the class with the larger score, print it, and return it.

    Each class score is its prior multiplied by every per-attribute
    likelihood for that class. The positive class wins only when its score
    is strictly greater; a tie is reported as the negative class.

    Args:
        cond_result: 2-D array where column 0 holds the per-attribute
            likelihoods for the positive class and column 1 those for the
            negative class.
        pre_result: Sequence ``[positive_prior, negative_prior]``.

    Returns:
        The predicted label: ``'好瓜'`` for the positive class, otherwise
        ``'坏瓜'``. The label and both scores are also printed.
    """
    pos_result = pre_result[0]
    neg_result = pre_result[1]
    # Multiply sequentially so the scores keep the same rounding as a plain
    # running product over the rows.
    for i in range(np.shape(cond_result)[0]):
        pos_result *= cond_result[i, 0]
        neg_result *= cond_result[i, 1]

    label = '好瓜' if pos_result > neg_result else '坏瓜'
    print(label)
    print("好瓜分数:", pos_result)
    print("坏瓜分数:", neg_result)
    return label
"""
-------主函数-----
"""
def main():
    """Classify the sample in ``test.csv`` using ``watermelon3_0_En.csv``.

    Loads the training set and the test sample, computes the class priors
    and the per-attribute conditional likelihoods, then passes both to
    ``classify_data``, which prints the decision.
    """
    filename = 'watermelon3_0_En.csv'
    dataset = loadCsv(filename)
    testname = 'test.csv'
    testdata = loadCsv(testname)
    # The prior function is defined under the name ``per_problity``.
    pre_result = per_problity(dataset)
    cond_result = cond_attr_problity(dataset, testdata)
    classify_data(cond_result, pre_result)