以下代码是本人在学习西瓜书时,花费两个礼拜根据其原理原创编写的;若需转载请先联系本人,谢谢!
自我研究模拟代码
附上离散类别截图
数据截图:
运行截图:
Bayes_config.py
"""
Filename: Bayes_config
Author: kdd_zyx
Description: 机器学习 - 朴素贝叶斯
Datas:kdd - 随机划分
Start: 2018.10.11
End:
"""
import random as r
from functools import reduce
text_tnum = 10545 # Amount of data to draw (used as the upper slice bound on the loaded rows)
text_pret = 0.255 # Test-set fraction (the attribute/header row is excluded from the calculation)
start = 1 # Index of the first data row
end = -1 # Index of the last element
def divi(x, y):
    """Divide ``x`` by ``y``, yielding 0 instead of raising when ``y`` is zero."""
    return x / y if y != 0 else 0
def radf(text_addr, text_splt):
    """Read a delimited text file into a list of rows.

    Args:
        text_addr: Path of the file to read (decoded as UTF-8).
        text_splt: Separator string used to split each line into fields.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        field strings obtained by splitting that line on ``text_splt``.
    """
    with open(text_addr, 'r', encoding='UTF-8') as f:
        # The ``with`` block closes the file, so no explicit close() is needed.
        # Blank lines are skipped: they would otherwise become one-field
        # records whose "class label" is the empty string.
        return [
            line.rstrip('\n').split(text_splt)
            for line in f
            if line.strip()
        ]
# 引入训练集、验证集、测试集
# Load the training / validation / test data.
def Lead_dataset():
    """Load 'Car_Data.txt' and return its first row followed by shuffled records.

    The first row of the file is kept in position 0; rows ``start`` up to
    (but not including) ``text_tnum`` are shuffled in random order and
    placed after it.

    Returns:
        The combined list of rows, or ``None`` when loading fails (the error
        is printed instead of being raised).
    """
    try:
        rows = radf('Car_Data.txt', ',')
        header = rows[0]
        records = rows[start:text_tnum]
        r.shuffle(records)
        return [header] + records
    except Exception as err:
        print('Error:', err)
        return None
# Calls Lead_dataset() and discards the returned list.
# NOTE(review): appears to run at module level (i.e. on import), presumably
# as a leftover smoke test — confirm whether it is still needed.
Lead_dataset()
Bayes.py
"""
Filename: Bayes
Author: kdd_zyx
Description: 机器学习 - 朴素贝叶斯
Datas:kdd - 随机划分
Start: 2018.10.11
End:
"""
from Bayes_config import *
class Bayes(object):
    """Naive Bayes classifier for categorical attributes with Laplace correction.

    Every record is a list of attribute values whose element at index ``end``
    is the class label. ``Main_data`` loads the data, splits it into a
    training part and a test part, and prints the prediction results.
    """

    def __init__(self):
        """Create an empty classifier; its state is populated by ``Main_data``."""
        pass

    def Lead_attrest(self):
        """Summarise the columns of ``self.tra_dataset``.

        Returns:
            A list with one entry per column. Every column except the last is
            replaced by the number of distinct values it contains; the entry
            at index ``end`` is the sorted list of distinct class labels.
        """
        columns = [[] for _ in self.tra_dataset[0]]
        for row in self.tra_dataset:
            for idx, value in enumerate(row):
                columns[idx].append(value)
        for idx in range(len(columns) - 1):
            columns[idx] = len(set(columns[idx]))
        # Sorted so that class indices (and therefore tie-breaking in
        # Foresort) do not depend on set iteration order between runs.
        columns[end] = sorted(set(columns[end]))
        return columns

    def Main_data(self):
        """Load the dataset, split it into train/test parts and run ``Forecast``.

        Does nothing when the dataset could not be loaded or when
        ``text_pret`` is 0 (no test records requested).
        """
        dataset = Lead_dataset()
        # Lead_dataset() prints its own error and returns None on failure;
        # bail out instead of failing later with an unrelated TypeError.
        if dataset is None or text_pret == 0:
            return
        self.tex_num = int(len(dataset[start:]) * text_pret + 1)
        self.tra_dataset = dataset[start:-self.tex_num]
        self.tex_dataset = dataset[-self.tex_num:]
        self.tra_num = len(self.tra_dataset)
        self.pro_dataset = dataset[0]
        self.prd_dataset = self.Lead_attrest()
        self.Forecast(self.tra_dataset, self.tex_dataset)

    def Forecast(self, tra_dataset, tex_dataset):
        """Predict the class of every test record and print an accuracy report.

        Args:
            tra_dataset: Training records used to estimate the probabilities.
            tex_dataset: Records to classify; their element at index ``end``
                is compared with the prediction.
        """
        verify_num = 0
        end_sort = self.Cata_sort(tra_dataset)
        classes = self.prd_dataset[end]
        for data in tex_dataset:
            predicted = classes[self.Foresort(data, end_sort)]
            print(data[end], predicted)
            if data[end] == predicted:
                verify_num += 1
        # Report on the data actually passed in rather than on cached counters.
        tex_total = len(tex_dataset)
        accury = round(divi(verify_num, tex_total), 3)
        print()
        print('Training dataset number:', len(tra_dataset))
        print('Texting dataset number:', tex_total)
        print('The correct number:', verify_num)
        print('Accuracy:', accury)

    def Foresort(self, data, end_sort):
        """Return the index of the most probable class for one record.

        Args:
            data: The record to classify; its last element is ignored.
            end_sort: Training records grouped by class, as produced by
                ``Cata_sort``.

        Returns:
            The index (into ``self.prd_dataset[end]``) of the class with the
            highest score; the first such index on ties.
        """
        attr_total = len(data) - 1
        Pc = []
        for members in end_sort:
            sortn = len(members)
            sortP = []
            for cta_num in range(attr_total):
                matches = sum(
                    1 for row in members if row[cta_num] == data[cta_num]
                )
                # Laplace-corrected conditional probability:
                # (|D_c,x| + 1) / (|D_c| + N_i), where N_i is the number of
                # distinct values of attribute i.
                sortP.append(
                    divi(matches + 1, sortn + self.prd_dataset[cta_num])
                )
            Pc.append(self.Laplacian(sortn, sortP))
        return Pc.index(max(Pc))

    def Laplacian(self, sortn, sortP):
        """Combine the Laplace-corrected class prior with the attribute terms.

        Args:
            sortn: Number of training records belonging to the class.
            sortP: Conditional probabilities of the record's attribute values
                given the class.

        Returns:
            The prior multiplied by the product of ``sortP``.
        """
        # Prior is (|D_c| + 1) / (|D| + N) with N the number of classes; the
        # previous denominator added |D| a second time instead of N.
        class_total = len(self.prd_dataset[end])
        new_clas = divi(sortn + 1, len(self.tra_dataset) + class_total)
        # Initial value 1 keeps the product defined when there are no attributes.
        return new_clas * reduce(lambda x, y: x * y, sortP, 1)

    def Cata_sort(self, tra_dataset):
        """Group training records by class label.

        Args:
            tra_dataset: Records whose element at index ``end`` is one of the
                labels in ``self.prd_dataset[end]``.

        Returns:
            A list of lists, one per class, in the order of
            ``self.prd_dataset[end]``.
        """
        classes = self.prd_dataset[end]
        sort = [[] for _ in classes]
        for data in tra_dataset:
            sort[classes.index(data[end])].append(data)
        return sort
if __name__ == '__main__':
    # Script entry point: build a classifier and run the full
    # load / split / evaluate pipeline.
    Bayes().Main_data()