#!/usr/bin/env python
# -*- coding:utf-8 -*-
# 原文章 https://blog.csdn.net/qq_38178543/article/details/107568803
# Data set: one row per observation. Values follow the order of `columns`;
# the last column (PlayTennis) is the class label the functions below read.
columns = [
    'outlook',
    'Temperature',
    'Humidity',
    'Wind',
    'PlayTennis',
]
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"],
]
def calculate_result(train_data):
    """Count how many rows carry each class label.

    For the first ten rows of the bundled data set this yields
    ``{'No': 4, 'Yes': 6}``.

    Args:
        train_data: Iterable of rows (sequences). The last element of each
            row is taken as its class label. Rows are not modified.

    Returns:
        dict mapping each label to the number of rows that carry it.

    Raises:
        IndexError: If a row is empty.
    """
    result_num = {}
    for line in train_data:
        label = line[-1]
        result_num[label] = result_num.get(label, 0) + 1
    return result_num
def calculate_attribute(train_data):
    """Count, for every attribute value, how often it occurs with each label.

    Args:
        train_data: Iterable of rows (sequences). The last element of each
            row is its class label, the preceding elements are attribute
            values. Rows are read only; the label is NOT removed from them.

    Returns:
        dict keyed by attribute value. Each entry is a dict holding one count
        per label that was seen together with that value, plus the key
        ``'index'`` with the column position at which the value was first
        seen. For the first ten rows of the bundled data set::

            {'Sunny': {'No': 3, 'Yes': 1, 'index': 0},
             'Hot': {'No': 2, 'index': 1, 'Yes': 1}, ...}

    Raises:
        IndexError: If a row is empty.

    Note:
        Entries are keyed by the value alone, so an identical value appearing
        in two different columns shares one entry (and keeps the first
        column's ``'index'``). A class label literally named ``'index'``
        would clash with the bookkeeping key.
    """
    attribute_dict = {}
    for line in train_data:
        # Read the label and slice off the features instead of `line.pop()`,
        # so the caller's rows (often shared with the full data set through a
        # shallow slice) keep their label column.
        result = line[-1]
        features = line[:-1]
        for index, value in enumerate(features):
            counts = attribute_dict.get(value)
            if counts is None:
                attribute_dict[value] = {result: 1, 'index': index}
            else:
                counts[result] = counts.get(result, 0) + 1
    return attribute_dict
def calculate_prior(train_data, result_num, attribute_dict):
    """Convert raw counts into add-one (Laplace) smoothed probabilities.

    Both ``result_num`` and ``attribute_dict`` are updated IN PLACE and the
    same objects are returned. As a side effect, the mapping of column index
    to the set of distinct values in that column is printed to stdout.

    Args:
        train_data: The training rows; only ``len(train_data)`` is used.
        result_num: dict of label -> row count, as built by
            ``calculate_result``. Replaced by
            ``(count + 1) / (len(train_data) + number_of_labels)``.
        attribute_dict: dict of attribute value -> ``{label: count, ...,
            'index': column}``, as built by ``calculate_attribute``. Each
            label count is replaced by ``(count + 1) / (label_count +
            distinct_values_in_column)``; labels never seen with a value are
            added with a count of 0 before smoothing.

    Returns:
        Tuple ``(result_num, attribute_dict)`` holding the smoothed class
        priors and the smoothed conditional probabilities.
    """
    # Group the distinct attribute values by the column they belong to; the
    # size of each group is the smoothing term for that column.
    column_num = {}
    for attribute, counts in attribute_dict.items():
        column_num.setdefault(counts['index'], set()).add(attribute)
    print(column_num)

    # Conditional probabilities must be computed while `result_num` still
    # holds raw label counts, i.e. before the priors are written below.
    for counts in attribute_dict.values():
        distinct_values = len(column_num[counts['index']])
        for result, label_total in result_num.items():
            seen = counts.get(result) or 0
            counts[result] = (seen + 1) / (label_total + distinct_values)

    denominator = len(train_data) + len(result_num)
    for label in result_num:
        result_num[label] = (result_num[label] + 1) / denominator
    return result_num, attribute_dict
def bayes_test(data, attribute_dict, result_num):
    """Classify each row of ``data`` and print the score of every label.

    For each row the score of a label is its prior multiplied by the
    conditional probability of every attribute value in the row; the label
    with the highest score wins (the first one on ties).

    Args:
        data: Iterable of rows (sequences). The last element of each row is
            treated as the true label and ignored; the preceding elements are
            the attribute values. Rows are not modified.
        attribute_dict: Smoothed conditional probabilities as returned by
            ``calculate_prior`` (attribute value -> {label: probability}).
        result_num: Smoothed class priors (label -> probability).

    Returns:
        list with the predicted label for each row, in input order. An entry
        is ``''`` when no label scored above zero.

    Raises:
        KeyError: If a row contains an attribute value that is missing from
            ``attribute_dict``.
    """
    predictions = []
    for line in data:
        # Copy the features rather than `line.pop()`, so the caller's rows
        # keep their label and can be evaluated again.
        features = line[:-1]
        print('预测:', features)
        max_value = 0
        choice = ''
        for result, prior in result_num.items():
            print(result, end=': ')
            p = prior
            for attribute in features:
                p *= attribute_dict[attribute][result]
            print(p, end=' ')
            if p > max_value:
                max_value = p
                choice = result
        print('预测结果 : ', choice, 'score', max_value)
        predictions.append(choice)
    return predictions
if __name__ == '__main__':
    # Train on the first ten rows and classify the rest of the data set.
    train_data = data[:10]
    # Count labels before calculate_attribute runs, so these counts do not
    # depend on whether that step consumes the label column of each row.
    result_num = calculate_result(train_data)
    attribute_dict = calculate_attribute(train_data)
    # Turn the raw counts into smoothed priors / conditional probabilities.
    result_num, attribute_dict = calculate_prior(train_data, result_num, attribute_dict)
    bayes_test(data[10:], attribute_dict, result_num)
# Naive Bayes - Python implementation
# (Footer of the source web page: latest recommended article published 2024-04-24 01:16:17)