# -*- coding: UTF-8 -*-
"""
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@File : naivebayes.py
@Contact : ffzzyy@126.com
@License : (C)Copyright 2017-2019
@Author : ffzzyy
@Version : 0.1
@Modify Time : 2019/3/15 15:04
@Description : Naive Bayes classifier for categorical features
"""
import numpy as np
import pandas as pd
from functools import reduce
def acount(x1, x2, axis=0):
    """
    Count, position by position, how often each entry of ``x2`` occurs in ``x1``.

    With ``axis=0`` (the default) ``x2[i]`` is counted within column ``i`` of
    ``x1``; with ``axis=1`` it is counted within row ``i``.

    :param x1: 2-D numpy ndarray to search in
    :param x2: sequence of values; entry ``i`` is matched against column
        (``axis=0``) or row (``axis=1``) ``i`` of ``x1``
    :param axis: counting direction, 0 for per-column (default), 1 for per-row
    :return: float ndarray of length ``len(x2)`` holding the match counts
    :raises ValueError: if ``axis`` is neither 0 nor 1

    Examples
    --------
    >>> x1 = np.array([[1, "s"], [2, "l"], [1, "l"], [3, "m"]])
    >>> x2 = np.array([2, "l"])
    >>> acount(x1, x2)
    array([1., 2.])
    """
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (per column) or 1 (per row), got %r" % (axis,))

    # Transposing turns the per-row case into the per-column one, so a single
    # loop serves both directions.
    lanes = x1 if axis == 0 else x1.T

    result = np.zeros(len(x2))
    for i, value in enumerate(x2):
        # Element-wise comparison yields a boolean mask; its non-zero count
        # is the number of matches.
        result[i] = np.count_nonzero(lanes[:, i] == value)
    return result
class NBClassifier:
    """
    Naive Bayes classifier for categorical (discrete-valued) features.

    ``fit`` stores the training data and computes the class priors.
    ``predict`` scores every sample against every class and returns the
    unnormalised posteriors ``P(x | c) * P(c)``; it does not pick a label.
    """

    def __init__(self):
        self._x_train = None  # 2-D feature array, set by fit()
        self._y_train = None  # 1-D label array, set by fit()
        self._class = None  # sorted list of distinct labels; None until fit()
        self._prior_proba = {}  # dict: label -> prior probability P(c)

    def _set_class(self):
        """
        Store the sorted list of distinct training labels in ``self._class``.

        :return: None
        """
        self._class = sorted(np.unique(self._y_train))

    def _set_prior_proba(self):
        """
        Compute the prior probability of every class as its relative
        frequency among the training labels.

        :return: None
        """
        total = len(self._y_train)
        for label in self._class:
            matches = np.count_nonzero(self._y_train == label)
            self._prior_proba[label] = matches / total

    def _get_condi_proba(self, x, c):
        """
        Compute the class-conditional likelihood P(x | c) under the naive
        independence assumption (product of per-feature probabilities).

        NOTE: the Laplace correction is applied only to features whose count
        within class ``c`` is zero; non-zero counts keep their plain relative
        frequency, so the per-feature estimates need not sum to one.

        :param x: one sample, a sequence with one value per feature
        :param c: class label to condition on
        :return: the product of the per-feature conditional probabilities
        """
        # Rows of the training set that belong to class c.
        row_index = np.flatnonzero(self._y_train == c)
        rows = self._x_train[row_index]
        n_rows = len(row_index)

        condi_proba = acount(rows, x) / n_rows

        # A single zero would wipe out the whole product, so replace each
        # zero estimate with 1 / (class size + number of distinct values the
        # feature takes in the full training set).
        for i, value in enumerate(condi_proba):
            if value == 0:
                n_values = len(np.unique(self._x_train[:, i]))
                condi_proba[i] = 1 / (n_rows + n_values)

        # The initial value makes the product well defined (1.0) even for a
        # sample without features.
        return reduce(lambda acc, p: acc * p, condi_proba, 1.0)

    def fit(self, x_train, y_train):
        """
        Train the classifier: store the data, then derive the class list and
        the class priors. Any state from a previous call is discarded.

        :param x_train: 2-D array-like of shape (n_samples, n_features)
        :param y_train: array-like with one label per sample
        :return: None
        :raises ValueError: if ``x_train`` is not 2-D or the number of samples
            and labels differ
        """
        # Non-array input is converted with dtype=object so that mixed
        # int/str features keep their Python types instead of being
        # stringified by numpy.
        if isinstance(x_train, np.ndarray):
            features = x_train
        else:
            features = np.array(x_train, dtype=object)
        labels = np.asarray(y_train).ravel()

        if features.ndim != 2:
            raise ValueError("x_train must be 2-D (n_samples, n_features)")
        if len(features) != len(labels):
            raise ValueError(
                "x_train has %d samples but y_train has %d labels"
                % (len(features), len(labels))
            )

        self._x_train = features
        self._y_train = labels
        self._class = None
        self._prior_proba = {}  # dict: label -> prior probability P(c)
        self._set_class()
        self._set_prior_proba()

    def _predict(self, x):
        """
        Score a single sample against every known class.

        :param x: one sample, a sequence with one value per feature
        :return: dict mapping each class label to P(x | c) * P(c)
        """
        return {
            label: self._get_condi_proba(x, label) * self._prior_proba[label]
            for label in self._class
        }

    def predict(self, X_predict):
        """
        Score every sample in ``X_predict``.

        :param X_predict: iterable of samples (ndarray-like), one sequence of
            feature values per sample
        :return: numpy array with one element per sample; each element is the
            dict of per-class scores produced by ``_predict``
        :raises RuntimeError: if called before ``fit``
        """
        if self._class is None:
            raise RuntimeError("NBClassifier.predict called before fit")
        return np.array([self._predict(x) for x in X_predict])
def load_data(file_path, encoding='cp936'):
    """
    Load a training set from a CSV file.

    The last column is taken as the labels; all preceding columns are taken
    as the features.

    :param file_path: path (or file-like object) handed to ``pandas.read_csv``
    :param encoding: text encoding of the file; defaults to ``'cp936'``
    :return: tuple ``(x_train, y_train)`` where ``x_train`` is the 2-D
        feature array and ``y_train`` the 1-D label array
    """
    df = pd.read_csv(file_path, encoding=encoding)
    csv_arr = np.array(df)
    # The last column is y_train; everything before it is x_train.
    y_train = csv_arr[:, -1]
    x_train = csv_arr[:, :-1]
    return x_train, y_train
def main():
    """
    Demo entry point: train the classifier on two CSV data sets and print,
    for each one, the class list, the class priors and the scores of one
    sample.
    """

    def train_and_report(features, labels, sample):
        # Fit a fresh classifier, then print its classes, priors and the
        # scores for the single given sample.
        model = NBClassifier()
        model.fit(features, labels)
        print(model._class)
        print(model._prior_proba)
        print(model.predict([sample]))

    features, labels = load_data("贝叶斯测试.csv")
    train_and_report(features, labels, [2, 's'])

    features, labels = load_data("西瓜数据.csv")
    # The watermelon data set needs its first column removed.
    features = np.delete(features, 0, axis=1)
    train_and_report(features, labels, ['浅白','蜷缩','浊响','模糊','平坦','硬滑'])
# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()