Introduction
Implementing a naive Bayes classifier in Python.
Data Organization
We cannot write a naive Bayes classifier without first deciding how its data is organized.
Here I simply assume that each sample has three features, each taking the value 0 or 1, and that there are two classes, also labeled 0 and 1.
This is a very simple setting, and in fact a major simplification.
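Concretely, the training data looks like this; it is the same four-sample set the demo at the end uses:

import numpy as np

# four samples, three binary features each: shape (samples, features)
x = np.array([[1, 1, 1],
              [1, 1, 0],
              [0, 1, 0],
              [1, 0, 1]])
# one class label (0 or 1) per sample
y = np.array([[1], [0], [0], [1]])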
Also, where Prof. Zhou Zhihua's book says a Laplacian correction should be applied, I merely add a small constant instead.
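For reference, the book's actual Laplacian correction adds one to every count, so no estimated probability can be exactly zero. A minimal sketch of the two smoothed estimates (the helper names laplace_prior and laplace_likelihood are hypothetical; the code below keeps the small-constant shortcut instead):

# Laplacian correction as in the book (N = number of classes,
# Ni = number of values feature i can take; both are 2 here):
#   p(c)          = (|Dc| + 1) / (|D| + N)
#   p(xi = v | c) = (|Dc,xi=v| + 1) / (|Dc| + Ni)
def laplace_prior(class_count, total_count, num_classes=2):
    return (class_count + 1) / (total_count + num_classes)

def laplace_likelihood(state_count, class_count, num_states=2):
    return (state_count + 1) / (class_count + num_states)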
Code
"""
For understand naive bayes classfier
"""
import numpy as np
import scipy
class NaiveBayes(object):
    def __init__(self):
        """
        A naive Bayes classifier assumes that all attributes are
        conditionally independent given the class.
        Bayes' theorem:
            1. p(c|x) ∝ p(x|c)·p(c)
        The naive assumption:
            2. p(x|c) = p(x1|c)·p(x2|c)· ··· ·p(xd|c)
        """
        self._x = None
        self._y = None
        self.model = None
    def input_data(self, x, y):
        """
        `x` should be an array describing the samples,
        with shape (samples, features).
        `y` should be the class labels of the samples.
        """
        self._x = x
        self._y = y
    def solve(self, x):
        """
        Solve the model: given a new sample `x`, return the predicted class.
        """
        if self._x is None or self._y is None:
            print("No data here.")
            return None
        # suppose there are only two classes, 0 and 1
        classes = 2
        states = 2
        samples = self._x.shape[0]
        features = self._x.shape[1]
        # calc the prior p(c) as the fraction of samples in each class
        prior = []
        for class_index in range(classes):
            class_sample_num = np.sum(self._y == class_index)
            prior.append(class_sample_num / samples)
        # calc the likelihood p(x|c), assuming each feature
        # only has two states, 0 or 1
        likelihood = np.zeros((states, features, classes), dtype=np.float32)
        class_sample_number = np.zeros(classes)  # number of samples per class
        for class_index in range(classes):
            for sample_id in range(samples):
                # only count samples that belong to this class
                if self._y[sample_id] == class_index:
                    class_sample_number[class_index] += 1
                    # count how often each feature state occurs
                    for feature in range(features):
                        state = self._x[sample_id, feature]
                        likelihood[state, feature, class_index] += 1
        # turn the counts into conditional frequencies p(xi = state | c)
        for class_index in range(classes):
            likelihood[:, :, class_index] /= class_sample_number[class_index]
        # look up p(xi|c) for each feature state of the query sample
        pi_c0 = []
        pi_c1 = []
        for feature in range(features):
            data_state = x[feature]
            pi_c0.append(likelihood[data_state, feature, 0])
            pi_c1.append(likelihood[data_state, feature, 1])
        p_c0 = prior[0]
        p_c1 = prior[1]
        # adding 0.01 is a crude stand-in for the Laplacian correction:
        # it keeps a single zero frequency from wiping out the whole product
        for feature in range(features):
            p_c0 = p_c0 * (pi_c0[feature] + 0.01)
            p_c1 = p_c1 * (pi_c1[feature] + 0.01)
print("prediction for class 0:", p_c0)
print("prediction for class 1:", p_c1)
if p_c0 > p_c1:
return 0
else:
return 1
if __name__ == "__main__":
    classifier = NaiveBayes()
    x = np.array([[1, 1, 1], [1, 1, 0], [0, 1, 0], [1, 0, 1]])
    y = np.array([[1], [0], [0], [1]])
    classifier.input_data(x, y)
    c = classifier.solve([0, 0, 1])
    print(c)
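As a sanity check, the same data can be fed to scikit-learn's BernoulliNB, which implements exactly this kind of binary-feature naive Bayes (its alpha parameter is the proper Laplacian correction). A sketch, assuming scikit-learn is installed:

from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(alpha=1.0)   # alpha=1.0 means full Laplace smoothing
clf.fit(x, y.ravel())          # BernoulliNB expects a 1-D label array
print(clf.predict(np.array([[0, 0, 1]])))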
References
《机器学习》 (Machine Learning), Zhou Zhihua.