定义
输入:训练数据集 $T=\left\{(x_1,y_1),(x_2,y_2),\cdots,(x_N,y_N)\right\}$,
其中 $x_i=(x_i^{(1)},x_i^{(2)},\cdots,x_i^{(n)})^T$,
$x_i^{(j)}$ 是第 $i$ 个样本的第 $j$ 个特征,
$x_i^{(j)} \in \left\{ a_{j1},a_{j2},\cdots,a_{jS_j} \right\}$,$a_{jl}$ 是第 $j$ 个特征可能取的第 $l$ 个值,
$j=1,2,\cdots,n$,$l=1,2,\cdots,S_j$,$y_i \in \left\{ c_1,c_2,\cdots,c_K \right\}$;实例 $x$;
输出:实例 $x$ 的分类。
(1)计算先验概率及条件概率
$$P(Y=c_k)=\dfrac{\sum_{i=1}^N I(y_i=c_k)}{N},\quad k=1,2,\cdots,K$$
$$P(X^{(j)}=a_{jl}\mid Y=c_k)=\dfrac{\sum_{i=1}^N I(x_i^{(j)}=a_{jl},\,y_i=c_k)}{\sum_{i=1}^N I(y_i=c_k)},\quad j=1,2,\cdots,n;\ l=1,2,\cdots,S_j;\ k=1,2,\cdots,K$$
(2)对于给定的实例 $x=(x^{(1)},x^{(2)},\cdots,x^{(n)})^T$,计算
$$P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k),\quad k=1,2,\cdots,K$$
(3)确定实例 $x$ 的类
$$y=\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k)$$
输入空间
$T=\left\{(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)\right\}$
import numpy as np
import time
def loadData(fileName, lines=60000):
    """Load a label-first MNIST CSV file into a binarised NumPy array.

    Data download link: https://download.csdn.net/download/nanxiaotao/89720991

    Every non-blank row must contain the label followed by 784 pixel
    intensities. Pixels are binarised (1 if the intensity is greater than
    128, otherwise 0) and the label is moved to the last column.

    :param fileName: path of the CSV file to load
    :param lines: maximum number of rows to read; extra rows in the file
        are ignored
    :return: array of shape (rows_read, 785) where columns 0-783 hold the
        binarised pixels and column 784 holds the label
    """
    # 784 pixel columns plus one label column.
    columnNum = 785
    dataSet = np.zeros((lines, columnNum))
    rowCount = 0
    # The context manager guarantees the handle is closed, and iterating the
    # file object streams it instead of loading every line into memory.
    with open(fileName) as fr:
        for line in fr:
            if rowCount >= lines:
                # The buffer is full; stop instead of indexing past its end.
                break
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            curLine = line.split(',')
            x = [int(int(num) > 128) for num in curLine[1:]]
            y = int(curLine[0])
            dataSet[rowCount] = np.append(x, y)
            rowCount += 1
    # Drop unused rows so that a short file does not yield all-zero rows
    # that would be indistinguishable from real samples of class 0.
    return dataSet[:rowCount]
# Load the training set; loadData's default row count (60000) is used.
train_dataSet = loadData('../Mnist/mnist_train.csv')
# Notebook-style expression: evaluates the shape of the loaded array
# (the value is discarded when this runs as a plain script).
np.shape(train_dataSet)
特征空间(Feature Space)
# Notebook-style expression: the first 784 columns of row 0, i.e. the
# feature part of the first sample (loadData stores the label in column 784).
train_dataSet[0][0:784]
统计学习方法
模型
$$y=\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k)$$
策略
$$\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k)$$
算法
$$P(Y=c_k)=\dfrac{\sum_{i=1}^N I(y_i=c_k)}{N},\quad k=1,2,\cdots,K$$
$$P(X^{(j)}=a_{jl}\mid Y=c_k)=\dfrac{\sum_{i=1}^N I(x_i^{(j)}=a_{jl},\,y_i=c_k)}{\sum_{i=1}^N I(y_i=c_k)},\quad j=1,2,\cdots,n;\ l=1,2,\cdots,S_j;\ k=1,2,\cdots,K$$
def getAllProbability(train_dataSet, featureNum=784, classNum=10):
    """Estimate the log prior and log conditional probability tables.

    Both tables use add-one (Laplace) smoothing and are returned as natural
    logarithms, so that a prediction can add terms instead of multiplying
    many small probabilities.

    :param train_dataSet: 2-D array whose first ``featureNum`` columns are
        binary (0/1) features and whose column ``featureNum`` is the class
        label, an integer in ``[0, classNum)``
    :param featureNum: number of feature columns (defaults to 784)
    :param classNum: number of classes (defaults to 10, the digits 0-9)
    :return: tuple ``(Py, Px_y)`` where ``Py`` has shape ``(classNum, 1)``
        and holds ``log P(Y=k)``, and ``Px_y`` has shape
        ``(classNum, featureNum, 2)`` and holds ``log P(X_j=v | Y=k)``
    """
    # Cast once up front; features and labels are used as integer values.
    trainDataArr = train_dataSet[:, 0:featureNum].astype(int)
    trainLabelArr = train_dataSet[:, featureNum].astype(int)
    sampleNum = len(trainLabelArr)

    Py = np.zeros((classNum, 1))
    Px_y = np.zeros((classNum, featureNum, 2))
    for label in range(classNum):
        # All samples of this class, selected with one boolean mask rather
        # than a Python loop over every sample and every feature.
        classRows = trainDataArr[trainLabelArr == label]

        # Smoothed prior: one pseudo-count per class.
        Py[label] = (len(classRows) + 1) / (sampleNum + classNum)

        # Per-feature counts of the values 0 and 1 inside this class.
        zeroCount = (classRows == 0).sum(axis=0)
        oneCount = (classRows == 1).sum(axis=0)
        # Smoothed conditional: one pseudo-count per feature value (2 values).
        total = zeroCount + oneCount + 2
        Px_y[label, :, 0] = np.log((zeroCount + 1) / total)
        Px_y[label, :, 1] = np.log((oneCount + 1) / total)

    return np.log(Py), Px_y
# Fit the model: compute the prior table Py and the conditional table Px_y
# from the training set.
Py, Px_y = getAllProbability(train_dataSet)
$$y=\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k)$$
def NaiveBayes(Py, Px_y, x):
    """Predict the class of one sample with the naive Bayes decision rule.

    The tables are indexed as ``Px_y[class][feature][value]`` and their
    entries are added, so they are expected to hold log probabilities; the
    class with the largest total score is returned.

    :param Py: prior table, one entry per class (shape ``(classNum, 1)``
        or ``(classNum,)``)
    :param Px_y: conditional table of shape ``(classNum, featureNum, 2)``
    :param x: sample to classify; its first ``featureNum`` entries are used
        as 0/1 feature values
    :return: index of the highest-scoring class as a Python int (the first
        such index when several classes tie)
    """
    Px_y = np.asarray(Px_y)
    # Read the dimensions from the table instead of hard-coding 10 and 784.
    classNum, featureNum = Px_y.shape[0], Px_y.shape[1]

    # Feature values double as indices into the last axis of Px_y.
    featureValues = np.asarray(x[:featureNum]).astype(int)
    featureIdx = np.arange(featureNum)

    # For every class, pick the entry that matches each observed feature
    # value and add them up: shape (classNum, featureNum) -> (classNum,).
    likelihood = Px_y[:, featureIdx, featureValues].sum(axis=1)
    score = likelihood + np.asarray(Py).reshape(-1)[:classNum]

    # argmax returns the first maximum, matching list.index(max(...)).
    return int(np.argmax(score))
假设空间(Hypothesis Space)
$$\left\{ f \,\middle|\, f(x)=\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^n P(X^{(j)}=x^{(j)}\mid Y=c_k) \right\}$$
输出空间
$\mathcal{Y}=\{c_1,c_2,\cdots,c_K\}$
模型评估
测试误差(Test Error)
# Load the test set, sizing the buffer for 10000 rows instead of the default.
test_dataSet = loadData('../Mnist/mnist_test.csv',10000)
# Notebook-style expression: evaluates the shape of the loaded array
# (the value is discarded when this runs as a plain script).
np.shape(test_dataSet)
def model_test(Py, Px_y, test_dataSet ):
    """Measure the classification accuracy on a labelled data set.

    :param Py: prior probability table passed through to NaiveBayes
    :param Px_y: conditional probability table passed through to NaiveBayes
    :param test_dataSet: 2-D array; columns 0-783 are the features and
        column 784 is the true label
    :return: accuracy, i.e. 1 minus the fraction of misclassified samples
    """
    # Split the array into the feature part and the label column.
    samples = test_dataSet[:, 0:784]
    labelRows = test_dataSet[:, 784:785]

    # Count every sample whose prediction differs from its true label.
    wrong = sum(
        1
        for sample, labelRow in zip(samples, labelRows)
        if NaiveBayes(Py, Px_y, sample) != labelRow[0]
    )

    return 1 - (wrong / len(samples))
# Evaluate the fitted tables on the loaded test set.
accuracy = model_test(Py, Px_y, test_dataSet )
# Print the accuracy
print('the accuracy is:', accuracy)