前言
本文主要对朴素贝叶斯中的例4.1和4.2进行代码实现(文末有完整代码)
1.引入库
import numpy as np
from functools import reduce
2.读入数据
def loadDataSet():
    """Return the training set of Example 4.1 (Li Hang, "Statistical Learning Methods").

    Each sample has two features: x1 in {1, 2, 3} and x2 in {'S', 'M', 'L'}.

    Returns:
        DataList: list of 15 [x1, x2] samples.
        classVec: list of 15 labels in {-1, 1}, parallel to DataList.
    """
    # Textbook table: x1 takes each of 1, 2, 3 exactly five times.
    # (Fixed a data-entry typo: sample index 10 was [2,'L'] instead of [3,'L'],
    #  which gave six 2s and only four 3s.)
    DataList = [[1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
                [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'],
                [3, 'L'], [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L']]
    classVec = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    return DataList, classVec
3.数据处理
def createFeatureList(dataSet):
    """Collect every distinct feature value appearing anywhere in dataSet.

    Returns:
        A list of the unique values (order is whatever set iteration yields).
    """
    values = set()
    for sample in dataSet:
        values.update(sample)  # union with this sample's values
    return list(values)
def Vec(featurelist, inputSet):
    """Encode inputSet as a count vector aligned with featurelist.

    vector[i] counts how many items of inputSet equal featurelist[i];
    items not present in featurelist are silently ignored.
    """
    # Map each value to its first position, replacing repeated .index() scans.
    position = {}
    for i, value in enumerate(featurelist):
        position.setdefault(value, i)
    vector = [0] * len(featurelist)
    for item in inputSet:
        if item in position:
            vector[position[item]] += 1
    return vector
4.没有加拉普拉斯平滑的
def trainNB0(trainMat, classVec):
    """Estimate naive-Bayes parameters without Laplace smoothing.

    Args:
        trainMat: list of count vectors, one per training sample.
        classVec: list of labels in {-1, 1}, parallel to trainMat.

    Returns:
        (p0Vect, p1Vect, py1): log conditional-probability vectors for the
        y=-1 and y=1 classes, and the prior P(y=1).  A feature value never
        seen inside a class yields log(0) = -inf (with a runtime warning).
    """
    labels = np.asarray(classVec)
    counts = np.asarray(trainMat, dtype=float)
    py1 = classVec.count(1) / float(len(trainMat))  # prior P(y=1)
    pos = labels == 1
    # Per-class feature totals divided by the class sample count; logs are
    # taken so the classifier can add instead of multiplying tiny numbers.
    p1Vect = np.log(counts[pos].sum(axis=0) / pos.sum())
    p0Vect = np.log(counts[~pos].sum(axis=0) / (~pos).sum())
    return p0Vect, p1Vect, py1
def classifyNB0(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a count vector using the log-probability vectors from trainNB0.

    Bug fix: the original multiplied the *log* probabilities element-wise
    with reduce, and the product included the zero entries for absent
    features (0 * log p == 0, 0 * -inf == NaN), so both scores collapsed
    and the function returned -1 for every input.  Posteriors are compared
    in the log domain instead, matching classifyNB1:
        score(y) = log P(y) + sum_i vec2Classify[i] * log P(w_i | y)

    Returns:
        1 if the y=1 score is strictly larger, else -1.
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else -1
5.加拉普拉斯平滑的
def trainNB1(trainMat, classVec):
    """Estimate naive-Bayes parameters with Laplace smoothing.

    Identical to trainNB0 except every feature count starts at 1 and each
    class denominator starts at 2, so no conditional probability is zero
    and log never sees 0.

    Args:
        trainMat: list of count vectors, one per training sample.
        classVec: list of labels in {-1, 1}, parallel to trainMat.

    Returns:
        (p0Vect, p1Vect, py1): smoothed log conditional-probability vectors
        and the prior P(y=1).
    """
    labels = np.asarray(classVec)
    counts = np.asarray(trainMat, dtype=float)
    py1 = classVec.count(1) / float(len(trainMat))  # prior P(y=1)
    pos = labels == 1
    # +1 on every count, +2 on every denominator: Laplace smoothing.
    p1Vect = np.log((counts[pos].sum(axis=0) + 1.0) / (pos.sum() + 2.0))
    p0Vect = np.log((counts[~pos].sum(axis=0) + 1.0) / ((~pos).sum() + 2.0))
    return p0Vect, p1Vect, py1
def classifyNB1(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log posterior score.

    score(y) = log P(y) + sum_i vec2Classify[i] * log P(w_i | y).
    The vectors already hold log probabilities, so products become sums.
    """
    score1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    score0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if score1 > score0 else -1
6.运行
if __name__ == "__main__":
    trainList, classVec = loadDataSet()
    # Every distinct value either feature can take.
    featurelist = createFeatureList(trainList)
    trainMat = [Vec(featurelist, sample) for sample in trainList]
    # Both examples classify the same query point x = (2, 'S').
    test = [2, 'S']
    testVec = Vec(featurelist, test)
    # Example 4.1: maximum-likelihood estimates (no smoothing).
    p0V, p1V, py1 = trainNB0(trainMat, classVec)
    pred0 = classifyNB0(testVec, p0V, p1V, py1)
    print("没有加拉普拉斯平滑的分类结果:", pred0)
    # Example 4.2: Laplace-smoothed estimates.
    p0V, p1V, py1 = trainNB1(trainMat, classVec)
    pred1 = classifyNB1(testVec, p0V, p1V, py1)
    print("加拉普拉斯平滑的分类结果:", pred1)
完整代码
import numpy as np
from functools import reduce
def loadDataSet():
    """Return the training set of Example 4.1 (Li Hang, "Statistical Learning Methods").

    Each sample has two features: x1 in {1, 2, 3} and x2 in {'S', 'M', 'L'}.

    Returns:
        DataList: list of 15 [x1, x2] samples.
        classVec: list of 15 labels in {-1, 1}, parallel to DataList.
    """
    # Textbook table: x1 takes each of 1, 2, 3 exactly five times.
    # (Fixed a data-entry typo: sample index 10 was [2,'L'] instead of [3,'L'],
    #  which gave six 2s and only four 3s.)
    DataList = [[1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
                [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'],
                [3, 'L'], [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L']]
    classVec = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    return DataList, classVec
def createFeatureList(dataSet):
    """Collect every distinct feature value appearing anywhere in dataSet.

    Returns:
        A list of the unique values (order is whatever set iteration yields).
    """
    values = set()
    for sample in dataSet:
        values.update(sample)  # union with this sample's values
    return list(values)
def Vec(featurelist, inputSet):
    """Encode inputSet as a count vector aligned with featurelist.

    vector[i] counts how many items of inputSet equal featurelist[i];
    items not present in featurelist are silently ignored.
    """
    # Map each value to its first position, replacing repeated .index() scans.
    position = {}
    for i, value in enumerate(featurelist):
        position.setdefault(value, i)
    vector = [0] * len(featurelist)
    for item in inputSet:
        if item in position:
            vector[position[item]] += 1
    return vector
# Without Laplace smoothing
def trainNB0(trainMat, classVec):
    """Estimate naive-Bayes parameters without Laplace smoothing.

    Args:
        trainMat: list of count vectors, one per training sample.
        classVec: list of labels in {-1, 1}, parallel to trainMat.

    Returns:
        (p0Vect, p1Vect, py1): log conditional-probability vectors for the
        y=-1 and y=1 classes, and the prior P(y=1).  A feature value never
        seen inside a class yields log(0) = -inf (with a runtime warning).
    """
    labels = np.asarray(classVec)
    counts = np.asarray(trainMat, dtype=float)
    py1 = classVec.count(1) / float(len(trainMat))  # prior P(y=1)
    pos = labels == 1
    # Per-class feature totals divided by the class sample count; logs are
    # taken so the classifier can add instead of multiplying tiny numbers.
    p1Vect = np.log(counts[pos].sum(axis=0) / pos.sum())
    p0Vect = np.log(counts[~pos].sum(axis=0) / (~pos).sum())
    return p0Vect, p1Vect, py1
# With Laplace smoothing
def trainNB1(trainMat, classVec):
    """Estimate naive-Bayes parameters with Laplace smoothing.

    Identical to trainNB0 except every feature count starts at 1 and each
    class denominator starts at 2, so no conditional probability is zero
    and log never sees 0.

    Args:
        trainMat: list of count vectors, one per training sample.
        classVec: list of labels in {-1, 1}, parallel to trainMat.

    Returns:
        (p0Vect, p1Vect, py1): smoothed log conditional-probability vectors
        and the prior P(y=1).
    """
    labels = np.asarray(classVec)
    counts = np.asarray(trainMat, dtype=float)
    py1 = classVec.count(1) / float(len(trainMat))  # prior P(y=1)
    pos = labels == 1
    # +1 on every count, +2 on every denominator: Laplace smoothing.
    p1Vect = np.log((counts[pos].sum(axis=0) + 1.0) / (pos.sum() + 2.0))
    p0Vect = np.log((counts[~pos].sum(axis=0) + 1.0) / ((~pos).sum() + 2.0))
    return p0Vect, p1Vect, py1
def classifyNB0(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a count vector using the log-probability vectors from trainNB0.

    Bug fix: the original multiplied the *log* probabilities element-wise
    with reduce, and the product included the zero entries for absent
    features (0 * log p == 0, 0 * -inf == NaN), so both scores collapsed
    and the function returned -1 for every input.  Posteriors are compared
    in the log domain instead, matching classifyNB1:
        score(y) = log P(y) + sum_i vec2Classify[i] * log P(w_i | y)

    Returns:
        1 if the y=1 score is strictly larger, else -1.
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else -1
def classifyNB1(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log posterior score.

    score(y) = log P(y) + sum_i vec2Classify[i] * log P(w_i | y).
    The vectors already hold log probabilities, so products become sums.
    """
    score1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    score0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if score1 > score0 else -1
if __name__ == "__main__":
    trainList, classVec = loadDataSet()
    # Every distinct value either feature can take.
    featurelist = createFeatureList(trainList)
    trainMat = [Vec(featurelist, sample) for sample in trainList]
    # Both examples classify the same query point x = (2, 'S').
    test = [2, 'S']
    testVec = Vec(featurelist, test)
    # Example 4.1: maximum-likelihood estimates (no smoothing).
    p0V, p1V, py1 = trainNB0(trainMat, classVec)
    pred0 = classifyNB0(testVec, p0V, p1V, py1)
    print("没有加拉普拉斯平滑的分类结果:", pred0)
    # Example 4.2: Laplace-smoothed estimates.
    p0V, p1V, py1 = trainNB1(trainMat, classVec)
    pred1 = classifyNB1(testVec, p0V, p1V, py1)
    print("加拉普拉斯平滑的分类结果:", pred1)