简介:该代码实现了一个通用的朴素贝叶斯分类器。这是很久之前写的代码,一直没有整理发布;现在放上代码与数据集,需要的朋友自取。
一:数据集
二:代码
1. normal-bayes.py
import numpy as np
import pandas as pd
def ListToSet(dataList):
    """Flatten a list of iterables into one deduplicated list of elements."""
    merged = set().union(*(set(chunk) for chunk in dataList))
    return list(merged)
def wordToVector(datavector, dataTrain):
    """Encode dataTrain as a set-of-words (0/1) vector over the vocabulary.

    :param datavector: vocabulary list; one output slot per entry
    :param dataTrain: iterable of words to mark as present
    :return: list of 0/1 flags aligned with datavector
    """
    # Build a word->index map once so lookups are O(1) instead of the
    # original O(n) list.index() call per word.
    position = {}
    for index, word in enumerate(datavector):
        # setdefault keeps the FIRST occurrence, matching list.index()
        position.setdefault(word, index)
    wordVector = [0] * len(datavector)
    for word in dataTrain:
        if word in position:
            wordVector[position[word]] = 1
        else:
            # Out-of-vocabulary word: keep the original diagnostic output.
            print("Error word Vector")
    return wordVector
def trainBayes(dataList):
    """Train a naive Bayes model; the last column of each row is the class label.

    :param dataList: [[...features..., label], ...]
    :return: (prePredict, predict, wordDict)
        prePredict: {label: [prior]} -- class prior kept in a one-element list
            because classfy() applies np.log to the whole list
        predict: {label: np.ndarray} -- per-word log conditional probabilities
        wordDict: {label: [...]} -- vocabulary built from that class's samples
    """
    labelData = []
    dataListLen = len(dataList)
    for data in dataList:
        # data[-1:] keeps the label wrapped in a sequence so ListToSet can union it
        labelData.append(data[-1:])
    labelSet = ListToSet(labelData)
    dictLabel = {}
    prePredict = {}
    for label in labelSet:
        dictLabel[label] = []
        prePredict[label] = list()
        count = 0.0
        for data in dataList:
            if data[-1] == label:
                count += 1
                dictLabel[label].append(data[:-1])
        prePredict[label].append(count / dataListLen)
    # Per-class vocabulary and a set-of-words vector for each training sample.
    wordDict = {}
    wordVector = {}
    for label in labelSet:
        wordDict[label] = ListToSet(dictLabel[label])
        wordVector[label] = list()
        for dataTrain in dictLabel[label]:
            wordVector[label].append(wordToVector(wordDict[label], dataTrain))
    # Laplace-style smoothing: counts start at 1, denominator at 1.0.
    predict = {}
    for label in labelSet:
        p1num = np.ones(len(wordVector[label][0]))
        p1demo = 1.0
        for vector in wordVector[label]:
            p1num += vector
            p1demo += sum(vector)
        predict[label] = np.log(p1num / p1demo)
    return prePredict, predict, wordDict
def classfy(testVector, prePredict, predict, key):
    """Log-posterior score of testVector under class `key`.

    :param testVector: 0/1 word vector aligned with predict[key]
    :param prePredict: {label: [prior]} class priors (one-element lists)
    :param predict: {label: np.ndarray} log conditional probabilities
    :param key: label to score
    :return: length-1 ndarray: sum of matched log-likelihoods + log prior
    """
    # Removed the unused `preDecision` local from the original.
    preVal = sum(testVector * predict[key]) + np.log(prePredict[key])
    return preVal
def dataTest(trainData, testData):
    """Train on trainData and return the classification error rate on testData.

    :param trainData: [[...features..., label], ...]; last column is the label
    :param testData: same layout
    :return: fraction of misclassified test rows
    """
    # Removed the stray debug print("test") from the original.
    prePredict, predict, wordDict = trainBayes(trainData)
    testDataLen = len(testData)
    if testDataLen == 0:
        return 0.0  # avoid ZeroDivisionError on an empty test set
    errorCount = 0.0
    for data in testData:
        testDataVal = data[:-1]
        testDataLabel = data[-1]
        # NOTE: test features are used as-is (no dedup) -- this is the
        # deliberate difference from normal-bayes-2.py.
        bestScore = -float("inf")
        bestLabel = ""
        for key in wordDict:
            testVector = wordToVector(wordDict[key], testDataVal)
            score = classfy(testVector, prePredict, predict, key)
            if score > bestScore:
                bestScore = score
                bestLabel = key
        if bestLabel != testDataLabel:
            errorCount += 1
    return errorCount / testDataLen
if __name__ == '__main__':
    # Both CSVs hold rows of features with the class label in the last column.
    trainData = pd.read_csv("trainFile.data").values
    testData = pd.read_csv("testFile.data").values
    print(dataTest(trainData, testData))
2. normal-bayes-2.py
import numpy as np
import pandas as pd
def ListToSet(dataList):
    """Flatten a list of word lists into one deduplicated word list.

    :param dataList: nested iterables of words
    :return: list of the distinct words
    """
    seen = set()
    for group in dataList:
        seen.update(group)
    return list(seen)
def wordToVector(datavector, dataTrain):
    """Encode dataTrain as a set-of-words (0/1) vector over the vocabulary.

    :param datavector: vocabulary list; one output slot per entry
    :param dataTrain: iterable of words to mark as present
    :return: list of 0/1 flags aligned with datavector
    """
    # Build a word->index map once so lookups are O(1) instead of the
    # original O(n) list.index() call per word.
    position = {}
    for index, word in enumerate(datavector):
        # setdefault keeps the FIRST occurrence, matching list.index()
        position.setdefault(word, index)
    wordVector = [0] * len(datavector)
    for word in dataTrain:
        if word in position:
            wordVector[position[word]] = 1
        else:
            # Out-of-vocabulary word: keep the original diagnostic output.
            print("Error word Vector")
    return wordVector
def trainBayes(dataList):
    """Fit a naive Bayes model on rows whose last column is the class label.

    :param dataList: [[...features..., label], ...]
    :return: (prePredict, predict, wordDict) where
        prePredict: {label: [prior]} -- class prior wrapped in a one-element list
        predict: {label: np.ndarray} -- per-word log conditional probabilities
        wordDict: {label: [...]} -- vocabulary built from that class's samples
    """
    total = len(dataList)
    labels = ListToSet([row[-1:] for row in dataList])
    # Group the feature part of every row under its label.
    dictLabel = {}
    prePredict = {}
    for lab in labels:
        samples = [row[:-1] for row in dataList if row[-1] == lab]
        dictLabel[lab] = samples
        prePredict[lab] = [len(samples) / total]
    # Per-class vocabulary plus a set-of-words vector for each sample.
    wordDict = {}
    wordVector = {}
    for lab in labels:
        vocab = ListToSet(dictLabel[lab])
        wordDict[lab] = vocab
        wordVector[lab] = [wordToVector(vocab, sample) for sample in dictLabel[lab]]
    # Smoothed log conditional probabilities: counts start at 1, denom at 1.0.
    predict = {}
    for lab in labels:
        counts = np.ones(len(wordVector[lab][0]))
        denom = 1.0
        for vec in wordVector[lab]:
            counts += vec
            denom += sum(vec)
        predict[lab] = np.log(counts / denom)
    return prePredict, predict, wordDict
def classfy(testVector, prePredict, predict, key):
    """Log-posterior score of testVector under class `key`.

    :param testVector: 0/1 word vector aligned with predict[key]
    :param prePredict: {label: [prior]} class priors (one-element lists)
    :param predict: {label: np.ndarray} log conditional probabilities
    :param key: label to score
    :return: length-1 ndarray: sum of matched log-likelihoods + log prior
    """
    # Removed the unused `preDecision` local from the original.
    preVal = sum(testVector * predict[key]) + np.log(prePredict[key])
    return preVal
def dataTest(trainData, testData):
    """Train on trainData and return the classification error rate on testData.

    :param trainData: [[...features..., label], ...]; last column is the label
    :param testData: same layout
    :return: fraction of misclassified test rows
    """
    prePredict, predict, wordDict = trainBayes(trainData)
    testDataLen = len(testData)
    if testDataLen == 0:
        return 0.0  # avoid ZeroDivisionError on an empty test set
    errorCount = 0.0
    for data in testData:
        testDataVal = data[:-1]
        testDataLabel = data[-1]
        # Deduplicate the test words once; the result does not depend on the
        # label, so hoist it out of the per-label loop (the original rebuilt
        # it for every label).
        # NOTE(review): ListToSet unions set(word) per element, so a
        # multi-character feature string would be split into characters here
        # -- confirm features are single tokens.
        testDataSet = ListToSet(testDataVal)
        bestScore = -float("inf")
        bestLabel = ""
        for key in wordDict:
            testVector = wordToVector(wordDict[key], testDataSet)
            score = classfy(testVector, prePredict, predict, key)
            if score > bestScore:
                bestScore = score
                bestLabel = key
        if bestLabel != testDataLabel:
            errorCount += 1
    return errorCount / testDataLen
if __name__ == '__main__':
    # Both CSVs hold rows of features with the class label in the last column.
    trainData = pd.read_csv("trainFile.data").values
    testData = pd.read_csv("testFile.data").values
    print(dataTest(trainData, testData))
"""
normal-bayes.py 测试数据集没有进行去重操作,错误率达到了 54% 左右
normal-bayes-2.py 测试数据集进行了去重处理, 错误率 30% 左右, 降低了 24% 错误率
"""
朴素贝叶斯参考博客(机器学习实战)