朴素贝叶斯分类
一、优缺点
- 优点:在数据较少的时候仍然有效,可以处理多类别问题
- 缺点:对于输入数据的准备方式较为敏感
- 适用数据类型:标称型数据
二、朴素贝叶斯的一般流程
- 收集数据
- 准备数据
- 分析数据
- 训练算法
- 测试算法
- 使用算法
三、代码实例
训练数据
链接: https://pan.baidu.com/s/1s4EubJm38qAUhe6PPieQkA 提取码: abj2
# -*- coding: utf-8 -*-
# @Time : 19-4-4 上午9:43
# @Author : MRB
# @File : email_classify.py
# @Software: PyCharm Community Edition
# import re
import sys,os
from numpy import *
#---------------------------词表到向量的转换函数--------------------------------------------------------
#创建实验样本
def loadDataSet():
    """Build the toy experiment data set.

    Returns:
        (postingList, classVec): a list of tokenized posts and the
        matching label vector (1 = abusive, 0 = not abusive).
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return posts, labels
#创建一个包含所有文档中出现的不重复的列表
def createVocabList(dataSet):
    """Return a list of every distinct word seen across all documents."""
    vocab = set()  # accumulate the union of all document word sets
    for document in dataSet:
        vocab.update(document)
    return list(vocab)
#输出文档向量
def setOfWord2Vec(vocabList, inputSet):
    """Set-of-words model: 0/1 presence vector of inputSet over vocabList.

    Words absent from the vocabulary are reported but otherwise ignored.
    """
    vec = [0] * len(vocabList)
    for word in inputSet:
        try:
            vec[vocabList.index(word)] = 1
        except ValueError:  # word not in the vocabulary
            print("the word:{} is not in my Vocabulary!".format(word))
    return vec
#文档词袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: per-word occurrence counts over vocabList."""
    index_of = vocabList.index  # hoist the method lookup out of the loop
    counts = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            counts[index_of(word)] += 1
    return counts
#---------------------------词表到向量的转换函数 end ------------------------------------------------------
'''
朴素贝叶斯分类器训练算法
'''
def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier.

    :param trainMatrix: document matrix — one 0/1 (or count) word vector per document
    :param trainCategory: label vector, 1 = abusive class, 0 = normal class
    :return: (p0Vect, p1Vect, pAbusive) where p0Vect/p1Vect are the
             log conditional probabilities of each word given class 0/1,
             and pAbusive is the prior probability of class 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labelled abusive (labels are 0/1).
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: initialise counts to 1 and denominators to 2 so no
    # conditional probability is ever 0 (a single zero would wipe out the
    # whole product); log probabilities below turn the product into a sum.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    # Accumulate per-class word counts and total word counts.
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Element-wise division, then log, to avoid underflow when many small
    # probabilities are combined in classifyNB.
    p1Vect = log(p1Num / p1Denom)  # log P(word | class 1)
    p0Vect = log(p0Num / p0Denom)  # log P(word | class 0)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0vec, p1Vec, pClass1):
    """Classify a word vector: return 1 (abusive) or 0 (normal).

    Compares the two class log-posteriors: log prior + sum of the
    log-likelihoods of the words present in vec2Classify.
    """
    logPost1 = log(pClass1) + sum(vec2Classify * p1Vec)
    logPost0 = log(1.0 - pClass1) + sum(vec2Classify * p0vec)
    return 1 if logPost1 > logPost0 else 0
def testingNB():
    """Smoke-test the classifier: train on the toy posts, classify two entries."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
    p0v, p1v, pAb = trainNB0(trainMat, labels)
    # One clearly abusive entry and one clearly benign entry.
    for testEntry in (['stupid', 'garbage'], ['love', 'my', 'dalmation']):
        thisDoc = array(setOfWord2Vec(vocab, testEntry))
        print("{} classified as : {}".format(testEntry, classifyNB(thisDoc, p0v, p1v, pAb)))
'''
文件解析以及完整的垃圾邮件分类
'''
def textParse(bigString):
    """Split raw text into lowercase-insensitive word tokens longer than 2 chars.

    Bug fix: the original pattern r'\W*' matches the empty string, so on
    Python 3.7+ re.split breaks the text between every character and no
    token survives the len > 2 filter. r'\W+' splits on runs of
    non-word characters as intended.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate the spam classifier with a random hold-out split.

    Reads 25 spam and 25 ham emails from email/spam/ and email/ham/,
    holds out 10 random documents as a test set, trains trainNB0 on the
    remaining 40, and prints the test-set error rate.

    Fixes over the original: files are closed via `with` (the originals
    were leaked), the stray debug print inside the hold-out loop is
    removed, and undecodable bytes are ignored on read (the classic
    dataset contains one badly-encoded message that otherwise raises
    UnicodeDecodeError — confirm against your copy of the data).
    """
    docList = []
    classList = []
    fulltext = []
    # Import and parse the text files: spam labelled 1, ham labelled 0.
    for i in range(1, 26):
        with open('email/spam/{}.txt'.format(i), errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fulltext.extend(wordList)
        classList.append(1)
        with open('email/ham/{}.txt'.format(i), errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fulltext.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Randomly move 10 document indices from the training set to the test set.
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is : ", float(errorCount) / len(testSet))
# Script entry point: run the spam-classification experiment directly.
if __name__ == '__main__':
    spamTest()