朴素贝叶斯分类

最新推荐文章于 2018-06-06 15:24:09 发布

星之空殇

最新推荐文章于 2018-06-06 15:24:09 发布

阅读量432

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/dengjiaxing0321/article/details/50771708

版权

机器学习专栏收录该内容

4 篇文章 0 订阅

订阅专栏

这篇博客介绍了如何利用朴素贝叶斯算法构建基本的分类器，特别地，它展示了将这种分类器应用于邮件分类的实践代码。

摘要由CSDN通过智能技术生成

构建基本分类器代码：

# coding:utf-8
from numpy import *
def loadDataSet():
	"""Return the toy message-board corpus together with its labels.

	Returns:
		A ``(postingList, classVec)`` pair: six tokenised posts and, in the
		same order, one label per post (1 = abusive text, 0 = normal text).
	"""
	# Each sample keeps a post's tokens next to its label.
	labelledPosts=[
		(['my','dog','has','flea','problems','help','please'],0),
		(['maybe','not','take','him','to','dog','park','stupid'],1),
		(['my','dalmation','is','so','cute','I','love','him'],0),
		(['stop','position','stupid','worthless','garbage'],1),
		(['mr','licks','ate','my','steak','how','to','stop','him'],0),
		(['quite','buying','worthless','dog','food','stupid'],1),
	]
	postingList=[tokens for tokens,_ in labelledPosts]
	classVec=[flag for _,flag in labelledPosts]
	return postingList,classVec
def createVocabList(dataSet):
	"""Build the vocabulary of every distinct token found in ``dataSet``.

	Args:
		dataSet: iterable of documents, each an iterable of hashable,
			mutually comparable tokens (e.g. strings).

	Returns:
		A sorted list of the unique tokens. Sorting pins the column order
		of the word vectors built from this vocabulary; plain set iteration
		order can differ from one interpreter run to the next, which would
		make previously trained vectors unusable with a rebuilt vocabulary.
	"""
	vocabSet=set()
	for document in dataSet:
		vocabSet.update(document)  # in-place union with this document's tokens
	return sorted(vocabSet)

def setOfWords2Vec(vocabList,inputSet):
	"""Encode a document as a set-of-words vector (presence flags only).

	Args:
		vocabList: list of vocabulary words; position i names column i.
		inputSet: iterable of the document's words.

	Returns:
		A list as long as ``vocabList`` holding 1 for every vocabulary word
		that occurs in ``inputSet`` (repeats still give 1) and 0 elsewhere.
		Words missing from the vocabulary are reported on stdout and skipped.
	"""
	# Map each word to its first position once, instead of scanning
	# vocabList twice ("in" plus "index") for every input word.
	wordIndex={}
	for position,token in enumerate(vocabList):
		wordIndex.setdefault(token,position)
	returnVec=[0]*len(vocabList)
	for word in inputSet:
		if word in wordIndex:
			returnVec[wordIndex[word]]=1
		else:
			# Parenthesised single argument prints the same on Python 2 and 3.
			print("the word: %s is not in my Vocabulary!" % word)
	return returnVec
def bagOfWord2VecMN(vocabList,inputSet):
	"""Encode a document as a bag-of-words vector (occurrence counts).

	Args:
		vocabList: list of vocabulary words; position i names column i.
		inputSet: iterable of the document's words.

	Returns:
		A list as long as ``vocabList`` whose entry i is the number of times
		``vocabList[i]`` occurs in ``inputSet``. Words missing from the
		vocabulary are ignored silently.
	"""
	# Map each word to its first position once, instead of scanning
	# vocabList twice ("in" plus "index") for every input word.
	wordIndex={}
	for position,token in enumerate(vocabList):
		wordIndex.setdefault(token,position)
	returnVec=[0]*len(vocabList)
	for word in inputSet:
		if word in wordIndex:
			returnVec[wordIndex[word]]+=1
	return returnVec
# listOPosts,listClasses=loadDataSet()
# myVocabList=createVocabList(listOPosts)
# print myVocabList
# print setOfWords2Vec(myVocabList,listOPosts[0])
# print setOfWords2Vec(myVocabList,listOPosts[3])
def trainNB0(trainMatrix,trainCategory):
	"""Estimate naive Bayes parameters from labelled word vectors.

	Args:
		trainMatrix: non-empty sequence of equal-length numeric word
			vectors, one per document.
		trainCategory: one label per document; a label equal to 1 puts the
			document in class 1, any other value puts it in class 0.

	Returns:
		``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` are the
		per-word log conditional probabilities for class 0 / class 1 and
		``pAbusive`` is the sum of the labels divided by the document count
		(the class-1 prior when labels are 0/1).
	"""
	docCount=len(trainMatrix)
	vocabSize=len(trainMatrix[0])
	pAbusive=sum(trainCategory)/float(docCount)
	# Word counts start at 1 and totals at 2.0 so that a word never seen
	# in a class does not end up with probability zero.
	counts={0:ones(vocabSize),1:ones(vocabSize)}
	totals={0:2.0,1:2.0}
	for row,wordVec in enumerate(trainMatrix):
		label=1 if trainCategory[row]==1 else 0
		counts[label]+=wordVec
		totals[label]+=sum(wordVec)
	# Log form lets the classifier add terms instead of multiplying them.
	return log(counts[0]/totals[0]),log(counts[1]/totals[1]),pAbusive
# Train once on the toy corpus when the script is loaded; the resulting
# p0V, p1V and PAb stay available as module-level names.
listOPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOPosts)
trainMat=[setOfWords2Vec(myVocabList,post) for post in listOPosts]
p0V,p1V,PAb=trainNB0(trainMat,listClasses)
# print PAb
# print p0V
# print p1V
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
	"""Pick the class with the larger log score for one word vector.

	Args:
		vec2Classify: word vector of the document to classify.
		p0Vec: per-word log probabilities for class 0.
		p1Vec: per-word log probabilities for class 1.
		pClass1: prior probability of class 1.

	Returns:
		1 when the class-1 score is strictly greater than the class-0 score,
		otherwise 0 (ties go to class 0).
	"""
	# Score = sum of the log probabilities of the words present + log prior.
	classOneScore=sum(vec2Classify*p1Vec)+log(pClass1)
	classZeroScore=sum(vec2Classify*p0Vec)+log(1.0-pClass1)
	return 1 if classOneScore>classZeroScore else 0
def testingNB():
	"""Train on the toy corpus and print the predicted class of two samples.

	Builds the vocabulary and set-of-words training matrix from
	``loadDataSet()``, trains with ``trainNB0`` and prints one line per
	sample entry in the form ``<entry> classified as: <label>``.
	Returns nothing.
	"""
	listOPosts,listClasse=loadDataSet()
	myVocabList=createVocabList(listOPosts)
	trainMat=[setOfWords2Vec(myVocabList,postinDoc) for postinDoc in listOPosts]
	p0V,p1V,pAb=trainNB0(array(trainMat),array(listClasse))
	for testEntry in (['love','my','dalmation'],['stupid','garbage']):
		thisDoc=array(setOfWords2Vec(myVocabList,testEntry))
		# One pre-formatted argument prints identically on Python 2 and 3;
		# the old print statement is a SyntaxError on Python 3.
		print('%s classified as: %s'%(testEntry,classifyNB(thisDoc,p0V,p1V,pAb)))
# Run the toy-corpus demo as soon as this script is loaded.
testingNB()

要用上面的分类器对邮件进行分类，可在其后添加如下代码：

# Text parsing and the complete spam e-mail test function
def textParse(bigString):
	"""Split raw text into lower-case tokens longer than two characters.

	Args:
		bigString: the text to tokenise.

	Returns:
		A list of the lower-cased runs of word characters in ``bigString``
		whose length exceeds 2, in order of appearance.
	"""
	import re
	# Split on runs of ONE OR MORE non-word characters. The former pattern
	# r'\W*' also matches the empty string; since Python 3.7 re.split
	# splits at empty matches, which cut the text into single characters
	# so that every token was dropped by the length filter below.
	listOfTokens=re.split(r'\W+',bigString)
	return [tok.lower() for tok in listOfTokens if len(tok)>2]
def spamTest():
	"""Hold-out evaluation of the classifier on the e-mail corpus.

	Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
	``email/ham/1.txt`` .. ``25.txt`` (label 0), moves 10 randomly chosen
	documents into a test set, trains on the remaining 40 and prints the
	error rate measured on the held-out documents. Returns nothing.
	"""
	docList=[];classList=[]
	for i in range(1,26):
		# Same document order as before: spam i first, then ham i.
		for pathPattern,label in (('email/spam/%d.txt',1),('email/ham/%d.txt',0)):
			# "with" closes each file promptly instead of leaking the handle.
			with open(pathPattern%i) as emailFile:
				docList.append(textParse(emailFile.read()))
			classList.append(label)
	vocabList=createVocabList(docList)
	# A real list is needed here: Python 3 range objects do not support
	# item deletion, so removing the sampled index would raise TypeError.
	trainingSet=list(range(50));testSet=[]
	for i in range(10):
		# "random" is numpy.random, brought in by the file's numpy star import.
		randIndex=int(random.uniform(0,len(trainingSet)))
		testSet.append(trainingSet.pop(randIndex))
	trainMat=[setOfWords2Vec(vocabList,docList[docIndex]) for docIndex in trainingSet]
	trainClasses=[classList[docIndex] for docIndex in trainingSet]
	p0V,p1V,pSpam=trainNB0(array(trainMat),array(trainClasses))
	errorCount=0
	for docIndex in testSet:
		wordVector=setOfWords2Vec(vocabList,docList[docIndex])
		if classifyNB(array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
			errorCount+=1
	# One pre-formatted argument prints identically on Python 2 and 3.
	print('the error rate is: %s'%(float(errorCount)/len(testSet)))
# Run the spam hold-out test as soon as this script is loaded; it reads
# its e-mail files through the relative paths used inside spamTest.
spamTest()