#!/usr/bin/python
from numpy import *
import re
from os import listdir
def loadDataSet():
    """Return the toy message-board corpus and its labels.

    Returns (postingList, classVec): six tokenized posts and a parallel
    list of labels, where 1 marks an abusive post and 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'fea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    """Return the list of unique words across all documents in dataSet."""
    # Union every document's word set in one shot instead of folding
    # document-by-document; order of the result is arbitrary either way.
    return list(set().union(*dataSet))
def setOfWord(vocabList , inputSet):
    """Map a document onto a word-count vector over vocabList.

    Despite the name, this builds a bag-of-words vector: each slot holds
    how many times the corresponding vocabulary word occurs in inputSet
    (note the += below).  Words missing from the vocabulary are reported
    and skipped.

    vocabList -- list of vocabulary words; fixes the vector layout
    inputSet  -- iterable of tokens for one document
    Returns a list of ints, len(vocabList) long.
    """
    # Hoist the O(len(vocabList)) list.index lookup into one dict
    # (word -> slot), turning the loop from O(n*m) into O(n + m).
    slotOf = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in slotOf:
            returnVec[slotOf[word]] += 1
        else:
            # The old backslash-continued string literal leaked the next
            # line's indentation into the message; single-argument print()
            # is valid under both Python 2 and 3.
            print("the word %s is not in my vocabulary" % word)
    return returnVec
def trainNB(trainMat , classLabels):
    """Fit a two-class naive Bayes model with Laplace smoothing.

    trainMat    -- sequence of word-count vectors, one per document
    classLabels -- 0/1 label per document (1 = abusive/spam)
    Returns (p0Vec, p1Vec, pAbusive): per-word log-likelihood vectors
    for class 0 and class 1, and the prior probability of class 1.
    """
    numDocs = len(trainMat)
    numWords = len(trainMat[0])
    # Labels are 0/1, so their sum counts the class-1 documents.
    pAbusive = sum(classLabels) / float(numDocs)
    # Start counts at 1 and totals at 2 (Laplace smoothing) so an unseen
    # word never zeroes out a whole product of probabilities.
    counts = {0: ones(numWords), 1: ones(numWords)}
    totals = {0: 2.0, 1: 2.0}
    for label, row in zip(classLabels, trainMat):
        counts[label] += row
        totals[label] += sum(row)
    # Log-space probabilities guard against floating-point underflow
    # when many per-word terms are later summed in the classifier.
    return log(counts[0] / totals[0]), log(counts[1] / totals[1]), pAbusive
def classifyNB(toClassify , p0Vec , p1Vec , p1Ab):
    """Return 1 when the class-1 log posterior strictly beats class-0.

    toClassify -- word-count vector for the document to label
    p0Vec/p1Vec -- per-word log-likelihoods from trainNB
    p1Ab -- prior probability of class 1
    """
    # Log posterior (up to a shared constant): sum of per-word
    # log-likelihoods weighted by counts, plus the log prior.
    score1 = sum(toClassify * p1Vec) + log(p1Ab)
    score0 = sum(toClassify * p0Vec) + log(1 - p1Ab)
    return 1 if score1 > score0 else 0
def testingNB():
    """Smoke-test the classifier on the hard-coded posting data.

    Trains on all six toy posts, then prints the predicted label for
    two sample documents (one benign, one abusive).
    """
    listPosts , listClasses = loadDataSet()
    vocabList = createVocabList(listPosts)
    trainMat = [setOfWord(vocabList , doc) for doc in listPosts]
    p0v , p1v , pAb = trainNB(trainMat , listClasses)
    # The original multi-argument print STATEMENTS are Python-2-only
    # syntax; single formatted strings print identically on 2 and 3.
    for testEntry in (['love' , 'my' , 'dalmation'] , ['stupid' , 'garbage']):
        thisVec = array(setOfWord(vocabList , testEntry))
        print("%s classify is %d" % (testEntry ,
              classifyNB(thisVec , p0v , p1v , pAb)))
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    The original pattern '\\W*' can match the empty string, which makes
    re.split emit empty tokens (and zero-width split patterns are
    rejected/deprecated on modern Python); r'\\W+' splits on runs of
    non-word characters instead.  Tokens of length <= 2 are dropped to
    filter out noise words like 'is' and 'to'.
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out evaluation of the naive Bayes spam filter.

    Reads parallel corpora from email/spam/<i>.txt and email/ham/<i>.txt,
    holds out 10 random documents as a test set, trains on the rest, and
    prints each misclassification plus the overall error rate.
    """
    docList = [] ; classList = []
    docNum = len(listdir("email/spam"))
    for i in range(1 , docNum + 1):
        # NOTE(review): files are read with the default encoding — the
        # classic corpus contains non-ASCII bytes; confirm encoding.
        wordList = textParse(open("email/spam/%d.txt" % i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open("email/ham/%d.txt" % i).read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Was a hard-coded range(50): derive the pool from the actual corpus
    # size (spam + ham), and make it a real list so that `del` works —
    # range objects are immutable under Python 3.
    trainSet = list(range(2 * docNum)) ; testSet = []
    for i in range(10):
        # numpy's random.uniform (via `from numpy import *`).
        randIndex = int(random.uniform(0 , len(trainSet)))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    trainMat = [] ; trainClass = []
    for docIndex in trainSet:
        trainMat.append(setOfWord(vocabList , docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0 , p1 , pSpam = trainNB(array(trainMat) , array(trainClass))
    errorCount = 0.0
    for docIndex in testSet:
        wordVec = setOfWord(vocabList , docList[docIndex])
        sign = classifyNB(wordVec , p0 , p1 , pSpam)
        if sign != classList[docIndex]:
            errorCount += 1
            # Single-string print() is valid on both Python 2 and 3; the
            # old print statements were Python-2-only syntax.
            print("bayes come out : %d , the real class is %d"
                  % (sign , classList[docIndex]))
    print("the error rate is : %s" % (errorCount / float(len(testSet))))
if __name__ == '__main__':
    # The debug walk-through that used to live here (loadDataSet /
    # createVocabList / setOfWord / trainNB with commented-out prints)
    # only recomputed values the test drivers derive themselves, with no
    # output; removed as dead code.
    # testingNB()  # quick sanity check on the hard-coded posting data
    spamTest()
# naive bayes with python
# 最新推荐文章于 2023-05-22 21:05:37 发布