#!/usr/bin/python
from numpy import *
import re
from os import listdir
def loadDataSet():
    """Return the toy message-board corpus and its labels.

    Returns (postingList, classVec): six tokenized posts and a parallel
    list of labels, where 1 marks an abusive post and 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'fea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    """Return the list of unique words across all documents in dataSet."""
    # Union every document's word set in one shot instead of folding
    # document-by-document; order of the result is arbitrary either way.
    return list(set().union(*dataSet))
def setOfWord(vocabList , inputSet):
    """Map a document onto a word-count vector over vocabList.

    Despite the name, this builds a bag-of-words vector: each slot holds
    how many times the corresponding vocabulary word occurs in inputSet
    (note the += below).  Words missing from the vocabulary are reported
    and skipped.

    vocabList -- list of vocabulary words; fixes the vector layout
    inputSet  -- iterable of tokens for one document
    Returns a list of ints, len(vocabList) long.
    """
    # Hoist the O(len(vocabList)) list.index lookup into one dict
    # (word -> slot), turning the loop from O(n*m) into O(n + m).
    slotOf = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in slotOf:
            returnVec[slotOf[word]] += 1
        else:
            # The old backslash-continued string literal leaked the next
            # line's indentation into the message; single-argument print()
            # is valid under both Python 2 and 3.
            print("the word %s is not in my vocabulary" % word)
    return returnVec
def trainNB(trainMat , classLabels):
    """Fit a two-class naive Bayes model with Laplace smoothing.

    trainMat    -- sequence of word-count vectors, one per document
    classLabels -- 0/1 label per document (1 = abusive/spam)
    Returns (p0Vec, p1Vec, pAbusive): per-word log-likelihood vectors
    for class 0 and class 1, and the prior probability of class 1.
    """
    numDocs = len(trainMat)
    numWords = len(trainMat[0])
    # Labels are 0/1, so their sum counts the class-1 documents.
    pAbusive = sum(classLabels) / float(numDocs)
    # Start counts at 1 and totals at 2 (Laplace smoothing) so an unseen
    # word never zeroes out a whole product of probabilities.
    counts = {0: ones(numWords), 1: ones(numWords)}
    totals = {0: 2.0, 1: 2.0}
    for label, row in zip(classLabels, trainMat):
        counts[label] += row
        totals[label] += sum(row)
    # Log-space probabilities guard against floating-point underflow
    # when many per-word terms are later summed in the classifier.
    return log(counts[0] / totals[0]), log(counts[1] / totals[1]), pAbusive
def classifyNB(toClassify , p0Vec , p1Vec , p1Ab):
    """Return 1 when the class-1 log posterior strictly beats class-0.

    toClassify -- word-count vector for the document to label
    p0Vec/p1Vec -- per-word log-likelihoods from trainNB
    p1Ab -- prior probability of class 1
    """
    # Log posterior (up to a shared constant): sum of per-word
    # log-likelihoods weighted by counts, plus the log prior.
    score1 = sum(toClassify * p1Vec) + log(p1Ab)
    score0 = sum(toClassify * p0Vec) + log(1 - p1Ab)
    return 1 if score1 > score0 else 0
def testingNB():
    """Smoke-test the classifier on the hard-coded posting data.

    Trains on all six toy posts, then prints the predicted label for
    two sample documents (one benign, one abusive).
    """
    listPosts , listClasses = loadDataSet()
    vocabList = createVocabList(listPosts)
    trainMat = [setOfWord(vocabList , doc) for doc in listPosts]
    p0v , p1v , pAb = trainNB(trainMat , listClasses)
    # The original multi-argument print STATEMENTS are Python-2-only
    # syntax; single formatted strings print identically on 2 and 3.
    for testEntry in (['love' , 'my' , 'dalmation'] , ['stupid' , 'garbage']):
        thisVec = array(setOfWord(vocabList , testEntry))
        print("%s classify is %d" % (testEntry ,
              classifyNB(thisVec , p0v , p1v , pAb)))
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    The original pattern '\\W*' can match the empty string, which makes
    re.split emit empty tokens (and zero-width split patterns are
    rejected/deprecated on modern Python); r'\\W+' splits on runs of
    non-word characters instead.  Tokens of length <= 2 are dropped to
    filter out noise words like 'is' and 'to'.
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out evaluation of the naive Bayes spam filter.

    Reads parallel corpora from email/spam/<i>.txt and email/ham/<i>.txt,
    holds out 10 random documents as a test set, trains on the rest, and
    prints each misclassification plus the overall error rate.
    """
    docList = [] ; classList = []
    docNum = len(listdir("email/spam"))
    for i in range(1 , docNum + 1):
        # NOTE(review): files are read with the default encoding — the
        # classic corpus contains non-ASCII bytes; confirm encoding.
        wordList = textParse(open("email/spam/%d.txt" % i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open("email/ham/%d.txt" % i).read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Was a hard-coded range(50): derive the pool from the actual corpus
    # size (spam + ham), and make it a real list so that `del` works —
    # range objects are immutable under Python 3.
    trainSet = list(range(2 * docNum)) ; testSet = []
    for i in range(10):
        # numpy's random.uniform (via `from numpy import *`).
        randIndex = int(random.uniform(0 , len(trainSet)))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    trainMat = [] ; trainClass = []
    for docIndex in trainSet:
        trainMat.append(setOfWord(vocabList , docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0 , p1 , pSpam = trainNB(array(trainMat) , array(trainClass))
    errorCount = 0.0
    for docIndex in testSet:
        wordVec = setOfWord(vocabList , docList[docIndex])
        sign = classifyNB(wordVec , p0 , p1 , pSpam)
        if sign != classList[docIndex]:
            errorCount += 1
            # Single-string print() is valid on both Python 2 and 3; the
            # old print statements were Python-2-only syntax.
            print("bayes come out : %d , the real class is %d"
                  % (sign , classList[docIndex]))
    print("the error rate is : %s" % (errorCount / float(len(testSet))))
if __name__ == '__main__':
    # The debug walk-through that used to live here (loadDataSet /
    # createVocabList / setOfWord / trainNB with commented-out prints)
    # only recomputed values the test drivers derive themselves, with no
    # output; removed as dead code.
    # testingNB()  # quick sanity check on the hard-coded posting data
    spamTest()
# naive bayes with python
# 最新推荐文章于 2023-05-22 21:05:37 发布