构建基本分类器代码:
# coding:utf-8
from numpy import *
def loadDataSet():
    """Return the built-in toy corpus of tokenized posts and their labels.

    Returns:
        A ``(postingList, classVec)`` pair: ``postingList`` is a list of six
        token lists, and ``classVec`` holds the matching labels, where 1
        marks abusive text and 0 marks normal text.
    """
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'position', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quite', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelled_posts]
    classVec = [label for _, label in labelled_posts]
    return postingList, classVec
def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of tokens.

    Returns:
        A list of the unique tokens. The order comes from set iteration and
        is therefore not defined.
    """
    vocabulary = set()
    for document in dataSet:
        # Fold this document's tokens into the running union.
        vocabulary.update(document)
    return list(vocabulary)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Each vocabulary word contributes 1 if it occurs in the document at least
    once and 0 otherwise; repeated occurrences are not counted.

    Args:
        vocabList: list of vocabulary words; the position defines the
            vector index.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of 0/1 integers with the same length as ``vocabList``.
        Tokens missing from the vocabulary are reported on stdout and
        otherwise ignored.
    """
    # Build the word -> first-position map once so each lookup is O(1)
    # instead of a linear ``list.index`` scan per token. ``setdefault``
    # keeps the first position for duplicated entries, matching
    # ``list.index``.
    position_of = {}
    for position, term in enumerate(vocabList):
        position_of.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is None:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
def bagOfWord2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Unlike the set-of-words encoding, every occurrence of a vocabulary word
    increments its slot, so repeated words are counted.

    Args:
        vocabList: list of vocabulary words; the position defines the
            vector index.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of non-negative integer counts with the same length as
        ``vocabList``. Tokens missing from the vocabulary are silently
        skipped.
    """
    # Build the word -> first-position map once so each lookup is O(1)
    # instead of a linear ``list.index`` scan per token.
    position_of = {}
    for position, term in enumerate(vocabList):
        position_of.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
# listOPosts,listClasses=loadDataSet()
# myVocabList=createVocabList(listOPosts)
# print myVocabList
# print setOfWords2Vec(myVocabList,listOPosts[0])
# print setOfWords2Vec(myVocabList,listOPosts[3])
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and class labels.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            document.
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            a label equal to 1 selects class 1, anything else class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log ratios for class 0
        and class 1, and the fraction obtained by summing the labels and
        dividing by the number of documents.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_count)

    # Counts start at one and totals at two, so a word never seen in a class
    # does not produce log(0).
    word_counts = {0: ones(vocab_size), 1: ones(vocab_size)}
    word_totals = {0: 2.0, 1: 2.0}

    for doc_index in range(doc_count):
        bucket = 1 if trainCategory[doc_index] == 1 else 0
        row = trainMatrix[doc_index]
        word_counts[bucket] += row
        word_totals[bucket] += sum(row)

    # Log space avoids underflow when many small probabilities are combined.
    p1Vect = log(word_counts[1] / word_totals[1])
    p0Vect = log(word_counts[0] / word_totals[0])
    return p0Vect, p1Vect, pAbusive
# Train on the built-in sample posts as soon as the module is loaded; the
# resulting names stay available at module level.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
p0V, p1V, PAb = trainNB0(trainMat, listClasses)
# Debugging aid: print PAb, p0V and p1V here to inspect the learned values.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Choose the class whose log score is larger for a word vector.

    Args:
        vec2Classify: numeric array encoding the document.
        p0Vec: per-word log values for class 0.
        p1Vec: per-word log values for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Working with logs turns the product of probabilities into a sum.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
def testingNB():
    """Train on the built-in sample posts and classify two fixed examples.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, fits the model with ``trainNB0`` and prints the
    predicted class for each hard-coded test entry. Returns nothing.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocabulary, testEntry))
        # One pre-formatted argument keeps the output text identical on
        # Python 2 and Python 3 (a multi-argument print() would print a
        # tuple on Python 2).
        print("%s classified as: %s"
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


testingNB()
通过上面分类器对邮件进行分类,添加如下代码:
# Text parsing and the complete spam-email test routine.
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lower-cased tokens obtained by splitting on runs of
        non-word characters; tokens of length two or less are dropped.
    """
    import re

    # Split on one or more non-word characters. The former pattern ``\W*``
    # also matches the empty string, and from Python 3.7 ``re.split`` splits
    # on empty matches, which chops the text into single characters so that
    # every token is then discarded by the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Run one hold-out evaluation of the classifier on the email corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), holds out ten
    randomly chosen documents as the test set, trains on the rest with
    set-of-words vectors and prints the error rate on the held-out
    documents. Returns nothing.

    Raises:
        IOError/OSError: if one of the expected email files cannot be opened.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, so the document order matches the labels.
        for pathTemplate, label in (('email/spam/%d.txt', 1),
                                    ('email/ham/%d.txt', 0)):
            # ``with`` closes each file promptly instead of leaking handles.
            with open(pathTemplate % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # A real list is required: on Python 3 ``range`` is immutable, so items
    # could not be removed from it.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index from the training pool to the test
        # set.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    # One pre-formatted argument prints the same text on Python 2 and 3.
    print('the error rate is: %s' % (float(errorCount) / len(testSet)))


spamTest()