# Spam filtering with naive Bayes.
# Note: place the `email` folder in the current working directory.
# ## 1. Set-of-words model
import numpy as np
import pandas as pd
import re
# 1.1文件解析及完整的垃圾邮件测试
def textParse(text):
    """Tokenize *text*: split on non-word runs, lowercase, drop short tokens.

    Bug fix: the original split on ``r'\w+'`` (runs of WORD characters),
    which returns the punctuation/whitespace delimiters instead of the
    words. Splitting on ``r'\W+'`` yields the actual words.

    Returns a list of lowercase tokens longer than 2 characters.
    """
    listOfTokens = re.split(r'\W+', text)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def createWordList(dataSet):
    """Return the vocabulary: a list of every unique token across *dataSet*."""
    vocab = set()
    for document in dataSet:
        vocab.update(document)
    return list(vocab)
def setOfWords2Vec(wordList, inputSet):
    """Encode *inputSet* as a 0/1 presence vector over the vocabulary *wordList*.

    Tokens absent from the vocabulary are reported but otherwise ignored.
    """
    returnVec = [0] * len(wordList)
    for word in inputSet:
        try:
            returnVec[wordList.index(word)] = 1
        except ValueError:
            print('the word:%s is not in my Vocabulary!' % word)
    return returnVec
# 1.2 朴素贝叶斯模型训练
def trainNB(trainMatrix, trainCategory):
    """Train a Bernoulli (set-of-words) naive Bayes model.

    Args:
        trainMatrix: list of 0/1 word-presence vectors, one per document.
        trainCategory: list of class labels (1 = spam, 0 = ham).

    Returns:
        (pClass1, p0Vect, p1Vect, p0AntiVect, p1AntiVect):
        prior of class 1, then per-word log-probabilities of word
        presence (pXVect) and word absence (pXAntiVect) for each class.

    Fixes vs. the original: `inedx` typo (NameError), `p0sum`/`p1sum`
    initialized with the wrong capitalization (NameError on `p1Sum += 1`),
    and `p1AntiVect` mistakenly computed from the class-0 counts.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pClass1 = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1, document counts at 2.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Sum = 2.0
    p1Sum = 2.0
    for index in range(numTrainDocs):
        if trainCategory[index] == 1:
            p1Num += trainMatrix[index]
            p1Sum += 1
        else:
            p0Num += trainMatrix[index]
            p0Sum += 1
    p0Vect = np.log(p0Num / p0Sum)
    p1Vect = np.log(p1Num / p1Sum)
    p0AntiVect = np.log(1 - p0Num / p0Sum)
    p1AntiVect = np.log(1 - p1Num / p1Sum)  # was 1 - p0Num/p0Sum in the original
    return pClass1, p0Vect, p1Vect, p0AntiVect, p1AntiVect
def classifyNB(classifyData, pClass1, p0Vec, p1Vec, p0AntiV, p1AntiV):
    """Classify a 0/1 word vector with the Bernoulli NB model from trainNB.

    Scores each class as log-prior plus the log-likelihood of present words
    (pXVec) and absent words (pXAntiV). Returns 1 (spam) or 0 (ham).

    Fix vs. the original: the parameters were named `p0Antiv`/`p1Antiv`
    while the body referenced `p0AntiV`/`p1AntiV`, raising NameError on
    every call.
    """
    antiClassifyData = 1 - classifyData  # absent-word indicator vector
    p1 = np.log(pClass1) + sum(classifyData * p1Vec) + sum(antiClassifyData * p1AntiV)
    p0 = np.log(1.0 - pClass1) + sum(classifyData * p0Vec) + sum(antiClassifyData * p0AntiV)
    if p1 > p0:
        return 1
    else:
        return 0
def testSetOfWordsModel():
    """Train and evaluate the set-of-words NB spam filter.

    Loads 25 spam + 25 ham emails from `email/`, holds out 10 random
    documents, trains on the rest, and returns the test error rate.

    Fixes vs. the original: the error count was computed but never
    reported or returned, and file handles were never closed.
    """
    docList = []
    classList = []
    fullText = []
    for index in range(1, 26):
        # Spam examples are labeled 1, ham examples 0.
        with open('email/spam/%d.txt' % index) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email/ham/%d.txt' % index) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    myWordList = createWordList(docList)
    # Randomly hold out 10 of the 50 documents as the test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMatrix = []
    trainClass = []
    for docIndex in trainingSet:
        trainMatrix.append(setOfWords2Vec(myWordList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    pClass1, p0V, p1V, p0AntiV, p1AntiV = trainNB(trainMatrix, trainClass)
    errorCount = 0
    for docIndex in testSet:
        wordVector = np.array(setOfWords2Vec(myWordList, docList[docIndex]))
        if classifyNB(wordVector, pClass1, p0V, p1V, p0AntiV, p1AntiV) != classList[docIndex]:
            errorCount += 1
    errorRate = errorCount / float(len(testSet))
    print('the error rate is:', errorRate)
    return errorRate
# ## 2. Bag-of-words model
import numpy as np
import pandas as pd
import re
def textParse(text):
    """Tokenize *text*: split on non-word runs, lowercase, drop short tokens.

    Bug fix: the original split on ``r'\w+'`` (runs of WORD characters),
    which returns the delimiters instead of the words; ``r'\W+'`` splits
    on the punctuation/whitespace between words.

    Returns a list of lowercase tokens longer than 2 characters.
    """
    listOfTokens = re.split(r'\W+', text)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def createWordList(dataSet):
    """Build the vocabulary list of all unique tokens appearing in *dataSet*."""
    return list({word for document in dataSet for word in document})
def bagOfWords2Vec(wordList, inputSet):
    """Encode *inputSet* as a word-count vector over the vocabulary *wordList*.

    Unlike the set-of-words encoding, repeated tokens increment the count.
    Tokens not in the vocabulary are silently skipped.

    Fix vs. the original: dropped the redundant ``if word in inputSet``
    test, which is always true while iterating over ``inputSet`` itself.
    """
    returnVec = [0] * len(wordList)
    for word in inputSet:
        if word in wordList:
            returnVec[wordList.index(word)] += 1
    return returnVec
def trainNB(trainMatrix, trainCategory):
    """Train a multinomial (bag-of-words) naive Bayes model.

    Args:
        trainMatrix: list of word-count vectors, one per document.
        trainCategory: list of class labels (1 = spam, 0 = ham).

    Returns:
        (pClass1, p0Vect, p1Vect): class-1 prior and per-word
        log-probabilities for each class (Laplace-smoothed).

    Fix vs. the original: the class prior was estimated as the fraction
    of TOKENS in class 1 (`p1Sum/(p1Sum+p0Sum)`), not the fraction of
    DOCUMENTS; the prior is now the document fraction, consistent with
    the set-of-words model above.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Document-fraction prior (the standard NB estimate).
    pClass1 = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1; numWords is added to the
    # denominator below to keep the smoothed probabilities normalized.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Sum = 0.0
    p1Sum = 0.0
    for index in range(numTrainDocs):
        if trainCategory[index] == 1:
            p1Num += trainMatrix[index]
            p1Sum += sum(trainMatrix[index])
        else:
            p0Num += trainMatrix[index]
            p0Sum += sum(trainMatrix[index])
    p0Vect = np.log(p0Num / (p0Sum + numWords))
    p1Vect = np.log(p1Num / (p1Sum + numWords))
    return pClass1, p0Vect, p1Vect
def classifyNB(classifyData, pClass1, p0Vec, p1Vec):
    """Classify a word-count vector with the multinomial NB model.

    Compares the two class log-posteriors (log-prior + word log-likelihoods)
    and returns 1 (spam) or 0 (ham).
    """
    logPosterior1 = np.log(pClass1) + sum(classifyData * p1Vec)
    logPosterior0 = np.log(1.0 - pClass1) + sum(classifyData * p0Vec)
    return 1 if logPosterior1 > logPosterior0 else 0
def testBagOfWordsModel():
    """Train and evaluate the bag-of-words NB spam filter.

    Loads 25 spam + 25 ham emails from `email/`, holds out 10 random
    documents, trains on the rest, and returns the test error rate.

    Fixes vs. the original: the error count was computed but never
    reported or returned, and file handles were never closed.
    """
    docList = []
    classList = []
    fullText = []
    for index in range(1, 26):
        # Spam examples are labeled 1, ham examples 0.
        with open('email/spam/%d.txt' % index) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email/ham/%d.txt' % index) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    myWordList = createWordList(docList)
    # Randomly hold out 10 of the 50 documents as the test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMatrix = []
    trainClass = []
    for docIndex in trainingSet:
        trainMatrix.append(bagOfWords2Vec(myWordList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    pClass1, p0V, p1V = trainNB(trainMatrix, trainClass)
    errorCount = 0
    for docIndex in testSet:
        wordVector = np.array(bagOfWords2Vec(myWordList, docList[docIndex]))
        if classifyNB(wordVector, pClass1, p0V, p1V) != classList[docIndex]:
            errorCount += 1
    errorRate = errorCount / float(len(testSet))
    print('the error rate is:', errorRate)
    return errorRate