使用朴素贝叶斯过滤垃圾邮件
利用上文的方法,对垃圾邮件进行过滤。
1 处理文本数据
1.1 首先收集数据email.zip,如果需要可以私信我。
1.2 分词
1.简单的字符串分词方法可以用.split()
2.对于有特殊字符的,可以用正则表达式
import re
# Split on runs of NON-word characters (\W+) so that punctuation and whitespace
# act as separators; splitting on \w+ would discard the words themselves.
reg = re.compile(r'\W+')
text = reg.split(String)
3.部分单词含有大写字母,统一转换为小写
# str.lower() returns a new string instead of modifying the token in place,
# so the lowercased tokens have to be collected into a new list.
text = [doc.lower() for doc in text]
- 过滤掉过短的单词,只保留长度大于2的单词(长度为1或2的单词会被丢弃)。
# Keep only tokens longer than two characters; the comparison applies to the
# token length, i.e. len(doc) > 2, not len(doc > 2).
text = [doc for doc in text if len(doc) > 2]
1.3 测试吧!
def textParse(bigString):
    """Split a raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens in order of appearance. Tokens of length
        two or less (including the empty strings produced at the edges of the
        split) are dropped.
    """
    import re

    # Runs of non-word characters (punctuation, whitespace) act as separators.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate the naive Bayes spam filter on a random hold-out split.

    Reads messages 1..25 from ``/data/email/spam/<i>.txt`` and
    ``/data/email/ham/<i>.txt``, tokenizes each one with ``textParse``, holds
    out 10 randomly chosen messages as the test set, trains on the remaining
    ones and prints the error rate measured on the held-out messages.

    Relies on ``textParse``, ``createVocabList``, ``setOfWords2Vec``,
    ``trainNB0``, ``classifyNB`` and ``np`` being available in the enclosing
    module scope.

    Returns:
        None. The error rate is printed.
    """
    import random

    docList = []
    classList = []
    for i in range(1, 26):
        # Spam is labelled 1 and ham 0; the two classes are appended alternately.
        for pathTemplate, label in (('/data/email/spam/%d.txt', 1),
                                    ('/data/email/ham/%d.txt', 0)):
            # The context manager closes each file as soon as it has been read.
            with open(pathTemplate % i, encoding="ISO-8859-1") as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocalList = createVocabList(docList)

    # Use a real list rather than a range object: range does not support item
    # deletion, and deleting from a temporary list(range) copy would leave the
    # held-out documents inside the training set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # randrange never returns the upper bound, so the index is always valid.
        randIndex = random.randrange(len(trainingSet))
        # pop() moves the index out of the training set and into the test set,
        # so no document is used for both training and testing.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClass))

    # Classify the held-out documents and count the misclassifications.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocalList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is ', float(errorCount/len(testSet)))
然后执行上述代码
# Run one evaluation; spamTest draws its test split with the random module,
# so the printed error rate can differ from run to run.
spamTest()