我们已经讲解过朴素贝叶斯分类器的基本原理和实现:动手实现朴素贝叶斯分类器进行文档分类
在此基础上,我们实现垃圾邮件的过滤,数据为50封txt邮件
(1)将text文本文件,分成单词列表
使用正则表达式,以除字母、数字和下划线之外的连续字符作为分隔符进行切分
并删除长度小于3的字符串
def textParse(bigString):
    """Split a raw text string into a list of lowercase word tokens.

    The string is split on runs of one or more non-word characters
    (anything other than letters, digits and underscore), and tokens
    shorter than three characters are discarded.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, each at least three characters long.
    """
    import re
    # Use `\W+`, not `\W*`: a pattern that can match the empty string makes
    # re.split (Python 3.7+) split between every character, so every token
    # would be a single character and be dropped by the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
(2)垃圾邮件检测
def spamTest():
    """Train and evaluate a naive Bayes spam filter on the sample emails.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), builds a
    vocabulary from all 50 documents, holds out 10 randomly chosen
    documents as the test set, trains on the remaining 40 and prints the
    error rate measured on the held-out documents.

    Relies on ``createVocabList``, ``setOfWord2Vec``, ``trainNB0``,
    ``classifyNB``, ``array`` and ``random`` being available in the
    enclosing module.
    """
    docList = []
    classList = []
    # For each index, load the spam mail first and then the ham mail, so
    # documents alternate spam/ham in docList and classList.
    for i in range(1, 26):
        for folder, label in (('spam', 1), ('ham', 0)):
            path = 'email/{0}/{1}.txt'.format(folder, i)
            # NOTE(review): files are decoded with the platform default
            # encoding; confirm that every sample email decodes cleanly.
            with open(path) as mailFile:
                wordList = textParse(mailFile.read())
            docList.append(wordList)
            classList.append(label)

    # Build the vocabulary from every document.
    vocabList = createVocabList(docList)

    # Index lists for the training set and the test set.
    trainingSet = list(range(50))
    testSet = []
    # Move 10 randomly selected document indices into the test set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Build the training matrix and the matching label list.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # Train the classifier.
    p0v, p1v, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1

    # Report the error rate.
    print('the error rate is: {0}'.format(float(errorCount) / len(testSet)))