代码及注释如下:
#使用贝叶斯算法实现垃圾邮件过滤
#将一个大字符串解析为字符串列表
def textParse(bigString):
    """Split a large string into a list of lowercase word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore); tokens of two characters or fewer are
    discarded.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, each longer than two characters, in
        their order of appearance.
    """
    import re
    # Use ``\W+`` rather than ``\W*``: a pattern that can match the empty
    # string makes ``re.split`` (Python 3.7+) split between every character,
    # so every token would be at most one character long and the length
    # filter below would throw all of them away.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
#import pandas as pd
docList = []; classList = []; fullText = []
for i in range(1,26):
#wordList = textParse(pd.read_csv('email/spam/%d.txt' %i, sep='\n', encoding='utf8'))
wordList = textParse(open('email/spam/%d.txt' % i).read()) #spam文件夹中的邮件全设为1
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
#wordList = textParse(pd.read_csv('email/ham/%d.txt' % i, sep='\n', encoding='utf8'))
wordList = textParse(open('email/ham/%d.txt' % i).read()) #ham文件夹中的邮件全设为0
docList.appe