1. 实验要求:
对垃圾邮件分类算法(书上P66)改进:
1、采用词袋模型
2、随机选择15个测试样本
3、去除长度小于3的单词(字符串)
2. 垃圾邮件分类算法改进点
def bagOfWords2VecMN(vocabList, inputSet):
    """Build a bag-of-words count vector for one document.

    Args:
        vocabList: Sequence of vocabulary words; entry i of the result
            corresponds to vocabList[i]. Words are assumed to be hashable
            (strings in practice).
        inputSet: Iterable of tokens from the document. Repeated tokens are
            counted each time they appear.

    Returns:
        A list of ints with the same length as vocabList, where each entry is
        the number of times that vocabulary word occurs in inputSet. Tokens
        that are not in vocabList are ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (membership test + index lookup) per token.
    # setdefault keeps the first occurrence, matching list.index().
    firstIndex = {}
    for position, word in enumerate(vocabList):
        firstIndex.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def textParse(bigString):  # input is a big string, output is a word list
    """Split raw text into lowercase tokens, dropping short ones.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, obtained by
        splitting bigString on runs of non-word characters and keeping only
        tokens longer than 3 characters.
    """
    import re

    # The pattern must be r'\W+' (one or more non-word characters). r'\W*'
    # can match the empty string, and since Python 3.7 re.split() splits on
    # empty matches too, which chops the text into single characters that
    # the length filter below then discards entirely.
    listOfTokens = re.split(r'\W+', bigString)
    # NOTE(review): this keeps only tokens longer than 3 characters, so
    # 3-letter words are dropped as well -- confirm this is the intended
    # cut-off.
    return [tok.lower() for tok in listOfTokens if len(tok) > 3]
def spamTest():
docList=[]; classList = []; fullText =[]
for i in range(1,26):
wordList =textParse(open('email/spam/%d.txt' % i).read())
doc