目录
1. 准备数据:切分文本
"对于一个文本字符串,可以使用Python的string.split()函数将其切分"
# Example sentence; str.split() with no argument breaks on runs of whitespace.
mysent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
tokens = mysent.split()
print(tokens)
输出结果:
"使用正则表达式来切分句子,其中分隔符是除单词、数字外的任意字符串"
import re
# Split on runs of one-or-more non-word characters. The original pattern
# '\\W*' can match the empty string; on Python 3.7+ re.split() then splits
# between every character, yielding one-letter garbage tokens. '\W+' is the
# correct tokenizing pattern.
regEx = re.compile(r'\W+')
listoftokens = regEx.split(mysent)
print(listoftokens)
输出结果:
"去掉里面的空字符,将字符串全部转换为小写"
print([tok.lower() for tok in listoftokens if len(tok) >0])
输出结果:
# Tokenize a real email. Use a context manager so the file handle is
# closed promptly (the original open(...).read() leaked the handle).
with open('email/ham/6.txt') as fp:
    emailtext = fp.read()
listoftokens = regEx.split(emailtext)
print(listoftokens)
2 测试算法:使用朴素贝叶斯进行交叉验证
def textparse(bigstring):
    """Parse a big string into a list of lowercase word tokens.

    Splits on runs of non-word characters and keeps only tokens longer
    than two characters (this also discards the empty strings re.split
    can produce at the edges).
    """
    import re
    # r'\W+' (one or more) is required here: the original r'\W*' can match
    # the empty string, which on Python 3.7+ makes re.split() cut between
    # every single character.
    listoftokens = re.split(r'\W+', bigstring)
    return [tok.lower() for tok in listoftokens if len(tok) > 2]
def spamtest():
    """Run one randomized hold-out trial of the naive-Bayes spam classifier.

    Loads 25 spam and 25 ham emails from email/spam and email/ham, holds
    out 10 randomly chosen documents as a test set, trains on the other 40,
    prints the error rate and returns it (previously returned None, so the
    added return value is backward compatible).

    Relies on createvocablist / setofwords2vec / trainnb0 / classifynb
    defined elsewhere in this file.
    """
    doclist = []; classlist = []; fulltext = []
    for i in range(1, 26):  # load and tokenize each spam/ham pair
        # 'with' closes each file handle; the original open(...).read() leaked them.
        with open('email/spam/%d.txt' % i) as fp:
            wordlist = textparse(fp.read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)  # 1 = spam
        # NOTE(review): some ham files in this dataset are reportedly not
        # valid UTF-8; an explicit encoding= may be needed — confirm against
        # the actual data files.
        with open('email/ham/%d.txt' % i) as fp:
            wordlist = textparse(fp.read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)  # 0 = ham
    vocablist = createvocablist(doclist)
    # Randomly move 10 of the 50 document indices from the training set
    # into the held-out test set.
    trainingset = list(range(50)); testset = []
    for _ in range(10):
        randindex = int(np.random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    trainmat = []; trainingclasses = []
    for docindex in trainingset:  # build the training matrix / labels
        trainmat.append(setofwords2vec(vocablist, doclist[docindex]))
        trainingclasses.append(classlist[docindex])
    p0v, p1v, psam = trainnb0(np.array(trainmat), np.array(trainingclasses))
    errorcount = 0
    for docindex in testset:  # classify each held-out document
        wordvector = setofwords2vec(vocablist, doclist[docindex])
        if classifynb(np.array(wordvector), p0v, p1v, psam) != classlist[docindex]:
            errorcount += 1
    errorrate = errorcount / len(testset)  # true division; float() wrapper was redundant
    print("the error rate is: ", errorrate)
    return errorrate
查看测试结果:
spamtest()
输出结果: