最近正在学习《机器学习实战》,发现其中的代码比较老,不能在Python 3上直接运行,而且有一些错误的地方。我踩过了各种坑,在这里把修改后的代码分享出来供大家参考。
先贴上利用朴素贝叶斯分类器进行垃圾邮件分类的代码:
import numpy as np
def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token exactly once. The order comes
        from iterating a set, so callers should not rely on it.
    """
    uniqueTokens = set()
    for tokens in dataSet:
        # Merge this document's tokens into the running vocabulary.
        uniqueTokens.update(tokens)
    return list(uniqueTokens)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a binary presence vector over the vocabulary.

    Args:
        vocabList: Sequence of hashable vocabulary words.
        inputSet: Iterable of words found in the document.

    Returns:
        A list of len(vocabList) integers; position i is 1 when
        vocabList[i] occurs in inputSet and 0 otherwise. Words that are not
        in the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # two linear scans (membership test + list.index) per word.
    # setdefault keeps the FIRST position of a repeated word, which is the
    # position list.index() would report.
    positions = {}
    for idx, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = positions.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word vectors and their labels.

    Args:
        trainMatrix: Non-empty sequence of equal-length numeric word vectors,
            one per training document.
        trainCategory: Sequence of labels aligned with trainMatrix; a label
            equal to 1 selects class 1, any other value selects class 0.

    Returns:
        A tuple (p0Vect, p1Vect, pAbusive): the per-word log conditional
        probabilities for class 0 and class 1, and
        sum(trainCategory) / len(trainMatrix).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 holds the class-0 accumulators, index 1 the class-1 ones.
    # Numerators start at 1 (and denominators at 2.0) so that no single
    # P(w|c) is zero, which would zero out the whole product.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    totals = [2.0, 2.0]

    for idx in range(docCount):
        cls = 1 if trainCategory[idx] == 1 else 0
        wordCounts[cls] += trainMatrix[idx]
        totals[cls] += sum(trainMatrix[idx])

    # Work in log space so that multiplying many small probabilities later
    # does not underflow to zero.
    p1Vect = np.log(wordCounts[1] / totals[1])
    p0Vect = np.log(wordCounts[0] / totals[0])
    return p0Vect, p1Vect, pAbusive
def classify0(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the class with the larger naive Bayes log score.

    Args:
        vec2Classify: 1-D word vector (list or array) of the document.
        p0Vec: 1-D per-word log probabilities for class 0.
        p1Vec: 1-D per-word log probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0
        score, otherwise 0.
    """
    # Convert once so plain Python sequences are accepted, and let NumPy do
    # the multiply-and-sum natively instead of the built-in sum() iterating
    # over array elements one by one.
    features = np.asarray(vec2Classify)
    logScore1 = np.dot(features, p1Vec) + np.log(pClass1)
    logScore0 = np.dot(features, p0Vec) + np.log(1 - pClass1)
    return 1 if logScore1 > logScore0 else 0
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lower-cased tokens obtained by splitting on runs of
        non-word characters, keeping only tokens with more than two
        characters.
    """
    import re
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate the naive Bayes spam filter on a random hold-out.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), moves 10 randomly chosen documents into a test set, trains on
    the remaining ones, and prints the fraction of test documents that were
    misclassified.

    Returns:
        None. The error rate is only printed.

    Raises:
        OSError: If one of the email files cannot be opened.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam is class 1 and ham is class 0; the ham documents must come
        # from the ham folder, otherwise both classes hold identical text.
        for folder, label in (('spam', 1), ('ham', 0)):
            # NOTE(review): the platform default text encoding is used here;
            # confirm it can decode every email in the data set.
            with open('email/%s/%d.txt' % (folder, i), 'r') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # Randomly move 10 document indices from the training set to the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClass = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(trainMat, trainClass)

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classify0(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
# Run the experiment once; this executes whenever the module is run or imported.
spamTest()
书中所附的代码,有一处错误是:定义spamTest时,读取文件,原代码是
wordList = textParse(open('email/spam/%d.txt' % i).read())
应该改为
wordList = textParse(open('email/spam/%d.txt' % i,'r').read())
'r'的意思是“只读”。我在没有加上'r'时遇到了报错expected string or bytes-like object;需要说明的是,'r'本身就是open()的默认模式,这个报错通常意味着传给re.split的不是字符串(例如直接传入了文件对象)。同时要注意open(....)只是把这个文件打开并返回一个文件对象,加上.read()才能把文件中的内容读取为字符串,如下所示
fr=open('email/spam/%d.txt' % 1,'r')
print(fr)
[output]:<_io.TextIOWrapper name='email/spam/1.txt' mode='r' encoding='cp936'>
fr=open('email/spam/%d.txt' % 1,'r').read()
print(fr)
[output]:-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! ---