import re

# NOTE: the star import is what brings `random` (numpy.random) into scope for
# spamTest(); it is kept as-is so every name the original exposed stays visible.
from numpy import *

import bayes


def bagOfWords2VecMN(vocabList, inputSet):
    """Return a bag-of-words count vector for *inputSet*.

    Args:
        vocabList: list of vocabulary words; position defines the vector slot.
        inputSet: iterable of words from one document.

    Returns:
        A list of ints, the same length as *vocabList*, where element ``i`` is
        the number of times ``vocabList[i]`` occurs in *inputSet*. Words that
        are not in *vocabList* are ignored.
    """
    # Must be a list of zeros: the original `0*len(vocabList)` produced the
    # integer 0, which cannot be indexed.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def textParse(bigString):
    """Split *bigString* into lower-cased tokens longer than two characters.

    Args:
        bigString: the raw text to tokenize.

    Returns:
        A list of lower-cased tokens, split on runs of non-word characters,
        keeping only tokens with more than two characters.
    """
    # `\W+` rather than `\W*`: a pattern that can match the empty string makes
    # re.split() cut between every character on Python 3.7+.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def _readText(path):
    """Return the full contents of the text file at *path*, closing it after."""
    with open(path) as handle:
        return handle.read()


def spamTest():
    """Run one hold-out evaluation of the naive Bayes spam classifier.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), randomly holds
    out 10 of the 50 documents as a test set, trains on the remaining ones via
    ``bayes.trainNB0`` and prints the fraction of held-out documents for which
    ``bayes.classifyNB`` disagrees with the label.

    Documents are vectorized with ``bayes.setOfWords2Vec``; the
    ``bagOfWords2VecMN`` function defined in this file is not used here.

    Returns:
        None. The error rate is printed to standard output.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        docList.append(textParse(_readText('email/spam/%d.txt' % i)))
        classList.append(1)
        docList.append(textParse(_readText('email/ham/%d.txt' % i)))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)

    # A real list is required because indices are removed below; on Python 3
    # a bare range() object does not support item deletion.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # Draw a random position among the remaining indices and move that
        # document index from the training pool to the test set.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0v, p1v, pSpam = bayes.trainNB0(trainMat, trainClasses)

    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1

    # Single pre-formatted argument so the output is the same text under both
    # the Python 2 print statement and the Python 3 print function.
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))


# The experiment is run twice; each run draws its own random test split.
spamTest()
spamTest()
朴素贝叶斯算法学习笔记(二)使用算法进行交叉验证
最新推荐文章于 2022-07-02 14:52:37 发布