# coding=utf-8
"""Naive Bayes text classifier demo using a set-of-words document model.

The module builds a vocabulary from a small hard-coded corpus of forum
posts, converts posts to 0/1 word-presence vectors, estimates smoothed
per-class word log-probabilities and classifies new posts as abusive (1)
or normal (0).

Running the file as a script prints the corpus size, the vocabulary, the
vector of the fourth post and two example classifications.
"""
# The star import is kept so that code relying on this module re-exporting
# numpy names keeps working; the code below uses the explicit ``np`` prefix
# because the star import shadows builtins such as ``sum``.
from numpy import *  # noqa: F401,F403
import numpy as np


# --- Prepare data: build word vectors from text -----------------------------

def loadDataSet():
    """Return the demo corpus and its labels.

    Returns:
        A tuple ``(postingList, classVec)``: ``postingList`` is a list of
        tokenized posts (lists of words) and ``classVec`` holds one label
        per post, 1 for abusive and 0 for normal.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return the distinct words found in ``dataSet`` as a list.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list with every distinct word exactly once. The order is the
        iteration order of a ``set`` and is therefore unspecified.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)  # set union with the document's words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document to a 0/1 word-presence vector over ``vocabList``.

    Args:
        vocabList: The vocabulary (list of words).
        inputSet: The document, an iterable of words.

    Returns:
        A list as long as ``vocabList`` whose entry is 1 when the
        corresponding vocabulary word occurs in ``inputSet`` and 0 otherwise.
        Words missing from the vocabulary are reported on stdout and ignored.
    """
    # Build the word -> position table once instead of scanning the list for
    # every word. setdefault keeps the first position of a repeated word,
    # which is what list.index would return.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word:%s is not in my Vocabulary!" % (word,))
        else:
            returnVec[idx] = 1
    return returnVec


# --- Train: compute probabilities from word vectors -------------------------

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: 2-D array-like, one word vector per document.
        trainCategory: 1-D array-like with one label per document; documents
            labelled 1 are treated as abusive, every other label as normal.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: ``p0Vect`` and ``p1Vect`` are
        arrays of per-word *log* conditional probabilities for the normal and
        the abusive class, and ``pAbusive`` is the sum of the labels divided
        by the number of documents (the abusive prior for 0/1 labels).

    Raises:
        ValueError: If ``trainMatrix`` is not a non-empty 2-D matrix or the
            number of labels differs from the number of documents.
    """
    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    if docs.ndim != 2 or docs.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D matrix")
    if labels.ndim != 1 or labels.shape[0] != docs.shape[0]:
        raise ValueError("trainCategory needs exactly one label per document")

    numTrainDocs = docs.shape[0]
    pAbusive = labels.sum() / float(numTrainDocs)

    isAbusive = labels == 1
    abusiveDocs = docs[isAbusive]
    normalDocs = docs[~isAbusive]

    # Counts start at 1 and the totals at 2.0 so that a word never seen in a
    # class does not get probability zero.
    p1Num = 1.0 + abusiveDocs.sum(axis=0)
    p0Num = 1.0 + normalDocs.sum(axis=0)
    p1Denom = 2.0 + abusiveDocs.sum()
    p0Denom = 2.0 + normalDocs.sum()

    # Work in log space: classifyNB adds these terms to the log prior, and
    # sums of logs do not underflow the way products of small numbers do.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


# --- Test: classify with the trained model ----------------------------------

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector with trained naive Bayes parameters.

    Args:
        vec2Classify: Word vector of the document (numpy array).
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the demo corpus and print the class of two sample posts."""
    listOposts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOposts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print("%s classified as: %s"
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


# Module-level demo data, kept available to importers of this module.
listOposts, listClasses = loadDataSet()
myVocabList = createVocabList(listOposts)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOposts]
# To inspect the trained parameters:
#     p0V, p1V, pAb = trainNB0(trainMat, listClasses)


if __name__ == "__main__":
    print(len(listOposts))
    print(myVocabList)
    print(setOfWords2Vec(myVocabList, listOposts[3]))
    testingNB()

# Sample output recorded with the original post (vocabulary order depends on
# the interpreter's set ordering):
#
# ['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park',
#  'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying',
#  'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog',
#  'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']
# [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
#  0, 1, 0, 0, 0, 0, 0]
# ['love', 'my', 'dalmation'] classified as: 0
# ['stupid', 'garbage'] classified as: 1
# 朴素贝叶斯算法学习笔记(一)
# 最新推荐文章于 2020-12-05 17:18:30 发布