from math import *
# NOTE(review): everything from here down to the second "classified as" print
# repeats the bodies of the functions defined later in this file
# (loadDataSet through testingNB), but without their `def` headers, without
# indentation, and with several of their lines missing. As written, the
# module-level `return` statements and the dangling `else:` cannot run.
# This looks like paste/extraction residue -- confirm whether the span can be
# removed; the code itself is left untouched here.

# Mirrors the body of loadDataSet (no return line).
postingList=[['my','dog','has','flea','problems','help','please'],['maybe','not','take','him','to','dog','park','stupid'], ['my','dalmation','is','so','cute','I','love','him'],['stop','posting','stupid','worthless','garbage'], ['mr','licks','ate','my','steak','how','to','stop','him'],['quit','buying','worthless','dog','food','stupid']]
classVec=[0,1,0,1,0,1]
# Mirrors the body of createVocabList (no return line).
vocabSet=set([])
for document in dataSet:
vocabSet=vocabSet|set(document)
# Mirrors the body of setOfWords2Vec (no return line).
returnVec=[0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)]+=1
else:
print "the word:%s is not in my Volcabulary!" % word
# Mirrors the body of createTrainMatrix.
TrainMatrix=[]
for postinDoc in listOPosts:
TrainMatrix.append(setOfWords2Vec(vocabList,postinDoc))
return TrainMatrix
# Mirrors the body of zeros.
return [0]*num
# Mirrors listDivFloat (header present, final return missing).
def listDivFloat(lt,flo):
temp=[]
length=len(lt)
for i in range(length):
lt[i]=lt[i]/flo
# Mirrors the body of trainNB0 (no return line).
numTrainDocs=len(trainMatrix)
numWords=len(trainMatrix[0])
pAbusive=sum(trainCategory)/float(numTrainDocs)
p0Num=ones(numWords); p1Num=ones(numWords)
p0Denom=2.0;p1Denom=2.0
for i in range(numTrainDocs):
if trainCategory[i]==1:
p1Num+=trainMatrix[i]
p1Denom+=sum(trainMatrix[i])
else:
p0Num+=trainMatrix[i]
p0Denom+=sum(trainMatrix[i])
p1Vect=p1Num/p1Denom
p0Vect=p0Num/p0Denom
# Mirrors the body of classifyNB (the `return 0` after `else:` is missing).
p1=sum(vec2Classify*p1Vec)+log(pClass1)
p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1)
if p1>p0:
return 1
else:
# Mirrors the body of testingNB (the trainNB0 call and the first testEntry
# assignment are missing).
listOPosts,listClass=loadDataSet()
myVocabList=createVocabList(listOPosts)
TrainMatrix=[]
for postinDoc in listOPosts:
TrainMatrix.append(setOfWords2Vec(myVocabList,postinDoc))
thisDoc=array(setOfWords2Vec(myVocabList,testEntry))
print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb)
testEntry=['stupid','garbage']
thisDoc=array(setOfWords2Vec(myVocabList,testEntry))
print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb)
from numpy import *
# Load the data set
def loadDataSet():
    """Return the built-in sample posts and their class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        posts, each a list of word tokens; ``classVec`` holds the 0/1 label
        of the post at the same index. Label 1 presumably marks abusive
        posts (the trainer names the class-1 prior ``pAbusive``) -- confirm.
        Fresh list objects are built on every call.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
# Build the vocabulary list
def createVocabList(dataSet):
    """Return a list of the distinct tokens found across all documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once. The order is
        whatever iterating the underlying set yields, so callers must not
        rely on a particular ordering.
    """
    vocabSet = set()
    for document in dataSet:
        # update() grows the set in place rather than building a brand-new
        # union set for every document.
        vocabSet.update(document)
    return list(vocabSet)
# Count how many times each vocabulary word occurs in a sample
def setOfWords2Vec(vocabList, inputSet):
    """Convert a tokenised sample into a vector of per-word counts.

    Args:
        vocabList: List of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: Iterable of tokens making up one sample.

    Returns:
        A list of ints, the same length as ``vocabList``, where each entry
        is the number of times that vocabulary word occurs in ``inputSet``.
        Tokens that are not in the vocabulary are reported with a printed
        message and otherwise ignored.
    """
    # Build the word -> position map once so each token costs one dict lookup
    # instead of two scans of the vocabulary list. setdefault keeps the first
    # position of a repeated word, which is what list.index() would return.
    positions = {}
    for idx, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = positions.get(word)
        if idx is None:
            # Parenthesised single argument prints identically on Python 2
            # and Python 3.
            print("the word:%s is not in my Volcabulary!" % word)
        else:
            returnVec[idx] += 1
    return returnVec
# Build the training matrix: one word-count vector per training sample
def createTrainMatrix(vocabList, listOPosts):
    """Return the word-count vectors for every post in ``listOPosts``.

    Args:
        vocabList: Vocabulary list that fixes the column order.
        listOPosts: Iterable of tokenised posts.

    Returns:
        A list with one row per post, each row being the result of
        ``setOfWords2Vec(vocabList, post)``. An empty ``listOPosts`` yields
        an empty list.
    """
    return [setOfWords2Vec(vocabList, postinDoc) for postinDoc in listOPosts]
# Create a list of length num filled with 0
def zeros(num):
    """Return a new list containing ``num`` zeros.

    Because this definition comes after the file's ``from numpy import *``,
    the module-level name ``zeros`` refers to this list-returning helper
    rather than to numpy's function of the same name.

    Args:
        num: Desired length; values of 0 or less give an empty list.

    Returns:
        A fresh ``[0, 0, ...]`` list of length ``num``.
    """
    return [0] * num


# NOTE(review): the source had a second, orphaned `return [0]*num` here under
# a comment that described "a list of length num filled with 1", but with no
# `def` line of its own. It could never execute as part of zeros() and has
# been dropped. `ones`, as used by trainNB0, therefore resolves to whatever
# the file's imports provide -- presumably numpy.ones; confirm.
def listDivFloat(lt, flo):
    """Divide every element of ``lt`` by ``flo`` in place.

    Args:
        lt: Mutable sequence of numbers; it is modified, not copied.
        flo: Divisor applied to each element with the ``/`` operator.

    Returns:
        The same ``lt`` object that was passed in, after the division.

    Raises:
        ZeroDivisionError: For ordinary Python numbers when ``flo`` is zero.
    """
    # Only element values are reassigned (the length never changes), so
    # writing back by index while enumerating is safe.
    for i, value in enumerate(lt):
        lt[i] = value / flo
    return lt
# Train on the data set: estimate each word's occurrence probability per class
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class word probabilities and the class-1 prior.

    Args:
        trainMatrix: Sequence of equal-length numeric word-count rows, one
            per training document.
        trainCategory: Sequence of labels, one per row. Rows whose label
            equals 1 count towards class 1; every other row counts towards
            class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple. ``p0Vect`` and ``p1Vect``
        are arrays of smoothed word probabilities for class 0 and class 1
        (plain probabilities, not logarithms); ``pAbusive`` is
        ``sum(trainCategory) / number of documents``.

    Raises:
        ValueError: If ``trainCategory`` does not have one label per row.
        IndexError: If ``trainMatrix`` is empty (there is no first row to
            take the vocabulary size from).
    """
    numTrainDocs = len(trainMatrix)
    if len(trainCategory) != numTrainDocs:
        # A mismatch would otherwise give a wrong prior or a late IndexError.
        raise ValueError(
            "trainCategory has %d labels but trainMatrix has %d rows"
            % (len(trainCategory), numTrainDocs)
        )
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class still ends up with a non-zero probability.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for wordCounts, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += wordCounts
            p1Denom += sum(wordCounts)
        else:
            p0Num += wordCounts
            p0Denom += sum(wordCounts)

    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive
# Decide which class has the larger probability
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word-count vector as class 1 or class 0.

    The score of each class is the log prior plus the count-weighted sum of
    the log word probabilities. The word probabilities are converted to logs
    here because the prior is added in log space; adding raw probabilities
    to a log prior would not correspond to the product of probabilities that
    the decision rule is meant to compare.

    Args:
        vec2Classify: Array of word counts for the sample.
        p0Vec: Word probabilities for class 0 (plain probabilities, all
            strictly positive), aligned with ``vec2Classify``.
        p1Vec: Word probabilities for class 1, same conventions.
        pClass1: Prior probability of class 1.

    Returns:
        ``1`` if the class-1 score is strictly greater than the class-0
        score, otherwise ``0`` (ties go to class 0).
    """
    p1 = sum(vec2Classify * log(p1Vec)) + log(pClass1)
    p0 = sum(vec2Classify * log(p0Vec)) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
# Test the classifier using the trained result
def testingNB():
    """Train on the built-in sample posts and print two test classifications.

    The function builds the vocabulary and training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and then prints one line of
    the form ``<tokens> classified as: <label>`` for each test entry.
    Nothing is returned.
    """
    listOPosts, listClass = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    TrainMatrix = createTrainMatrix(myVocabList, listOPosts)
    p0V, p1V, pAb = trainNB0(array(TrainMatrix), array(listClass))

    # Training is finished above; what follows is the test run.
    testEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    )
    for testEntry in testEntries:
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        label = classifyNB(thisDoc, p0V, p1V, pAb)
        # A single formatted argument prints the same text on Python 2 and
        # Python 3 as the space-separated print statement did.
        print("%s classified as: %s" % (testEntry, label))