from numpy import *
def loadDataSet():
    """Return the built-in sample corpus and its labels.

    Returns:
        A ``(postingList, classVec)`` pair: ``postingList`` is a list of six
        tokenised posts (lists of words) and ``classVec`` holds one 0/1 label
        per post, in the same order.  New list objects are built on each call.
    """
    # Each entry pairs a tokenised post with its class label.
    labelledPosts = (
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    )
    postingList = [tokens for tokens, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec
def createVocabList(dataSet):
    """Return the unique words of all documents as a list.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once, ordered by first
        appearance in ``dataSet``.

    The order is deterministic on purpose: iterating a ``set`` of strings can
    yield a different order on every interpreter run, which would silently
    change which vector column each word maps to.
    """
    # dict keys keep insertion order, so this de-duplicates without shuffling.
    return list(dict.fromkeys(
        word for document in dataSet for word in document
    ))
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i corresponds to
            element i of the result.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere.

    A notice is printed for every input word that is not in the vocabulary.
    """
    # Build the word -> column lookup once instead of scanning the list twice
    # per input word.  setdefault keeps the first position of a duplicated
    # vocabulary word, which is what list.index would have returned.
    column = {}
    for position, vocabWord in enumerate(vocabList):
        column.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = column.get(word)
        if position is None:
            print("the word:%s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
def trainNB0(trainMatrix, trainCategory, verbose=True):
    """Estimate naive Bayes parameters from word vectors and binary labels.

    Args:
        trainMatrix: 2-D sequence or array with one row per training document
            and one column per vocabulary word.
        trainCategory: 1-D sequence or array with one label per row.  Rows
            labelled 1 are counted for class 1, all other rows for class 0.
        verbose: when true (the default, matching the historical behaviour)
            intermediate values are printed.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` are the
        element-wise natural logs of the smoothed per-word ratios for class 0
        and class 1, and ``pAbusive`` is the fraction of rows labelled 1.

    Raises:
        ValueError: if ``trainMatrix`` is empty or not 2-D, or if the number
            of labels differs from the number of rows.
    """
    docVectors = asarray(trainMatrix, dtype=float)
    labels = asarray(trainCategory)
    if docVectors.ndim != 2 or docVectors.shape[0] == 0:
        raise ValueError(
            "trainMatrix must be a non-empty 2-D collection of word vectors")
    numTrainDocs, numWords = docVectors.shape
    if labels.shape != (numTrainDocs,):
        raise ValueError(
            "trainCategory must hold exactly one label per training document")
    if verbose:
        print("numWords=", numWords)

    # One mask drives both the prior and the per-class counts so they cannot
    # disagree about which rows belong to class 1.
    isClass1 = labels == 1
    pAbusive = isClass1.sum() / float(numTrainDocs)
    if verbose:
        print("pAbusive的值是:")
        print(pAbusive)

    class1Docs = docVectors[isClass1]
    class0Docs = docVectors[~isClass1]
    # Smoothing: every word count starts at 1 and every denominator at 2.0,
    # so a word never seen in a class does not produce log(0).
    p1Num = ones(numWords) + class1Docs.sum(axis=0)
    p0Num = ones(numWords) + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()
    if verbose:
        print("p1Num的值是:")
        print(p1Num)
        print("p1Denom的值是:")
        print(p1Denom)

    p1Vect = log(p1Num / p1Denom)
    if verbose:
        print("++++++")
        print(p1Vect)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Demo run executed at module level: vectorise the sample posts, train the
# classifier on them and echo every intermediate value.
postingList, classVec = loadDataSet()
a = createVocabList(postingList)              # vocabulary
bbb = setOfWords2Vec(a, postingList[0])       # vector of the first post
print("postinglist的值是:", postingList, sep="\n")
print("a的值是:", a, bbb, sep="\n")

# One presence vector per post, in corpus order.
trainMat = []
for postinDoc in postingList:
    trainMat.append(setOfWords2Vec(a, postinDoc))
print("trainMat的值是:", trainMat, "classVec的值是:", classVec, sep="\n")

p0V, p1V, pAb = trainNB0(trainMat, classVec)
print(p0V, p1V, pAb, sep="\n")
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log score for one word vector.

    Args:
        vec2Classify: word vector to classify.
        p0Vec: per-word log values for class 0.
        p1Vec: per-word log values for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = sum of the selected per-word log values + log of the class prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0
def testingNB():
    """Train on the sample corpus and print the class of two test entries.

    Loads the sample posts, builds the vocabulary, trains with ``trainNB0``
    and prints the ``classifyNB`` result for two hard-coded word lists.
    Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # One presence vector per training post.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    testEntry = ['love', 'my', 'dalmation']
    # Both test vectors are plain 1-D arrays, the same shape as a training row.
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, "classified as:", classifyNB(thisDoc, p0V, p1V, pAb))

    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as', classifyNB(thisDoc, p0V, p1V, pAb))
# Run the demo: train on the sample posts and print the two test classifications.
testingNB()
打印结果(程序运行输出):
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
postinglist的值是:
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
a的值是:
['maybe', 'stop', 'cute', 'flea', 'how', 'mr', 'worthless', 'steak', 'licks', 'him', 'garbage', 'has', 'not', 'is', 'stupid', 'problems', 'quit', 'I', 'dalmation', 'posting', 'dog', 'love', 'food', 'ate', 'please', 'to', 'so', 'park', 'buying', 'my', 'take', 'help']
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
trainMat的值是:
[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
classVec的值是:
[0, 1, 0, 1, 0, 1]
numWords= 32
pAbusive的值是:
0.5
p1Num的值是:
[2. 2. 1. 1. 1. 1. 3. 1. 1. 2. 2. 1. 2. 1. 4. 1. 2. 1. 1. 2. 3. 1. 2. 1.
1. 2. 1. 2. 2. 1. 2. 1.]
p1Denom的值是:
21.0
++++++
[-2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244
-1.94591015 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244 -1.65822808 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -1.94591015 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244]
[-3.25809654 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936
-3.25809654 -2.56494936 -2.56494936 -2.15948425 -3.25809654 -2.56494936
-3.25809654 -2.56494936 -3.25809654 -2.56494936 -3.25809654 -2.56494936
-2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936
-2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -1.87180218
-3.25809654 -2.56494936]
[-2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244
-1.94591015 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244 -1.65822808 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -1.94591015 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244]
0.5
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as 1