朴素贝叶斯
1.1准备数据:从文本中构建词向量
'''
创建从词表到向量的转换函数
'''
def loadDataSet():
    """Return a toy corpus of tokenized posts and their labels.

    Returns:
        tuple: (postingList, classVec) where classVec[i] is 1 when
        postingList[i] is abusive and 0 when it is not.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posts, labels
def createVocabList(dataSet):
    """Return a list of the unique words across all documents in dataSet.

    Note: the ordering of the returned list is arbitrary (it comes from
    a set), so callers must not rely on any particular word order.
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a binary set-of-words vector.

    Args:
        vocabList: list of vocabulary words; position i of the output
            corresponds to vocabList[i].
        inputSet: iterable of words making up one document.

    Returns:
        list[int]: a vector of 0/1 flags, 1 where the word occurs in
        the document (word frequency is ignored — set-of-words model).
    """
    # Build the word -> position map once: O(1) lookups per word instead
    # of the O(n) list.index scan the naive version pays for every word.
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print('the word: %s is not in my vocabulary!' % word)
    return returnVec
# Notebook demo cell: load the toy corpus and build its vocabulary.
listOPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOPosts)
# Bare expression — in a notebook this displays the vocabulary list.
# The word order is arbitrary because it comes from a set.
myVocabList
['flea',
'help',
'is',
'has',
'I',
'posting',
'garbage',
'maybe',
'stop',
'steak',
'dog',
'stupid',
'not',
'park',
'ate',
'quit',
'mr',
'licks',
'food',
'how',
'him',
'my',
'so',
'buying',
'problems',
'cute',
'dalmation',
'to',
'love',
'please',
'worthless',
'take']
# Notebook demo cell: vectorize the first post against the vocabulary.
setOfWords2Vec(myVocabList,listOPosts[0 ])
[1,
1,
0,
1,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0]
1.2 训练算法:从词向量计算概率
import numpy as np
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes model from binary document vectors.

    Args:
        trainMatrix: list of set-of-words vectors (one per document),
            each of the same length (the vocabulary size).
        trainCategory: list of 0/1 labels, 1 meaning abusive.

    Returns:
        tuple: (p0Vect, p1Vect, pAbusive) —
            p0Vect: np.ndarray of per-word conditional probabilities
                P(word | class 0),
            p1Vect: np.ndarray of P(word | class 1),
            pAbusive: float prior probability P(class 1).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace (add-one) smoothing: initialize counts to 1 and denominators
    # to 2.0. Without this, any word unseen in a class gets probability 0,
    # which zeroes out the whole naive-Bayes product for every document
    # containing that word (and an empty class would divide by zero).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive