1、基于贝叶斯决策理论的分类方法(核心思想:选择具有最高概率的决策)
朴素贝叶斯
优点:在数据较少的情况下仍然有效,可以处理多类别问题。
缺点:对于输入数据的准备方式较为敏感。
适用类型:标称型数据。
2、条件概率
p(A|B)=p(A and B)/p(B)
贝叶斯准则:p(c|x)=p(x|c)p(c)/p(x)
使用条件概率来分类:
p(ci|x,y)=p(x,y|ci)p(ci)/p(x,y)
若P(c1|x,y)>P(c2|x,y),那么属于类别c1;
若P(c2|x,y)>P(c1|x,y),那么属于类别c2。
3、使用朴素贝叶斯进行文档分类
两个假设:(1)特征之间相互独立;(2)每个特征同等重要。
使用python实现文本分类:
Bayes
import operator
from numpy import *
from math import log
def loadDataset():
    """Create sample tokenized documents for the experiments.

    Returns:
        tuple: (postingList, classVec) -- a list of word-token lists, and a
        parallel list of labels where 1 = abusive text, 0 = normal speech.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec
def createVocabList(Dataset):
    """Build the vocabulary: a list of every unique word across all documents."""
    vocab = set()
    for document in Dataset:
        vocab.update(document)  # union of all document word sets
    return list(vocab)
def setOfWords2Vec(vocablist, inputSet):
    """Set-of-words model: map a document to a 0/1 vector over the vocabulary.

    Given the vocabulary list and one document (word list), return a vector
    the same length as the vocabulary whose entries are 1 if the word occurs
    in the document at all, else 0.
    """
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        try:
            returnVec[vocablist.index(word)] = 1  # presence only, not counts
        except ValueError:
            print('the word %s is not in my vocabulary' % word)
    return returnVec
def bagOfWords2Vec(vocablist, inputSet):
    """Bag-of-words model: map a document to a count vector over the vocabulary.

    Unlike the set-of-words model, each entry records HOW MANY times the
    corresponding vocabulary word appears in the document.
    """
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        try:
            returnVec[vocablist.index(word)] += 1  # accumulate occurrences
        except ValueError:
            print('the word %s is not in my vocabulary' % word)
    return returnVec
def getAllwordsVec(vocablist, Dataset):
    """Convert every document in Dataset to a set-of-words vector.

    Returns the list of vectors (the training matrix).
    """
    return [setOfWords2Vec(vocablist, document) for document in Dataset]
# p(ci|w) = p(w|ci) p(ci) / p(w); by the conditional independence assumption:
# p(w0,w1,w2,...,wn|ci) = p(w0|ci) p(w1|ci) p(w2|ci) ... p(wn|ci)
def trainbayes(trainMat, trainlabels):
    """Train a naive Bayes classifier (binary classification).

    Args:
        trainMat: list of word-count vectors, one per training document.
        trainlabels: list of class labels (0 or 1), parallel to trainMat.

    Returns:
        tuple: (p0Vect, p1Vect, pAbusive) where p0Vect/p1Vect are numpy
        arrays of LOG conditional word probabilities log p(w|ci) for class
        0 / class 1, and pAbusive is the prior p(c=1).
    """
    # File scope shadows numpy's log with math.log; bind numpy's elementwise
    # log locally so it works on the probability vectors below.
    from numpy import log as nplog
    numTrainDocs = len(trainMat)
    numWords = len(trainMat[0])
    # Prior of class 1; binary problem, so p(c=0) = 1 - p(c=1).
    pAbusive = sum(trainlabels) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 (denominators at 2) so a word
    # never seen with a class does not zero out the whole product.
    p0num = ones(numWords)
    p1num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainlabels[i] == 1:
            p1num += trainMat[i]  # numpy elementwise accumulation
            p1Denom += sum(trainMat[i])
        else:
            p0num += trainMat[i]
            p0Denom += sum(trainMat[i])
    # BUG FIX: take the log of the probabilities. classify() sums these
    # vectors and adds log(prior), which is only correct for log
    # probabilities; the log also prevents underflow from multiplying many
    # tiny probabilities.
    p1Vect = nplog(p1num / p1Denom)
    p0Vect = nplog(p0num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classify(testvec, p0vec, p1vec, pClass1):
    """Classify a document vector with naive Bayes.

    Args:
        testvec: numpy word vector of the document to classify.
        p0vec, p1vec: per-word log conditional probabilities for class 0 / 1.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 if the class-1 score is higher, else 0.
    """
    # Score for each class: sum of log(p(w|ci)) over present words + log(p(ci)).
    score1 = sum(testvec * p1vec) + log(pClass1)
    score0 = sum(testvec * p0vec) + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0
def testbayes():
    """Demo: train on the sample documents and classify two test sentences."""
    posts, classes = loadDataset()
    vocab = createVocabList(posts)
    trainMatrix = getAllwordsVec(vocab, posts)
    p0v, p1v, pAb = trainbayes(trainMatrix, classes)
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        testVec = array(setOfWords2Vec(vocab, entry))
        print(classify(testVec, p0v, p1v, pAb))
if __name__ == '__main__':
    # Run the demo only when executed as a script, not on import.
    testbayes()