# If any of the results look wrong, please point it out.
# 1. Collect all features (build the vocabulary)
import numpy as np
#from numpy import *
def loadDataSet():
    """Return the built-in toy corpus of tokenized posts and their labels.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        posts, each a list of word strings. ``classVec`` is the list of
        labels aligned with the posts: 1 marks an abusive post, 0 a
        non-abusive one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = not abusive
    return postingList, classVec
#listOPosts,listclasses = loadDataSet()
#print(listOPosts)
def createVocaList(dataSet):
    """Build the vocabulary: every distinct word that occurs in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words
            that can be ordered against each other (e.g. all strings).

    Returns:
        list: the unique words in ascending sorted order. A list is returned
        (rather than the set) because callers need positional indices.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union: duplicates across documents collapse automatically.
        vocabSet.update(document)
    # Sort so the feature order is the same on every run; plain set iteration
    # order for strings varies with hash randomization.
    return sorted(vocabSet)
#MyvocabList = createVocaList(listOPosts)
#print('特征属性:',MyvocabList)
# 2、找到每条言论对应的词汇向量,每一条言论对应的词汇量是一个列表的形式
def setofwords2Vec(feature, inputset):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        feature: list of vocabulary words; position ``i`` of the result
            corresponds to ``feature[i]``.
        inputset: iterable of words making up one document.

    Returns:
        list[int]: a list of ``len(feature)`` entries, 1 where the vocabulary
        word occurs in ``inputset`` and 0 elsewhere. Words of ``inputset``
        that are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position lookup once instead of scanning the list for
    # every word. setdefault keeps the FIRST position of a repeated word,
    # matching what list.index() would return.
    position = {}
    for idx, vocabWord in enumerate(feature):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(feature)
    for word in inputset:
        idx = position.get(word)
        if idx is None:
            print('the word:%s is not in my vocabList' % (word,))
        else:
            returnVec[idx] = 1
    return returnVec
#trainMatrix = []
# for postinDoc in listOPosts:
# print(postinDoc)
# trainMatrix.append(setofwords2Vec(MyvocabList,postinDoc))
# print(trainMatrix)
#3、根据词汇,求出来每个特征在每种标签下对应的概率
def trainNB0(wordSetVec, labels):
    """Estimate the naive Bayes parameters from 0/1 word vectors.

    Args:
        wordSetVec: 2-D sequence (samples x features) of word-presence
            vectors, one row per training document.
        labels: sequence of class labels aligned with the rows; a label equal
            to 1 means abusive, anything else is treated as class 0.

    Returns:
        tuple: ``(p0Vec, p1Vec, pAbusive)`` where ``p0Vec`` / ``p1Vec`` are
        NumPy arrays holding the log of the smoothed per-feature probability
        within class 0 / class 1, and ``pAbusive`` is the fraction of
        training rows labelled 1.

    Raises:
        ValueError: if there are no training rows, the rows do not form a
            2-D matrix, or ``labels`` has a different length than the rows.
    """
    samples = np.asarray(wordSetVec, dtype=float)
    labelArr = np.asarray(labels)
    numDataSet = len(samples)
    if numDataSet == 0:
        raise ValueError('wordSetVec must contain at least one sample')
    if samples.ndim != 2:
        raise ValueError('wordSetVec must be a 2-D samples x features matrix')
    if labelArr.shape != (numDataSet,):
        raise ValueError('labels must have exactly one entry per sample')

    # One mask drives both the prior and the per-class counts, so the two
    # always agree on which rows belong to class 1.
    isAbusive = labelArr == 1
    pAbusive = float(np.mean(isAbusive))

    abusiveRows = samples[isAbusive]
    normalRows = samples[~isAbusive]

    # Counts start at 1 and the denominators at 2 so that a word unseen in a
    # class never yields a zero probability (and log(0)).
    p1Num = 1.0 + abusiveRows.sum(axis=0)
    p0Num = 1.0 + normalRows.sum(axis=0)
    p1Denom = 2.0 + abusiveRows.sum()
    p0Denom = 2.0 + normalRows.sum()

    # Return logs so the classifier can add terms instead of multiplying many
    # small probabilities.
    p0Vec = np.log(p0Num / p0Denom)
    p1Vec = np.log(p1Num / p1Denom)
    return p0Vec, p1Vec, pAbusive
# p0,p1,pA = trainNB0(trainMatrix,listclasses)
# print('p0:',p0)
#print('p1:',p1)
#4、由上面的概率值,定义朴素贝叶斯分类
def NBC(TestWordsVec, p0Vec, p1Vec, pclasses):
    """Classify one word vector with the trained naive Bayes parameters.

    Args:
        TestWordsVec: 0/1 word-presence vector of the document to classify.
        p0Vec: per-feature log-probabilities for class 0.
        p1Vec: per-feature log-probabilities for class 1.
        pclasses: prior probability of class 1.

    Returns:
        str: ``"1标签"`` when the class-1 log score is strictly greater than
        the class-0 score, otherwise ``"0标签"`` (ties go to class 0 instead
        of falling through and returning ``None``).
    """
    testVec = np.asarray(TestWordsVec)
    # Log scores: sum of the log-likelihoods of the present words plus the
    # log prior of the class.
    p1 = np.dot(testVec, p1Vec) + np.log(pclasses)
    p0 = np.dot(testVec, p0Vec) + np.log(1 - pclasses)
    if p1 > p0:
        return "1标签"
    return "0标签"
#测试算法
def testingNBC():
    """Train on the built-in toy corpus and classify two sample posts.

    Loads the corpus, builds the vocabulary, vectorizes every post, trains
    the naive Bayes parameters, then prints each intermediate result and the
    predicted label for two hard-coded test posts. Returns nothing.
    """
    listOPosts, listclasses = loadDataSet()
    print('词汇列表:', listOPosts)

    MyvocabList = createVocaList(listOPosts)
    print('特征属性:', MyvocabList)

    # One 0/1 presence vector per training post.
    trainMatrix = [
        setofwords2Vec(MyvocabList, postinDoc) for postinDoc in listOPosts
    ]
    print('特征向量:', trainMatrix)

    p0, p1, pA = trainNB0(trainMatrix, listclasses)
    print('概率:', p0, p1, pA)

    testEntry = ['love', 'my', 'dalmation']
    testVec = setofwords2Vec(MyvocabList, testEntry)
    print('分类器分类的类别为:', NBC(testVec, p0, p1, pA))

    testEntry1 = ['stupid', 'garbage']
    testVec1 = setofwords2Vec(MyvocabList, testEntry1)
    print('分类器分类的类别:', NBC(testVec1, p0, p1, pA))
# Run the demo; this call is unguarded, so it also executes on import.
testingNBC()