# 目录 (Contents)
# 思路 (Approach)
'''
字符串转单词表
词表转词向量0/1
词向量计算概率p(ci)和p(w|ci)
最后计算类别概率:
p(ci|w) = p(w|ci)*p(ci) = log(p(w|ci)) + log(p(ci))
得到最大概率的预测结果
'''
'''
p(ci)的计算
数据+标签:
[0,1,0,1,0,0,1][1]
[1,1,0,0,0,0,0][0]
[0,1,1,1,1,1,1][1]
p(ci)的计算结果
p(c0)=1/3
p(c1)=2/3
'''
'''
p(w|ci)的计算
数据+标签:
[0,1,0,1,0,0,1][1]
[1,1,0,0,0,0,0][0]
[0,1,1,1,1,1,1][1]
类别为[1]:
p1Num1 = p1Num0 + [0,1,0,1,0,0,1] = [1,2,1,2,1,1,2]
p1Denom1 = p1Denom0 + sum([0,1,0,1,0,0,1]) = 2 + 3 = 5
p1Num2 = p1Num1 + [0,1,1,1,1,1,1] = [1,3,2,3,2,2,3]
p1Denom2 = p1Denom1 + sum([0,1,1,1,1,1,1]) = 5 + 6 = 11
类别为[0]:
p0Num1 = p0Num0 + [1,1,0,0,0,0,0] = [2,2,1,1,1,1,1]
p0Denom1 = p0Denom0 + sum([1,1,0,0,0,0,0]) = 2 + 2 = 4
p(w|ci)的计算结果
p(w|c0) = log(p0Num1/p0Denom1) = log([2,2,1,1,1,1,1]/4) = [pwc0]
p(w|c1) = log(p1Num2/p1Denom2) = log([1,3,2,3,2,2,3]/11) = [pwc1]
'''
'''
测试事件的概率计算p(ci|w) = p(w|ci)*p(ci) = log(p(w|ci)) + log(p(ci)):
p(c0|w) = sum([pwc0]*存在类别0的特征向量)+log(p(c0))
p(c1|w) = sum([pwc1]*存在类别1的特征向量)+log(p(c1))
'''
# bayes.py 程序 (bayes.py program)
'''
数据样本_第一步:句子+标签
'''
def loadDataSet():
    """Return the toy training corpus.

    Returns:
        posts: list of six documents, each a list of word strings.
        labels: list of six class labels (1 = abusive, 0 = not abusive).
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return posts, labels
'''
数据样本_第二步:句子+标签转单词列表 —— 即【 单词表 】
'''
def createVocabList(dataSet):
    """Build the vocabulary list from a corpus.

    Args:
        dataSet: iterable of documents, each a list of word strings.

    Returns:
        list of the unique words appearing anywhere in dataSet
        (order unspecified, since it comes from a set).
    """
    vocabSet = set()
    for document in dataSet:
        # Set union accumulates every distinct word seen so far.
        vocabSet |= set(document)
    return list(vocabSet)
'''
数据样本_第三步:单词列表转同等长度的词向量,如果A句子的单词出现则,对应词向量的位置为1,否则为0 —— 即【 单词状态表 】
'''
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 word-presence vector over vocabList.

    Args:
        vocabList: list of vocabulary words (defines vector positions).
        inputSet: iterable of words in the document.

    Returns:
        list of len(vocabList) ints: 1 where the word occurs in inputSet,
        0 otherwise. Words missing from the vocabulary are reported and
        skipped.
    """
    # Build a word -> position map once: O(1) lookups instead of the
    # O(V) list.index() call per word.
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
from numpy import *
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model from 0/1 word vectors.

    Example (3 docs, 7 words):
        docs  [0,1,0,1,0,0,1] label 1
              [1,1,0,0,0,0,0] label 0
              [0,1,1,1,1,1,1] label 1
        prior p(c1) = 2/3
        class-1 counts start at ones: [1,3,2,3,2,2,3], denom 2+3+6 = 11
        class-0 counts: [2,2,1,1,1,1,1], denom 2+2 = 4

    Args:
        trainMatrix: 2-D numpy array, one 0/1 word vector per document.
        trainCategory: 1-D numpy array of labels (1 = abusive, 0 = not).

    Returns:
        p0Vec: per-word log p(w|c=0).
        p1Vec: per-word log p(w|c=1).
        pAbusive: prior probability p(c=1).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labelled 1.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: counts start at 1 and denominators at 2 so that
    # no conditional probability is zero (log(0) would be -inf).
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Per-word occurrence counts and total word count for class 1.
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log probabilities: products of many small factors become sums,
    # avoiding floating-point underflow at classification time.
    p1Vec = log(p1Num / p1Denom)
    p0Vec = log(p0Num / p0Denom)
    return p0Vec, p1Vec, pAbusive
'''
测试事件的概率计算p(ci|w) = p(w|ci)*p(ci) = log(p(w|ci)) + log(p(ci)):
p(c0|w) = sum([pwc0]*存在类别0的特征向量)+log(p(c0))
p(c1|w) = sum([pwc1]*存在类别1的特征向量)+log(p(c1))
'''
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one 0/1 word vector with a trained naive Bayes model.

    Compares log p(w|ci) + log p(ci) for the two classes; the shared
    evidence term p(w) cancels so it is never computed.

    Args:
        vec2Classify: numpy 0/1 word vector for the document to classify.
        p0Vec: per-word log p(w|c=0) from trainNB0.
        p1Vec: per-word log p(w|c=1) from trainNB0.
        pClass1: prior p(c=1).

    Returns:
        1 if class 1 is more likely, else 0.
    """
    # Multiplying conditionals becomes summing logs; only words present
    # in the document (vector entry 1) contribute.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
if __name__ == "__main__":
    # Demo: train the naive Bayes classifier on the toy corpus, then
    # classify one test sentence.
    listOPosts, listClasses = loadDataSet()
    print("————————————————————————————————————————————————————", "输入数据", "输入标签")
    print(listOPosts, listClasses)
    myVocabList = createVocabList(listOPosts)
    print("————————————————————————————————————————————————————", "转换称词表")
    print(myVocabList)
    # One 0/1 presence vector per document.
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in listOPosts]
    print("————————————————————————————————————————————————————", "训练数据和标签")
    for row in trainMat:
        print(row)
    for label in listClasses:
        print(label)
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    print("————————————————————————————————————————————————————", "每个句子对应的特征向量标号")
    print(p0V)
    print(p1V)
    print(pAb)
    testEntry = ['stupid', 'my']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print("————————————————————————————————————————————————————", "测试——每个句子对应的特征向量标号")
    print(thisDoc)
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))