# A simple application of the naive Bayes model

from numpy import *

def loaddataset():
    """Return the toy training corpus and its class labels.

    Returns:
        A ``(datalist, classvect)`` tuple. ``datalist`` is a list of six
        tokenized posts (each a list of words) and ``classvect`` holds one
        label per post: 0 means the post contains no abusive language,
        1 means it does.
    """
    datalist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'please', 'help', 'a'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him', 'have'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # Class labels: 0 = no abusive language, 1 = contains abusive language.
    classvect = [0, 1, 0, 1, 0, 1]
    return datalist, classvect


# NOTE(review): the `def` line for the block above was missing from the file
# (a bare `return` sat at module level); the function name was chosen to match
# the file's lowercase naming style -- confirm against the original source.
# test_classify() reads `wordlist` and `classvect` as module-level names, so
# they are bound here; `datalist` is kept as an alias of the same corpus.
datalist, classvect = loaddataset()
wordlist = datalist

def createvocabset(dataset):
    """Return a list of the distinct tokens found across all documents.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once. The order is
        whatever set iteration yields, so callers must not rely on it.
    """
    vocabset = set()
    for item in dataset:
        # Grow the set in place rather than building a new union per document.
        vocabset.update(item)
    return list(vocabset)

def word2vect(wordset, testword):
    """Encode a tokenized text as a 0/1 presence vector over the vocabulary.

    Args:
        wordset: Vocabulary as a list of tokens; position ``i`` of the result
            corresponds to ``wordset[i]``.
        testword: Iterable of tokens to encode.

    Returns:
        A list of ints with the same length as ``wordset``. An entry is 1 when
        that vocabulary token occurs in ``testword`` and 0 otherwise. Tokens
        that are not in the vocabulary are reported on stdout and otherwise
        ignored.
    """
    # Map each token to its first position once, instead of scanning the list
    # with `in` and `.index()` for every token being encoded.
    position = {}
    for indx, word in enumerate(wordset):
        position.setdefault(word, indx)

    resultvect = [0] * len(wordset)
    for item in testword:
        indx = position.get(item)
        if indx is None:
            # str() keeps the report from failing on non-string tokens.
            print(str(item) + ' is not in wordlist!')
        else:
            resultvect[indx] = 1
    return resultvect

def calcondiproba(wordlist, wordset, classvect):
    """Estimate smoothed per-class log word likelihoods and the class-1 prior.

    Args:
        wordlist: Training documents, each an iterable of tokens.
        wordset: Vocabulary list; it fixes the length and order of the
            returned arrays.
        classvect: One label per training document. A label equal to 1 puts
            the document in class 1; any other value puts it in class 0.

    Returns:
        A ``(p1, p0, pA)`` tuple. ``p1`` and ``p0`` are arrays of length
        ``len(wordset)`` holding the log of the smoothed word frequencies for
        class 1 and class 0 respectively. ``pA`` is the fraction of training
        documents labelled 1.

    Raises:
        ValueError: If there are no training documents, or if the number of
            labels differs from the number of documents.
    """
    wordmatrix = [word2vect(wordset, item) for item in wordlist]
    if not wordmatrix:
        raise ValueError('calcondiproba needs at least one training document')
    if len(wordmatrix) != len(classvect):
        raise ValueError('wordlist and classvect must have the same length')

    numwords = len(wordset)
    # Counts start at one so a word never seen in a class does not produce
    # log(0) below.
    p1 = ones(numwords)
    p0 = ones(numwords)
    numclass1 = 0
    for vect, label in zip(wordmatrix, classvect):
        if label == 1:
            p1 += vect
            numclass1 += 1
        else:
            p0 += vect

    # Work in log space; classify() adds these terms instead of multiplying
    # many small probabilities.
    p1 = log(p1 / (p1.sum() + 2))
    p0 = log(p0 / (p0.sum() + 2))
    # Count class-1 documents with the same `== 1` test as the loop above so
    # the prior and the likelihoods agree on class membership.
    pA = float(numclass1) / len(wordmatrix)
    return p1, p0, pA

def classify(vect2classify, p1, p0, pA):
    """Pick the class with the larger log posterior score.

    Args:
        vect2classify: Presence vector (list or array of numbers) for the text.
        p1: Per-word log likelihoods for class 1 (list or array).
        p0: Per-word log likelihoods for class 0 (list or array).
        pA: Prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (so ties go to class 0).
    """
    # Coerce to arrays so plain Python lists work for every vector argument;
    # multiplying two lists elementwise would otherwise raise TypeError.
    vect = asarray(vect2classify, dtype=float)
    score1 = (vect * asarray(p1, dtype=float)).sum() + log(pA)
    score0 = (vect * asarray(p0, dtype=float)).sum() + log(1 - pA)
    return 1 if score1 > score0 else 0

def test_classify(testwords):
    """Classify a whitespace-separated sentence and print the predicted label.

    The model is rebuilt on every call from the module-level names
    ``wordlist`` (training documents) and ``classvect`` (their labels), which
    must be defined before this function is called.

    Args:
        testwords: The sentence to classify, as a single string.

    Returns:
        The predicted label (0 or 1), which is also printed.
    """
    tokens = testwords.split()
    wordset = createvocabset(wordlist)
    p1, p0, pA = calcondiproba(wordlist, wordset, classvect)
    testvect = word2vect(wordset, tokens)
    result = classify(testvect, p1, p0, pA)
    print(result)
    return result

# Smoke test: classify two sample sentences and print each predicted label.
mysetence1 = "I have a stupid dog"
mysetence2 = "my dog is worthless"
for _sentence in (mysetence1, mysetence2):
    test_classify(_sentence)


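# NOTE: the text below is web-page residue (a "report this post" menu) picked
# up when this snippet was copied; it is not part of the program:
#   - advertising
#   - plagiarism
#   - copyright
#   - politics
#   - pornography
#   - meaningless
#   - other
#   120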