from numpy import *
def load():
    """Return the built-in sample corpus.

    Returns:
        A ``(documents, labels)`` tuple: ``documents`` is a list of six
        tokenised posts (lists of words) and ``labels`` is the parallel list
        of integer class labels (0 or 1), one per post.
    """
    # Each entry pairs a tokenised post with its class label.
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    documents = [tokens for tokens, _ in labelled_posts]
    labels = [label for _, label in labelled_posts]
    return documents, labels
def creatList(dataSet):
    """Build the vocabulary of a corpus.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding every distinct word exactly once. The order comes
        from set iteration and is therefore arbitrary.
    """
    # One union call over all documents replaces the incremental merge.
    return list(set().union(*dataSet))
def word2Vec(vocList, dataSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocList: Vocabulary list; position ``i`` of the result refers to
            ``vocList[i]``.
        dataSet: Iterable of words making up one document.

    Returns:
        A list of ``len(vocList)`` integers, 1 where the vocabulary word
        occurs in the document and 0 elsewhere. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    presence = [0 for _ in vocList]
    for token in dataSet:
        try:
            slot = vocList.index(token)
        except ValueError:
            # Unknown word: report it and move on.
            print('the word %s is not in vocab list! '% token)
            continue
        presence[slot] = 1
    return presence
def trainNB(data, class_):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        data: Non-empty sequence of equal-length numeric word vectors, one
            per document.
        class_: Sequence of labels parallel to ``data``. A label equal to 1
            puts the document in class 1; any other value puts it in class 0.

    Returns:
        A ``(p0Vect, p1Vect, pa)`` tuple: the per-word log-probability
        vectors for class 0 and class 1, and ``pa``, the sum of the labels
        divided by the number of documents (the class-1 share when labels
        are 0/1).
    """
    doc_total = len(data)
    vocab_size = len(data[0])
    pa = sum(class_) / float(doc_total)

    # Smoothing: every per-word count starts at 1 and every per-class total
    # starts at 2.0, so no word ends up with probability zero (log(0)).
    word_counts = ones((2, vocab_size))  # row 0 -> class 0, row 1 -> class 1
    class_totals = [2.0, 2.0]

    for idx, row in enumerate(data):
        bucket = 1 if class_[idx] == 1 else 0
        word_counts[bucket] += row
        class_totals[bucket] += sum(row)

    # Work in log space so later products of many small numbers do not
    # underflow.
    p1Vect = log(word_counts[1] / class_totals[1])
    p0Vect = log(word_counts[0] / class_totals[0])
    return p0Vect, p1Vect, pa
def classifyNB(vec, p0Vec, p1Vec, pa_):
    """Classify a document vector with trained naive Bayes parameters.

    Args:
        vec: Word vector of the document (presence flags or counts), aligned
            with the vocabulary used for training.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pa_: Prior probability of class 1, i.e. the ``pa`` value returned by
            ``trainNB``.

    Returns:
        0 when class 0 has the strictly larger log-posterior score,
        otherwise 1 (ties go to class 1).
    """
    # Coerce once so plain Python lists are accepted for every argument.
    doc = asarray(vec)
    # pa_ is P(class = 1): it belongs to the class-1 score, and its
    # complement to the class-0 score.
    p1 = (doc * asarray(p1Vec)).sum() + log(pa_)
    p0 = (doc * asarray(p0Vec)).sum() + log(1.0 - pa_)
    return 0 if p0 > p1 else 1
def testNB(testList):
    """Train on the built-in sample posts and print the class of ``testList``.

    Prints the vocabulary, the learned parameters and finally the predicted
    class of the given document. Nothing is returned.

    Args:
        testList: Iterable of words forming the document to classify.
    """
    documents, labels = load()
    vocabulary = creatList(documents)
    print(vocabulary)

    # One set-of-words vector per training document.
    training_rows = [word2Vec(vocabulary, doc) for doc in documents]
    log_p0, log_p1, prior = trainNB(training_rows, labels)
    print(log_p0, '\n', log_p1, '\n pa = \t', prior)

    query = word2Vec(vocabulary, testList)
    predicted = classifyNB(query, log_p0, log_p1, prior)
    # The message below reads "The class of this sample is".
    print('该样本的分类为 :\t', predicted)
def bagWord2Vec(vocList, dataSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocList: Vocabulary list; position ``i`` of the result refers to
            ``vocList[i]``.
        dataSet: Iterable of words making up one document.

    Returns:
        A list of ``len(vocList)`` integers giving how many times each
        vocabulary word occurs in the document. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    counts = [0 for _ in vocList]
    for token in dataSet:
        if token not in vocList:
            print('the word %s is not in vocab list! ' % token)
            continue
        counts[vocList.index(token)] += 1
    return counts
if __name__ == '__main__':
    # Demo run: classify a two-word sample document with the model trained
    # on the built-in posts.
    test1 = ['stupid', 'dalmation']
    testNB(test1)