# Naive Bayes classifier source code, annotated with comments and docstrings.
from numpy import *
def loadDataSet():
    ''' () -> list of lists, list
    Return a small hand-built corpus: a list of tokenized forum posts and
    a parallel list of class labels (0 = not abusive, 1 = abusive).
    >>> postingList, classVec = loadDataSet()
    >>> len(postingList)
    6
    >>> classVec
    [0, 1, 0, 1, 0, 1]
    '''
    # Keep each post next to its label so the pairing is obvious at a glance.
    labeledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [post for post, _ in labeledPosts]
    classVec = [label for _, label in labeledPosts]
    return postingList, classVec
def createVocabList(dataSet):
    ''' (list of lists) -> list
    Return a sorted list of the unique words appearing in dataSet.
    Sorting makes the vocabulary order deterministic across runs; plain
    set iteration order depends on string hash seeding, so word vectors
    built from an unsorted vocabulary are not reproducible.
    >>> createVocabList([['b', 'a'], ['c', 'a']])
    ['a', 'b', 'c']
    '''
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union in this post's words
    return sorted(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    ''' (list, list) -> list
    Return a 0/1 vector parallel to vocabList: position i is 1 if
    vocabList[i] occurs in inputSet at least once, else 0 (set-of-words
    model). Words of inputSet that are not in vocabList are ignored.
    >>> setOfWords2Vec(['apple', 'pen', 'good'], ['pen', 'is', 'a', 'good', 'writing', 'tool'])
    [0, 1, 1]
    '''
    # Precompute word -> index once; the original's `vocabList.index(word)`
    # rescanned the list for every word, O(len(inputSet) * len(vocabList)).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
    return returnVec
def bagOfWords2Vec(vocabList, inputSet):
    ''' (list, list) -> list
    Return a count vector parallel to vocabList: position i is the number
    of times vocabList[i] occurs in inputSet (bag-of-words model).
    Words of inputSet that are not in vocabList are ignored.
    >>> bagOfWords2Vec(['apple', 'pen', 'good'], ['pen', 'is', 'a', 'good', 'writing', 'tool', 'pen'])
    [0, 2, 1]
    '''
    # Precompute word -> index once; the original's `vocabList.index(word)`
    # rescanned the list for every word, O(len(inputSet) * len(vocabList)).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] += 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    ''' (list of lists, list) -> 1D-array, 1D-array, float
    Train a naive Bayes model on word vectors.

    trainMatrix is a list of word vectors (one per post, each as long as
    the vocabulary); trainCategory is the list of labels, where 0 marks a
    non-abusive post and 1 an abusive post.

    Return (p0Vect, p1Vect, pAbusive):
      - p0Vect: log of P(word | non-abusive) for every vocabulary word
      - p1Vect: log of P(word | abusive) for every vocabulary word
      - pAbusive: the prior P(abusive), i.e. the fraction of label-1 posts

    Counts start at 1 and denominators at 2 (Laplace smoothing) so no
    word gets probability zero; log-space avoids underflow when many
    per-word probabilities are multiplied during classification.
    '''
    numTrainDocs = len(trainMatrix)   # number of posts
    numWords = len(trainMatrix[0])    # vocabulary size (length of each word vector)
    # Prior probability of the abusive class.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Smoothed per-word counts and per-class totals.
    p0Num, p1Num = ones(numWords), ones(numWords)
    p0Denom, p1Denom = 2, 2
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec           # word occurrences in abusive posts
            p1Denom += sum(docVec)    # total words in abusive posts
        else:
            p0Num += docVec           # word occurrences in non-abusive posts
            p0Denom += sum(docVec)    # total words in non-abusive posts
    # Conditional probabilities in log space.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2classify, p0Vect, p1Vect, pClass1):
    ''' (array, array, array, float) -> str
    Classify a word vector with log-space naive Bayes.

    p0Vect and p1Vect hold the log conditional word probabilities for the
    non-abusive and abusive classes (as produced by trainNB0); pClass1 is
    the prior P(abusive). The comparison uses the log posterior up to the
    shared evidence term P(document), which cancels:
        log P(c | doc) ~ sum(vec2classify * logP(word | c)) + log P(c)
    Return 'Abusive post' or 'Non-abusive post'.
    '''
    logPosterior1 = sum(vec2classify * p1Vect) + log(pClass1)
    logPosterior0 = sum(vec2classify * p0Vect) + log(1 - pClass1)
    return 'Abusive post' if logPosterior1 > logPosterior0 else 'Non-abusive post'
def testingNB():
    ''' () -> NoneType
    Self-contained smoke test: train on the toy corpus from loadDataSet()
    and classify two sample posts, printing each post with its prediction.
    >>> testingNB()
    ['love', 'my', 'dalmation'] classified as: Non-abusive post
    ['stupid', 'garbage'] classified as: Abusive post
    '''
    listOfPosts, listOfClasses = loadDataSet()
    vocabList = createVocabList(listOfPosts)
    trainMatrix = [setOfWords2Vec(vocabList, post) for post in listOfPosts]
    p0V, p1V, pAb = trainNB0(array(trainMatrix), array(listOfClasses))
    # print(...) with one pre-formatted string prints identically under
    # Python 2 and Python 3 (the original used a Python-2-only print statement).
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(vocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))
    # Fixed typo: was 'grabage', which is not in the vocabulary and so was
    # silently dropped from the word vector.
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(vocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))