一、步骤
(1)收集数据:提供文本文件。
(2)准备数据:将文本文件解析成词条向量。
(3)分析数据:检查词条确保解析的正确性。
(4)训练算法:使用我们之前建立的trainNB0()函数。
(5)测试算法:使用classifyNB(),并且构建一个新的测试函数来计算文档集的错误率。
(6)使用算法:构建一个完整的程序对一组文档进行分类,将错分的文档输出到屏幕上。
二、实例+注释
# coding: utf-8
from numpy import *
import sys
def loadDataSet():
    """Collect data: return a tiny hand-built corpus of posts and labels.

    Returns:
        posting_list: tokenized documents (each a list of words).
        class_vec: parallel labels, 1 = abusive wording, 0 = normal speech.
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'T', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posting_list, class_vec
def createVocabList(dataSet):
    """Build the deduplicated vocabulary list over every document in dataSet."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # union with this document's words
    return list(vocab)
def setOfWordsVec(vocabList, inputSet):
    """Prepare data: convert a document into a set-of-words 0/1 vector.

    Args:
        vocabList: vocabulary list; one output slot per vocabulary word.
        inputSet: the tokens of one document.

    Returns:
        List of 0/1 flags, 1 where the vocab word occurs in the document.
        Unknown words are reported but otherwise ignored.
    """
    # Start from an all-zero vector, one slot per vocabulary word.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            # Fix: single-argument print() is valid on both Python 2 and
            # Python 3; the original Python 2 print statement is a
            # SyntaxError under Python 3.
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Prepare data: bag-of-words vector (per-word counts, not 0/1 flags).

    Bug fix: the original tested `if word in inputSet`, which is always
    true inside `for word in inputSet`, so any word absent from the
    vocabulary crashed with ValueError in vocabList.index(). Membership
    must be tested against vocabList; unknown words are skipped.

    Args:
        vocabList: vocabulary list; one output slot per vocabulary word.
        inputSet: the tokens of one document.

    Returns:
        List of occurrence counts aligned with vocabList.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier.

    Args:
        trainMatrix: numpy array, one word-count/flag vector per document.
        trainCategory: numpy array of 0/1 labels, 1 = abusive.

    Returns:
        (p0Vect, p1Vect, pAbusive): log word-probability vectors for the
        normal (0) and abusive (1) classes, and the prior P(abusive).
    """
    docCount = len(trainMatrix)      # number of training documents
    wordCount = len(trainMatrix[0])  # vocabulary size
    # Prior: fraction of documents labelled abusive.
    pAbusive = sum(trainCategory) / float(docCount)
    # Smoothing: counts start at 1 and denominators at 2 so that no
    # word probability can ever be exactly zero (which would wipe out
    # the whole product).
    counts = {0: ones(wordCount), 1: ones(wordCount)}
    denoms = {0: 2.0, 1: 2.0}
    for row, label in zip(trainMatrix, trainCategory):
        counts[label] += row
        denoms[label] += sum(row)
    # Log probabilities turn later products into sums.
    p1Vect = log(counts[1] / denoms[1])
    p0Vect = log(counts[0] / denoms[0])
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: return 1 (abusive) or 0 (normal).

    Works in log space, so the per-class product of word probabilities
    becomes a sum: log(ab) = log(a) + log(b).
    """
    logP1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logP0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logP1 > logP0 else 0
def textParse(bigString):
    """Prepare data: split raw text into lowercase word tokens.

    Bug fix: the original pattern r'\W*' can match the empty string, so
    on modern Python re.split() splits between every character and
    returns single-letter "tokens" (older versions warned about it).
    r'\W+' splits on runs of one or more non-word characters, which is
    the intended behaviour.

    Args:
        bigString: raw text of one document.

    Returns:
        List of non-empty lowercase tokens.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]
def spamTest():
    """Train on the email corpus and print the hold-out error rate.

    Reads 25 spam ('email/spam/%d.txt') and 25 ham ('email/ham/%d.txt')
    files, randomly holds out 10 documents as a test set, trains with
    trainNB0(), then reports classifyNB()'s error rate on the hold-out.

    Fixes vs. the original:
      * files are opened with `with` so handles are always closed;
      * trainingSet is a real list — `del range_obj[i]` fails on
        Python 3, where range objects are immutable;
      * print() call syntax works on both Python 2 and Python 3.
    """
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i) as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email/ham/%d.txt' % i) as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # vocabulary over all 50 docs
    trainingSet = list(range(50))  # indices of the candidate training docs
    testSet = []
    # Randomly move 10 document indices from the training set to the
    # test set (random here is numpy.random via `from numpy import *`).
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWordsVec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWordsVec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: %f' % (float(errorCount) / len(testSet)))
def testingNB():
    """Smoke-test the classifier on the toy posting data set.

    Trains on loadDataSet()'s six posts, then classifies two sample
    entries and prints the predicted label for each.

    Fix vs. the original: print() call syntax is required on Python 3
    (the original used Python 2 print statements).
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)  # all unique words
    trainMat = []
    for postingDoc in listOPosts:
        trainMat.append(setOfWordsVec(myVocabList, postingDoc))
    # Train first: we need p0V, p1V and the abusive prior pAb.
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # Each test entry is converted to a vector aligned with the vocab.
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWordsVec(myVocabList, testEntry))
    print(testEntry, 'classified as : ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWordsVec(myVocabList, testEntry))
    print(testEntry, 'classified as : ', classifyNB(thisDoc, p0V, p1V, pAb))
# Entry point: run the spam-filter hold-out evaluation demo.
if __name__=="__main__":
    spamTest()