参考博客:朴素贝叶斯基础篇之言论过滤器(博主 Jack-Cui,大部分内容转载自该博客)
参考书籍:《机器学习实战》——第四章4.6
1 数据集
ham文件夹 正常邮件
spam文件夹 垃圾邮件
2 数据处理
3 训练
4 测试
#!/usr/bin/env python
#_*_coding:utf-8_*_
from numpy import *
import re
'''
文本分类-过滤垃圾邮件
'''
'''切分文本'''
def textParse(bigString):
    """Tokenize raw text: split on runs of non-word characters and
    return the lowercased tokens longer than 2 characters.

    Bug fix: the original pattern r'\W*' can match the empty string,
    which produces empty tokens and, on Python 3.7+, splits the text
    into individual characters. r'\W+' (one or more separators) is the
    intended delimiter (book errata for "Machine Learning in Action").
    """
    listOfTokens = re.split(r'\W+', bigString)
    # Short tokens ("a", "is", "of", ...) carry little signal; drop them.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
'''创建词汇表'''
def createVocablist(dataset):
    """Build the vocabulary: a list of every distinct token that
    appears in any document of *dataset* (a list of token lists).
    """
    vocab = set()
    for document in dataset:
        vocab.update(document)
    return list(vocab)
'''词汇转向量'''
def setOfWord2Vec(vocabList, dataset):
    """Convert a document (list of tokens) into a set-of-words vector:
    one 0/1 slot per vocabulary entry, 1 if the token occurs at all.
    Unknown tokens are reported but otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for token in dataset:
        if token not in vocabList:
            print("sorry this word %s is not in our vocablist" % token)
            continue
        # Presence flag only (set model), not an occurrence count.
        returnVec[vocabList.index(token)] = 1
    return returnVec
'''计算p(wi|ci)
该类下,该单词出现的概率
需要计算每一类每个单词出现的次数(分子) 和 每一类出现过的总单词数(分母)
'''
def trainNB(trainMatrix, trainCategory):
    """Train the naive-Bayes model.

    trainMatrix   -- list of set-of-words vectors (one per document)
    trainCategory -- parallel list of labels (1 = spam, 0 = ham)

    Returns (p0Vec, p1Vec, pSpam): log word-likelihood vectors for each
    class and the prior probability of the spam class.
    """
    docCount = len(trainMatrix)
    wordCount = len(trainMatrix[0])
    # Prior P(spam) = fraction of spam documents.
    pSpam = float(sum(trainCategory)) / float(docCount)
    # Laplace smoothing: start counts at 1 and denominators at 2 so no
    # word ever gets probability zero.
    spamWordCounts = ones(wordCount)
    hamWordCounts = ones(wordCount)
    spamTotal = 2.0
    hamTotal = 2.0
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            spamWordCounts += docVec
            spamTotal += sum(docVec)
        else:
            hamWordCounts += docVec
            hamTotal += sum(docVec)
    # Log-probabilities to avoid underflow when many factors multiply.
    p1Vec = log(spamWordCounts / spamTotal)
    p0Vec = log(hamWordCounts / hamTotal)
    return p0Vec, p1Vec, pSpam
'''计算文档属于某个类别i的概率:p = p(w0|ci)p(w1|ci)p(w2|ci)~p(wn|ci)*p(ci)以下是二分类'''
def classifyNB(vec2classify, p0vec, p1vec, pclass1):
    """Classify one set-of-words vector with the trained NB model.

    Scores each class in log space: sum of per-word log-likelihoods for
    the words present, plus the log prior. Returns 1 (spam) when the
    spam score wins, else 0 (ham).
    """
    spamScore = sum(vec2classify * p1vec) + log(pclass1)
    hamScore = sum(vec2classify * p0vec) + log(1 - pclass1)
    return 1 if spamScore > hamScore else 0
'''
random.uniform(a, b),用于生成一个指定范围内的随机符点数
'''
def spamTest():
    """Hold-out evaluation of the NB spam filter.

    Loads 25 spam + 25 ham emails from email/spam and email/ham,
    randomly reserves 10 as a test set, trains on the remaining 40,
    and prints the misclassification rate.
    """
    classList = []
    docList = []
    # Load the 50-document corpus: files 1..25 in each folder.
    for i in range(1, 26):
        docList.append(textParse(open('email/spam/%d.txt' % i, 'r').read()))
        classList.append(1)
        docList.append(textParse(open('email/ham/%d.txt' % i, 'r').read()))
        classList.append(0)
    # Vocabulary over the whole corpus.
    vocabList = createVocablist(docList)
    # Random train/test split: move 10 indices out of the training pool
    # (popping removes the index, so no test document repeats).
    trainSetIndex = list(range(50))
    testSetIndex = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainSetIndex)))
        testSetIndex.append(trainSetIndex.pop(randIndex))
    # Vectorize the training documents and train the model.
    trainMat = []
    trainClass = []
    for idx in trainSetIndex:
        trainMat.append(setOfWord2Vec(vocabList, docList[idx]))
        trainClass.append(classList[idx])
    p0Vec, p1Vec, pSpam = trainNB(trainMat, trainClass)
    # Score every held-out document and count mistakes.
    errorCount = 0.0
    for idx in testSetIndex:
        wordVec = setOfWord2Vec(vocabList, docList[idx])
        if classifyNB(wordVec, p0Vec, p1Vec, pSpam) != classList[idx]:
            print("classify wrong:origin %d" % classList[idx])
            errorCount += 1
    print("error rate = %.2f" % (errorCount / len(testSetIndex)))
if __name__ == '__main__':
spamTest()