邮件数据下载地址:http://download.csdn.net/detail/liyuefeilong/9106481
具体代码如下:
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 08 16:12:55 2015
@author: Administrator
"""
from numpy import *
def loadDataSet():
    """Build a small hand-written corpus of tokenized posts with labels.

    Real data would need preprocessing (e.g. stripping punctuation) first.

    Returns:
        (postingList, listClass): token lists for six documents and a
        parallel label list (1 = contains insulting language, 0 = clean).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    listClass = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, listClass
def createNonRepeatedList(data):
    """Collect the vocabulary: every distinct token across all documents.

    Args:
        data: iterable of documents, each a list of token strings.

    Returns:
        A list of unique tokens (order is whatever the set produces).
    """
    vocab = set()
    for document in data:
        vocab.update(document)  # set union removes duplicates for free
    return list(vocab)
def detectInput(vocList, inputStream):
    """Encode a token sequence as a set-of-words vector over *vocList*.

    Args:
        vocList: vocabulary list; vector positions follow its order.
        inputStream: iterable of tokens to encode.

    Returns:
        A list of 0/1 flags, one per vocabulary word (1 = word occurs at
        least once in inputStream). Unknown words are reported and skipped.
    """
    # Perf fix: the original called vocList.index(word) per token, which is
    # O(len(vocList)) each time; one dict makes every lookup O(1).
    positions = {word: i for i, word in enumerate(vocList)}
    returnVec = [0] * len(vocList)
    for word in inputStream:
        if word in positions:
            returnVec[positions[word]] = 1
        else:
            print("The word :%s is not in the vocabulary!" % word)
    return returnVec
def trainNaiveBayes(trainMatrix, classLabel):
    """Estimate naive-Bayes parameters from a binary document-term matrix.

    Args:
        trainMatrix: 2-D numpy array, one 0/1 word vector per document.
        classLabel: 1-D numpy array of labels (1 = abusive, 0 = clean).

    Returns:
        (p0, p1, pBase): per-word log-probability vectors for class 0 and
        class 1, and the prior probability of class 1.
    """
    docCount = len(trainMatrix)
    wordCount = len(trainMatrix[0])
    pBase = sum(classLabel) / float(docCount)
    # Laplace smoothing: counts start at 1 and denominators at 2 so that
    # no conditional probability is ever exactly zero.
    numerators = {0: ones(wordCount), 1: ones(wordCount)}
    denominators = {0: 2.0, 1: 2.0}
    for vector, label in zip(trainMatrix, classLabel):
        numerators[label] += vector
        denominators[label] += sum(vector)
    p0 = log(numerators[0] / denominators[0])
    p1 = log(numerators[1] / denominators[1])
    return p0, p1, pBase
# Module-level scratch list; note that testNaiveBayes() and spamTest() below
# each build their own local trainMat, so this binding is effectively unused.
trainMat = []
#print trainMat
# test the algorithm
def naiveBayesClassify(vec2Classify, p0, p1, pBase):
    """Classify one word vector: 1 (abusive) or 0 (clean).

    Compares the two class log-posteriors (log-likelihood plus log-prior);
    ties go to class 0.
    """
    score0 = sum(vec2Classify * p0) + log(1 - pBase)
    score1 = sum(vec2Classify * p1) + log(pBase)
    return 1 if score1 > score0 else 0
def testNaiveBayes():
    """Train on the toy corpus and classify two hand-picked token lists.

    Prints each sample together with its predicted class (1 = abusive).
    """
    documents, labels = loadDataSet()
    vocab = createNonRepeatedList(documents)
    matrix = [detectInput(vocab, doc) for doc in documents]
    p0, p1, pBase = trainNaiveBayes(array(matrix), array(labels))
    for testInput in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(detectInput(vocab, testInput))
        print(testInput, 'the classified as: ', naiveBayesClassify(thisDoc, p0, p1, pBase))
def textParse(bigString):
    """Tokenize *bigString*: split on non-word runs, lowercase, keep len > 2.

    Bug fix: the original pattern r'\W*' matches the empty string, so on
    Python 3.7+ re.split splits between every character — every token ends
    up a single character and the len > 2 filter discards them all,
    returning []. r'\W+' splits only on actual runs of non-word characters.

    Returns:
        List of lowercase tokens longer than two characters.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out evaluation of the naive-Bayes spam filter on 50 emails.

    Loads 25 spam and 25 ham messages from fixed local paths, holds out 10
    randomly chosen documents as a test set, trains on the remaining 40,
    and prints the test-set error rate.
    """
    docList = []     # parsed documents
    classList = []   # per-document labels (1 = spam, 0 = ham)
    fullText = []    # flat list of all tokens (kept for parity; unused below)
    for i in range(1, 26):
        # Bug fix: the originals leaked 50 open file handles
        # (open(...).read() with no close) — use `with` so each is closed.
        with open('C:/Users/ryq/Desktop/email/email/spam/%d.txt' % i, encoding='Shift_JIS') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('C:/Users/ryq/Desktop/email/email/ham/%d.txt' % i, encoding='Shift_JIS') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createNonRepeatedList(docList)
    # Randomly move 10 of the 50 document indices into the test set.
    # `random` here is numpy.random (file does `from numpy import *`).
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        print(randIndex)
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []  # word vectors and labels for the 40 training docs
    for docIndex in trainingSet:
        trainMat.append(detectInput(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNaiveBayes(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = detectInput(vocabList, docList[docIndex])
        if naiveBayesClassify(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
# Run the evaluation only when executed as a script, so importing this
# module no longer triggers the file-reading test run as a side effect.
if __name__ == "__main__":
    spamTest()