经历了3天,玩梦幻西游手游浪费了不少时间,在这里自我检讨,赶在睡觉之前完成了朴素贝叶斯分类器的实现,总算能睡个好觉,明天和同学出去自习,搞定逻辑回归。
书上赘述了一大堆铺垫,关于公式的理解和朴素贝叶斯的原理上一篇已经有解释。这里只给出代码实现。
实现步骤较为繁琐,与书上基本一致。室友在睡觉,我也要早点睡,这里直接给出代码,关键步骤已在代码注释中说明。
ps:数据集可在 CSDN 下载《机器学习实战》的随书数据,选择第四章的数据,把其中的 email 文件夹放到 Python 脚本所在目录中。
测试函数和测试集合也含在程序中,不影响程序运行。
#!/usr/bin/python
#coding:utf-8
#bayes.py
from numpy import *
import operator
# Build the toy training corpus used by the unit tests.
def loadDataSet():
    """Return a toy corpus of tokenized forum posts and their class labels.

    Returns:
        (posts, labels): posts is a list of token lists, one per document;
        labels[i] is 1 when posts[i] contains abusive language, 0 otherwise.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels
# Collect the vocabulary: every distinct word that appears in the corpus.
def createWordList(dataset):
    """Build the vocabulary of a corpus.

    Args:
        dataset: iterable of token lists, one per document.
    Returns:
        list of the unique tokens, in arbitrary (set-iteration) order.
    """
    vocab = set()
    vocab.update(*dataset)
    return list(vocab)
# Vectorize a document (bag-of-words model: record word occurrence counts).
def getWord(wordlist, testline):
    """Convert a document into a bag-of-words count vector.

    Args:
        wordlist: the vocabulary, a list of unique words.
        testline: list of tokens making up the document.
    Returns:
        list of ints the same length as wordlist; entry j is the number of
        times wordlist[j] occurs in testline.  Out-of-vocabulary tokens are
        reported and otherwise ignored.
    """
    # Build the word -> index map once: O(V + n) overall, instead of the
    # O(V * n) cost of calling wordlist.index() for every token.
    index = {word: j for j, word in enumerate(wordlist)}
    res = [0] * len(wordlist)
    for tok in testline:
        if tok in index:
            res[index[tok]] += 1
        else:
            print('the word %s is not in this dict' % tok)
    return res
# Vectorize a document (set-of-words model: word presence as a binary feature).
def getWordSet(wordlist, testline):
    """Convert a document into a set-of-words presence vector.

    Args:
        wordlist: the vocabulary, a list of unique words.
        testline: list of tokens making up the document.
    Returns:
        list of ints the same length as wordlist; entry j is 1 if
        wordlist[j] occurs in testline at all, else 0.  Out-of-vocabulary
        tokens are reported and otherwise ignored.
    """
    # Build the word -> index map once: O(V + n) overall, instead of the
    # O(V * n) cost of calling wordlist.index() for every token.
    index = {word: j for j, word in enumerate(wordlist)}
    res = [0] * len(wordlist)
    for tok in testline:
        if tok in index:
            res[index[tok]] = 1
        else:
            print('the word %s is not in this dict' % tok)
    return res
# Estimate the naive Bayes parameters (class prior and word log-likelihoods).
def p(mat, lavels):
    """Train the naive Bayes model from a vectorized corpus.

    Args:
        mat: list of bag-of-words count vectors, all of length V.
        lavels: class label per row; 1 = abusive/spam, 0 = normal.
    Returns:
        (p1type, p1p, p0p): p1type is the prior P(class=1); p1p and p0p are
        length-V numpy arrays of log P(word | class) for class 1 and class 0.
    """
    n_docs = len(mat)
    n_feats = len(mat[0])
    # Smoothing: every word count starts at 1 and each class total starts at
    # the vocabulary size, so no likelihood is ever zero.
    counts1 = ones(n_feats)
    counts0 = ones(n_feats)
    total1 = n_feats
    total0 = n_feats
    n_class1 = 0.0
    for row, label in zip(mat, lavels):
        if label == 1:
            n_class1 += 1
            counts1 += row
            total1 += sum(row)
        else:
            counts0 += row
            total0 += sum(row)
    # numpy's log applies elementwise; math.log would reject arrays.
    return n_class1 / n_docs, log(counts1 / total1), log(counts0 / total0)
# Classify one bag-of-words vector with a trained naive Bayes model.
def bayesType(line, p1type, p1p, p0p):
    """Return 1 if `line` is classified as class 1 (abusive/spam), else 0.

    Each class is scored by its log posterior,
    log P(c) + sum_w count_w * log P(w | c), and the larger score wins.
    """
    score1 = sum(line * p1p) + log(p1type)
    score0 = sum(line * p0p) + log(1 - p1type)
    return 1 if score1 > score0 else 0
# Smoke test: train on the toy corpus and classify two hand-picked samples.
def testBayes():
    """Train on loadDataSet() and print the predicted class of two samples.

    Expected output: 0 for ['love', 'my', 'dalmation'] (friendly),
    then 1 for ['stupid', 'garbage'] (abusive).
    """
    dataset, labels = loadDataSet()
    vocab = createWordList(dataset)
    train_matrix = [getWord(vocab, doc) for doc in dataset]
    p1type, p1p, p0p = p(train_matrix, labels)
    for sample in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        print(bayesType(getWord(vocab, sample), p1type, p1p, p0p))
# Tokenize the raw text of one e-mail.
def mailType(mail):
    """Split raw e-mail text into normalized tokens.

    Args:
        mail: the raw text of one e-mail.
    Returns:
        list of lower-cased tokens longer than two characters (short noise
        tokens like 'ab' or stray letters are dropped).
    """
    import re
    # r'\W+' (one or more non-word chars) instead of the book's r'\W*':
    # a pattern that can match the empty string splits between every
    # character — yielding single-letter junk — and is an error on
    # modern Python versions of re.split.
    tokens = re.split(r'\W+', mail)
    return [tok.lower() for tok in tokens if len(tok) > 2]
# Full e-mail test: hold-out validation of the spam classifier.
def fullTestBayes():
    """Train on 40 randomly chosen e-mails and print the error rate on the
    remaining 10 (hold-out cross validation).

    Expects 25 spam messages at email/spam/1.txt..25.txt and 25 ham
    messages at email/ham/1.txt..25.txt relative to the working directory.
    """
    mailslist = []
    lavellist = []
    # Load the corpus: label 1 = spam, label 0 = ham.
    for i in range(1, 26):
        wlist = mailType(open('email/spam/%d.txt'%i).read())
        mailslist.append(wlist)
        lavellist.append(1)
        wlist = mailType(open('email/ham/%d.txt'%i).read())
        mailslist.append(wlist)
        lavellist.append(0)
    wordlist = createWordList(mailslist)
    # NOTE(review): Python 2 only — on Python 3 range() is not a list, so
    # `del (train[x])` below would raise TypeError.
    train = range(50)
    testset = []
    # Randomly move 10 of the 50 document indices into the held-out test set.
    # `random` is numpy.random, pulled in by the module's `from numpy import *`.
    for i in range(10):
        x = int(random.uniform(0, len(train)))
        testset.append(train[x])
        del (train[x])
    docslist = []
    lavels = []
    # Vectorize the 40 remaining training documents (bag-of-words counts).
    for i in train:
        docslist.append(getWord(wordlist, mailslist[i]))
        lavels.append(lavellist[i])
    p1type, p1p, p0p = p(docslist, lavels)
    errornum = 0
    # Classify each held-out document and count the misclassifications.
    for i in testset:
        line = getWord(wordlist, mailslist[i])
        if bayesType(line, p1type, p1p, p0p) != lavellist[i]:
            errornum += 1
    print 'the error rate is', float(errornum) / len(testset)