机器学习day7 机器学习实战朴素贝叶斯分类器的实现

经历了3天,玩梦幻西游手游浪费了不少时间,在这里自我检讨,赶在睡觉之前完成了朴素贝叶斯分类器的实现,总算能睡个好觉,明天和同学出去自习,搞定逻辑回归。

书上赘述了一大堆铺垫,关于公式的理解和朴素贝叶斯的原理上一篇已经有解释。这里只给出代码实现。

因步骤太过繁琐,和书上基本一致。室友在睡觉,早点睡,直接上代码,代码里有注释。

PS:数据集可在 CSDN 下载《机器学习实战》的配套数据,选择第四章,把其中的 email 文件夹放到运行 bayes.py 时的工作目录下(代码按相对路径 email/spam/ 和 email/ham/ 读取文件)。

测试函数和测试集合也含在程序中,不影响程序运行。

#!/usr/bin/python
#coding:utf-8

#bayes.py

from numpy import *
import operator

# Load the toy training data
def loadDataSet():
    """Return the built-in toy corpus together with its class labels.

    Returns:
        A ``(posts, labels)`` pair. ``posts`` is a list of six tokenised
        posts (each a list of words) and ``labels`` is the parallel list
        of classes, where 1 marks an abusive post and 0 a normal one.
    """
    abusive_flags = [0, 1, 0, 1, 0, 1]
    # Each sentence is split on single spaces into the token list of one post.
    sentences = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    posts = [sentence.split() for sentence in sentences]
    return posts, abusive_flags

# Collect the set of distinct words used across all documents
def createWordList(dataset):
    """Build the vocabulary of a corpus.

    Args:
        dataset: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token exactly once. The list is
        produced from a set, so its order is unspecified.
    """
    vocabulary = set()
    for document in dataset:
        vocabulary.update(document)
    return list(vocabulary)

# Build the word vector (bag-of-words model: record how often each word occurs)
def getWord(wordlist, testline):
    """Convert a tokenised document into a bag-of-words count vector.

    Args:
        wordlist: the vocabulary, a list of words; position ``k`` of the
            result corresponds to ``wordlist[k]``.
        testline: iterable of tokens making up one document.

    Returns:
        A list of ints, as long as ``wordlist``, counting how many times
        each vocabulary word occurs in ``testline``. Tokens missing from
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Index the vocabulary once instead of scanning the list for every token.
    # setdefault keeps the first position of a repeated word, which is what
    # list.index() would have returned.
    position = {}
    for idx, word in enumerate(wordlist):
        position.setdefault(word, idx)

    res = [0] * len(wordlist)
    for token in testline:
        idx = position.get(token)
        if idx is None:
            # Parenthesised single-argument print works on Python 2 and 3.
            print('the word %s is not in this dict' % (token,))
        else:
            res[idx] += 1
    return res

# Build the word vector (set-of-words model: only the presence of a word counts)
def getWordSet(wordlist, testline):
    """Convert a tokenised document into a 0/1 presence vector.

    Args:
        wordlist: the vocabulary, a list of words; position ``k`` of the
            result corresponds to ``wordlist[k]``.
        testline: iterable of tokens making up one document.

    Returns:
        A list of ints, as long as ``wordlist``, holding 1 where the
        vocabulary word occurs in ``testline`` at least once and 0
        elsewhere. Tokens missing from the vocabulary are reported on
        stdout and otherwise ignored.
    """
    # Index the vocabulary once instead of scanning the list for every token.
    # setdefault keeps the first position of a repeated word, which is what
    # list.index() would have returned.
    position = {}
    for idx, word in enumerate(wordlist):
        position.setdefault(word, idx)

    res = [0] * len(wordlist)
    for token in testline:
        idx = position.get(token)
        if idx is None:
            # Parenthesised single-argument print works on Python 2 and 3.
            print('the word %s is not in this dict' % (token,))
        else:
            res[idx] = 1
    return res

# Estimate the conditional probabilities
def p(mat, lavels):
    """Train the naive Bayes model from word vectors and their labels.

    Args:
        mat: non-empty sequence of equal-length numeric word vectors, one
            per document.
        lavels: sequence of class labels parallel to ``mat``; 1 means
            abusive, any other value is counted as the non-abusive class.

    Returns:
        A tuple ``(p1type, p1p, p0p)``: the fraction of documents labelled
        1, and the arrays of log conditional word probabilities for
        class 1 and for the other class.
    """
    feature_count = len(mat[0])
    doc_count = len(mat)
    # Smoothing: every word count starts at 1 and each class denominator
    # starts at the number of features, so no probability is ever zero.
    abusive_counts = ones(feature_count)
    normal_counts = ones(feature_count)
    abusive_total = feature_count
    normal_total = feature_count
    abusive_docs = 0.0
    for idx, row in enumerate(mat):
        if lavels[idx] != 1:
            normal_counts += row
            normal_total += sum(row)
            continue
        abusive_docs += 1
        abusive_counts += row
        abusive_total += sum(row)
    # log here must be the array-aware numpy log (math.log cannot take an
    # array); logs are used so that later scoring adds instead of multiplies.
    abusive_log_prob = log(abusive_counts / abusive_total)
    normal_log_prob = log(normal_counts / normal_total)
    abusive_prior = abusive_docs / doc_count
    return abusive_prior, abusive_log_prob, normal_log_prob

# Naive Bayes classification
def bayesType(line, p1type, p1p, p0p):
    """Classify one word vector with a trained naive Bayes model.

    Args:
        line: word vector of the document to classify.
        p1type: prior probability of class 1.
        p1p: array of log conditional word probabilities for class 1.
        p0p: array of log conditional word probabilities for class 0.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    score_one = sum(line * p1p) + log(p1type)
    score_zero = sum(line * p0p) + log(1 - p1type)
    return 1 if score_one > score_zero else 0

# Smoke test on the toy data
def testBayes():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary and the bag-of-words vectors from loadDataSet(),
    trains with p(), then prints the bayesType() result (1 or 0) for each
    of two hard-coded sample posts, one per line.
    """
    dataset, lavels = loadDataSet()
    wordlist = createWordList(dataset)
    docslist = [getWord(wordlist, post) for post in dataset]
    p1type, p1p, p0p = p(docslist, lavels)
    samples = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    )
    for sample in samples:
        vector = getWord(wordlist, sample)
        # Parenthesised single-argument print works on Python 2 and 3.
        print(bayesType(vector, p1type, p1p, p0p))

# Split an e-mail into tokens
def mailType(mail):
    """Tokenise the text of an e-mail.

    Args:
        mail: the e-mail body as a string.

    Returns:
        A list of lower-cased tokens, keeping only those longer than two
        characters.
    """
    import re
    # Split on runs of ONE OR MORE non-word characters. The previous
    # pattern r'\W*' can match the empty string; since Python 3.7
    # re.split() also splits on empty matches, which cut the text into
    # single characters, all of which were then dropped by the length filter.
    maillist = re.split(r'\W+', mail)
    return [tok.lower() for tok in maillist if len(tok) > 2]

# End-to-end test on the e-mail corpus
def fullTestBayes():
    """Train and evaluate the classifier on the e-mail data set.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt ..
    25.txt (label 0) relative to the current working directory, holds out
    10 randomly chosen messages as the test set, trains on the rest and
    prints the error rate on the held-out messages.
    """
    mailslist = []
    lavellist = []
    for i in range(1, 26):
        # Same order as before: the spam message first, then the ham one.
        for folder, label in (('spam', 1), ('ham', 0)):
            # 'with' closes the file handle instead of leaking it.
            with open('email/%s/%d.txt' % (folder, i)) as mail_file:
                mailslist.append(mailType(mail_file.read()))
            lavellist.append(label)
    wordlist = createWordList(mailslist)

    # A real list is needed so that entries can be removed; on Python 3 a
    # bare range object does not support item deletion.
    train = list(range(len(mailslist)))
    testset = []
    for _ in range(10):
        # `random` is the module-level name; this file only star-imports
        # numpy, so it is presumably numpy.random -- confirm if imports change.
        x = int(random.uniform(0, len(train)))
        testset.append(train.pop(x))

    docslist = [getWord(wordlist, mailslist[i]) for i in train]
    lavels = [lavellist[i] for i in train]
    p1type, p1p, p0p = p(docslist, lavels)

    errornum = 0
    for i in testset:
        line = getWord(wordlist, mailslist[i])
        if bayesType(line, p1type, p1p, p0p) != lavellist[i]:
            errornum += 1
    # Single-argument print with %s formatting emits the same text on
    # Python 2 and Python 3.
    print('the error rate is %s' % (float(errornum) / len(testset),))



  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值