Machine Learning in Action: Implementing a Naive Bayes Classifier

"""
词表到向量的转换函数
"""


def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec


def createVocabList(dataSet):  # build a list of the unique words across all documents
    vocabSet = set()  # empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)


def setOfWord2Vec(vocabList, inputSet):  # compare the input set against the vocab list to build a word vector
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # set matched positions to 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

import importlib
importlib.reload(pusu_bayes)
Out[14]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
listOPosts, listClasses = pusu_bayes.loadDataSet()
myVocabList = pusu_bayes.createVocabList(listOPosts)
pusu_bayes.setOfWord2Vec(myVocabList, listOPosts[0])
Out[17]: 
[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
myVocabList
Out[18]: 
['has', 'my', 'to', 'dog', 'how', 'maybe', 'love', 'park', 'problems', 'stop',
 'please', 'flea', 'take', 'is', 'mr', 'not', 'garbage', 'food', 'help',
 'worthless', 'buying', 'I', 'so', 'him', 'posting', 'ate', 'licks',
 'dalmation', 'steak', 'stupid', 'cute', 'quit']
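(The ordering of myVocabList comes from Python's set iteration order, so it can vary between runs; the word vectors shown here correspond to this particular ordering.)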
pusu_bayes.setOfWord2Vec(myVocabList, listOPosts[3])
Out[20]: 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]
def trainNB0(trainMatrix, trainCategory):  # estimate per-word probabilities for each class
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # probability that a document is abusive
    p0Num = zeros(numWords); p1Num = zeros(numWords)   # initialize numerators and denominators
    p0Denom = 0.0; p1Denom = 0.0
    for i in range(numTrainDocs):  # accumulate word-count vectors per class
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    print(p1Num, p0Num, p1Denom, p0Denom)  # debug output
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect, p1Vect, pAbusive  # two probability vectors and one prior
importlib.reload(pusu_bayes)
Out[36]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
listOPosts, listClasses = pusu_bayes.loadDataSet()
myVocabList = pusu_bayes.createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
   ...:     trainMat.append(pusu_bayes.setOfWord2Vec(myVocabList, postinDoc))
   ...:     
p0V, p1V, pAb = pusu_bayes.trainNB0(trainMat, listClasses)
[0. 0. 1. 2. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 2. 1. 0. 0. 1.
 1. 0. 0. 0. 0. 3. 0. 1.] [1. 3. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 2.
 0. 1. 1. 1. 1. 0. 1. 0.] 19.0 24.0
pAb
Out[42]: 0.5
p0V
Out[43]: 
array([0.04166667, 0.125     , 0.04166667, 0.04166667, 0.04166667,
       0.        , 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.        , 0.        , 0.        , 0.04166667, 0.        ,
       0.        , 0.04166667, 0.04166667, 0.08333333, 0.        ,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.        ])
p1V
Out[44]: 
array([0.        , 0.        , 0.05263158, 0.10526316, 0.        ,
       0.05263158, 0.        , 0.05263158, 0.        , 0.05263158,
       0.        , 0.        , 0.05263158, 0.        , 0.        ,
       0.05263158, 0.05263158, 0.05263158, 0.        , 0.10526316,
       0.05263158, 0.        , 0.        , 0.05263158, 0.05263158,
       0.        , 0.        , 0.        , 0.        , 0.15789474,
       0.        , 0.05263158])
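A caveat before classifying: the trainNB0 above returns raw frequency ratios, many of which are exactly zero (see Out[43] and Out[44]), while classifyNB below adds log(pClass1) to sum(vec2Classify * p1Vec), which presumes the vectors already hold log-probabilities; raw probabilities would also underflow once many small factors are multiplied. The fix used in Machine Learning in Action is Laplace smoothing plus a log transform:

def trainNB0(trainMatrix, trainCategory):  # smoothed, log-space version
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)  # Laplace smoothing: start every word count at 1
    p0Denom = 2.0; p1Denom = 2.0                    # and every denominator at 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)  # log-probabilities avoid underflow and zero factors
    p0Vect = log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive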
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):  # naive Bayes classification function
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise product summed: picks out the log-probabilities of the words present
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
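This decision rule comes from Bayes' rule under the naive independence assumption: p(c|w) ∝ p(c) ∏ᵢ p(wᵢ|c), and taking logs turns the product into log p(c) + Σᵢ log p(wᵢ|c). The evidence p(w) is identical for both classes, so it can be dropped when comparing p0 and p1.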


def testingNB():
    listOPosts, listClasses = loadDataSet()  # toy documents and their labels
    myVocabList = createVocabList(listOPosts)  # list of unique words across the documents
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry1 = ['love', 'my', 'dalmation']
    thisDoc1 = array(setOfWord2Vec(myVocabList, testEntry1))
    print(testEntry1, 'classified as:', classifyNB(thisDoc1, p0V, p1V, pAb))
    testEntry2 = ['stupid', 'garbage']
    thisDoc2 = array(setOfWord2Vec(myVocabList, testEntry2))
    print(testEntry2, 'classified as:', classifyNB(thisDoc2, p0V, p1V, pAb))
importlib.reload(pusu_bayes)
Out[63]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
pusu_bayes.testingNB()
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1
"""
文件解析及完整的垃圾邮件测试函数
"""


def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split the text on any run of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
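A quick check of the parser (the sample sentence below is my own illustration, not from the original post):

textParse('This book is the best book on Python or M.L. I have ever laid eyes upon.')
# -> ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']
# short tokens such as 'is', 'on', 'M', 'L' are dropped by the len(tok) > 2 filter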


def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())  # parse each spam document into a word list
        docList.append(wordList)  # collect the per-document word lists
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())  # parse each ham document into a word list
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # list of unique words
    trainingSet = list(range(50))  # 50 emails in total
    testSet = []
    for i in range(10):  # randomly hold out 10 emails as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove the test emails from the training set
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
importlib.reload(pusu_bayes)
Out[89]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
pusu_bayes.spamTest()
the error rate is: 0.1
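Because the 10 test emails are drawn at random, the error rate varies from run to run, so it is worth averaging over several trials. A minimal sketch, assuming spamTest is changed to return float(errorCount) / len(testSet) instead of only printing it (multiSpamTest is a hypothetical helper, not part of the original code):

def multiSpamTest(numTrials=10):
    total = 0.0
    for _ in range(numTrials):
        total += spamTest()  # assumes spamTest() returns its error rate
    print('average error rate over %d runs:' % numTrials, total / numTrials)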