Machine Learning in Action (3) -- Naive Bayes (Python 3.5)

import numpy as np
import re

def loadDataSet():  # sample training posts
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec

def createVocabList(dataSet):
    # Build a list of the unique words that appear across all documents
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)

## Set-of-words model: each word is recorded at most once
def setOfWords2Vec(vocabList, inputSet):  # arguments: the vocabulary list and a single document
    returnVec = [0] * len(vocabList)  # create a vector the same length as the vocabulary
    for word in inputSet:  # for each word in the document, set its entry to 1 if it appears in the vocabulary
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:   
            print('the word: %s is not in my Vocabulary!' % word)
    return returnVec
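
A quick interactive check of the two helpers above (a minimal usage sketch; the variable names are just illustrative):

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)  # the unique words from all six posts; order varies because it comes from a set
print(setOfWords2Vec(myVocabList, listOPosts[0]))  # a 0/1 vector with one entry per vocabulary word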

def trainNB0(trainMatrix, trainCategory):  # arguments: matrix of document word vectors and the vector of class labels
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory)/float(numTrainDocs)  # P(class 1) over all documents; P(class 0) = 1 - P(class 1)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)  # initialize every word count to 1 (Laplace smoothing) so no factor in the product is 0
    p0Denom = 2.0
    p1Denom = 2.0  # denominators start at 2 to match the smoothed counts
    for i in range(numTrainDocs):  # iterate over all documents
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]  # vector addition accumulates per-word counts for class 1
            p1Denom += np.sum(trainMatrix[i])  # total number of words in class 1
        else:
            p0Num += trainMatrix[i]  # vector addition accumulates per-word counts for class 0
            p0Denom += np.sum(trainMatrix[i])  # total number of words in class 0
    p1Vect = np.log(p1Num/p1Denom)  # the per-word probabilities are small, so their product underflows (rounds to 0);
    p0Vect = np.log(p0Num/p0Denom)  # taking logs avoids the underflow
    return p0Vect, p1Vect, pAbusive
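
With the smoothing above, each conditional probability is estimated as (count of word i in class c + 1) / (total words in class c + 2). The underflow the comments mention is easy to reproduce; a minimal demonstration (not part of the original code) of why the function returns log probabilities:

probs = np.full(1000, 0.01)   # 1000 hypothetical word probabilities of 0.01 each
print(np.prod(probs))         # 0.0 -- the raw product underflows
print(np.sum(np.log(probs)))  # about -4605.2 -- the log-sum stays representable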

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # arguments: the vector to classify and the three probabilities computed by trainNB0
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)  # log conditional probability of the vector (a sum, since the probabilities are logged) plus the log class prior
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:  # return the label with the larger probability
        return 1
    else:
        return 0
    
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))  
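
Calling the test routine should reproduce the book's example output (the two labels below are the expected results; the printed vocabulary order may differ between runs):

if __name__ == '__main__':
    testingNB()
    # Expected:
    # ['love', 'my', 'dalmation'] classified as: 0
    # ['stupid', 'garbage'] classified as: 1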

## Bag-of-words model: each word can occur multiple times
def bagOfWords2VecMN(vocabList, inputSet):  # arguments: the vocabulary list and a single document
    returnVec = [0] * len(vocabList)  # create a vector the same length as the vocabulary
    for word in inputSet:  # for each word in the document, increment its entry if it appears in the vocabulary
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec   
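
The difference from the set-of-words model only shows up when a word repeats in a document, as in this small illustration (the mini vocabulary below is hypothetical):

vocab = ['dog', 'stupid', 'my']      # hypothetical three-word vocabulary
doc = ['stupid', 'dog', 'stupid']    # 'stupid' appears twice
print(setOfWords2Vec(vocab, doc))    # [1, 1, 0] -- presence only
print(bagOfWords2VecMN(vocab, doc))  # [1, 2, 0] -- occurrence counts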
        
def textParse(bigString):
    listOfTokens = re.split(r'\W+', bigString)  # split the string on runs of non-word characters (anything but letters, digits, underscore)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # lowercase each token and keep only tokens longer than two characters
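
For example, on a short sample string the parser splits on non-word characters, lowercases, and drops short tokens:

mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
print(textParse(mySent))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']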
    
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # load 25 spam and 25 ham emails and parse each into a word list
        wordList = textParse(open(r'F:\算法学习\机器学习书籍\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(r'F:\算法学习\机器学习书籍\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # hold-out cross-validation
    for i in range(10):  # out of the 50 documents, randomly pick 10 for the test set
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # build a word vector for each training document from the vocabulary
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1  # increment the error count on a misclassification
            # raise the print threshold so IPython prints the full array instead of eliding it with ...
#            np.set_printoptions(threshold=1e6)
            print('classification error', docList[docIndex])  # print the misclassified document
    print('the error rate is: ', float(errorCount)/len(testSet))  # print the test-set error rate
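
Because the ten test documents are drawn at random, the error rate varies from run to run; averaging over several runs gives a more stable estimate. A minimal sketch, assuming spamTest is changed to return its error rate instead of only printing it:

numRuns = 10
errorRates = [spamTest() for _ in range(numRuns)]  # assumes spamTest ends with: return float(errorCount)/len(testSet)
print('average error rate over %d runs: %f' % (numRuns, sum(errorRates)/numRuns))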
