def setOfWord2Vec(vocablist, inputSet):
    """Convert a document (list of words) into a set-of-words vector.

    Returns a list the same length as ``vocablist`` with 1 at the position
    of every vocabulary word that appears in ``inputSet`` and 0 elsewhere,
    so documents become fixed-length numeric vectors for the classifier.

    Words not in the vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            # list.index() returns the position of the FIRST match,
            # similar to str.find for strings.
            returnVec[vocablist.index(word)] = 1
        else:
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
# Example classifier output (notebook cell result, kept for reference):
# ['love', 'my', 'dalmation'] classified as : 0
# ['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    """Tokenize raw text: split on non-word characters, lowercase,
    and keep only tokens longer than 2 characters.

    Returns an empty list for ``None`` input (previously returned None
    implicitly; both are falsy, so callers are unaffected).
    """
    import re
    if bigString is None:
        return []
    # \W+ instead of \W*: re.split needs a pattern that cannot match
    # the empty string (zero-width split is an error in modern Python).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate a naive Bayes spam classifier on the email corpus.

    Loads 26 spam and 26 ham emails (a 0.txt was added to each directory so
    the files line up with range(26)), holds out 10 random documents as a
    test set, trains on the remaining 40, and prints the test error rate.

    Relies on module-level helpers: textParse, createVocabList,
    setOfWord2Vec, trainNB0, classifyNB, and numpy as np.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(26):
        # read() loads the whole file as a single string.
        # NOTE(review): paths are Windows-style ('\\'); the corpus is
        # assumed to sit next to this script — confirm on other platforms.
        with open('email\\spam\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # 1 = spam
        # ham\6.txt originally contained a mojibake 'are' that broke decoding.
        with open('email\\ham\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 0 = ham
    vocabList = createVocabList(docList)
    # list() is required: in Python 3 range() is a lazy object and
    # del by index below needs a real list.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        # Pick a random document index for the hold-out test set.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
# Guard the script entry point so importing this module does not
# immediately run the experiment (which reads files and prints).
if __name__ == "__main__":
    spamTest()
# Example run output (notebook cell result, kept for reference):
# the error rate is :  0.1
# Scratch REPL experiments probing GBK encoding behaviour (notebook cells).
# These are no-op expressions at module level; their values are discarded.
'ab'.encode('gbk')
# '\0' is a NUL character followed by the literal text 'xab' —
# presumably an attempt to write the byte escape '\xab'; TODO confirm intent.
'\0xab'.encode('gbk')
# Captured notebook output of the line above: NUL byte + b'xab'.
b'\x00xab'
# Before the fix, chardet reported ham\6.txt with confidence 0.73 as a
# Windows codepage (the comment said "windows-1225" — presumably a typo
# for windows-1252); mojibake in the middle of the file caused the error.
import chardet

# Open in binary mode so chardet sees the raw bytes; print the detection
# result (a bare expression's value is discarded outside a notebook).
with open('email\\ham\\6.txt', 'rb') as f:
    print(chardet.detect(f.read()))