Machine Learning in Action: Naive Bayes Notes

import numpy as np

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec
# Build a vocabulary list of all unique words across the documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
print(type(myVocabList))
print(myVocabList)
<class 'list'>
['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
# a quick check: set() drops duplicates, which is what createVocabList relies on
a = set([1,2,3,1])
a
{1, 2, 3}
# Convert a document into a word vector so the counts can be computed numerically
def setOfWord2Vec(vocablist, inputSet):
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            # list's index() method returns the position of the first match, similar to the string find() method
            returnVec[vocablist.index(word)] = 1
        else: 
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
print(setOfWord2Vec(myVocabList, listOfPosts[3]))
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
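The set-of-words model above only records whether a word appears at all. Machine Learning in Action later introduces a bag-of-words variant that counts occurrences instead; a minimal sketch along those lines:

def bagOfWords2VecMN(vocabList, inputSet):
    # like setOfWord2Vec, but += 1 so repeated words are counted
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec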
# quick demo of index()/find() (note: the name str shadows the built-in type here)
str = 'abca'
lis = ['a','b','a']
print(str.index('a'))
# index() accepts a start position
print(str.index('a',1))
# find() is like index() but returns -1 instead of raising on a miss
print(str.find('a'))
print(str.find('a',2))
#rfind() searches from the right
print(str.rfind('a'))
#rindex()
print(str.rindex('a'))
#list has no find() method, only index()
print(lis.find('a'))
0
3
0
3
3
3



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-26-463ca8d5b89f> in <module>()
      8 print(str.rindex('a'))
      9 #list没有find方法
---> 10 print(lis.find('a'))


AttributeError: 'list' object has no attribute 'find'
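If find()-style behavior is wanted for a list, a small hypothetical helper (not from the book) can wrap index() in a try/except:

def list_find(lst, item, start=0):
    # mimic str.find for lists: return -1 instead of raising ValueError on a miss
    try:
        return lst.index(item, start)
    except ValueError:
        return -1

print(list_find(lis, 'a'))  # 0
print(list_find(lis, 'c'))  # -1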
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    # trainMatrix arrives as a plain Python list of lists here, not an np.array, so use len() rather than np.shape
    numWords = len(trainMatrix[0])
    # Compute p(c1): the prior probability that a document is abusive
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Per-word count accumulators. The naive initialization would be all zeros:
#     p0Num = np.zeros(numWords)
#     p1Num = np.zeros(numWords)
#     p0Denom = 0.0
#     p1Denom = 0.0
    # but then any unseen word would contribute a zero probability and wipe out
    # the whole product, so counts start at 1 and denominators at 2 (a simple
    # Laplace-style smoothing). Underflow is a separate issue, handled by the
    # logs in classifyNB below.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # accumulate per-word counts for abusive documents
            p1Num += trainMatrix[i]
            # running total of all words seen in abusive documents
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # per-word frequencies within each class, e.g. p(w1|c1)
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect, p1Vect, pAbusive
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
p0V,p1V,pAb = trainNB0(trainMat, listClasses)
print(trainMat)
print(p0V)
print(p1V)
print(pAb)
[[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
[ 0.03846154  0.07692308  0.15384615  0.03846154  0.07692308  0.03846154
  0.07692308  0.07692308  0.07692308  0.07692308  0.03846154  0.07692308
  0.07692308  0.07692308  0.03846154  0.07692308  0.03846154  0.07692308
  0.07692308  0.11538462  0.07692308  0.07692308  0.03846154  0.07692308
  0.07692308  0.07692308  0.07692308  0.03846154  0.03846154  0.03846154
  0.03846154  0.07692308]
[ 0.0952381   0.04761905  0.04761905  0.0952381   0.04761905  0.0952381
  0.04761905  0.04761905  0.04761905  0.04761905  0.0952381   0.04761905
  0.04761905  0.04761905  0.0952381   0.04761905  0.0952381   0.04761905
  0.04761905  0.0952381   0.04761905  0.04761905  0.19047619  0.0952381
  0.14285714  0.04761905  0.04761905  0.14285714  0.0952381   0.0952381
  0.0952381   0.0952381 ]
0.5
pAb
0.5
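Each entry of p1V is the smoothed frequency of a word within the abusive class:

$$ p(w_j \mid c_1) = \frac{n_j + 1}{N_1 + 2} $$

where n_j is the number of times word j appears in abusive documents and N_1 = 8 + 5 + 6 = 19 is the total word count of the three abusive posts. As a check, 'stupid' occurs once in each abusive post, giving (3 + 1) / (19 + 2) = 4/21 ≈ 0.1905, the largest entry in p1V above.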
# The naive Bayes classification function
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    # work in log space: a product of many small probabilities underflows, a sum of logs does not
    p1Vec = np.log(p1Vect)
    p0Vec = np.log(p0Vect)
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
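classifyNB is the log-space form of the naive Bayes decision rule. Under the conditional independence ("naive") assumption, for a word vector w,

$$ \log p(c_i \mid \mathbf{w}) \;\propto\; \log p(c_i) + \sum_j w_j \log p(w_j \mid c_i) $$

and the class with the larger score wins; the shared denominator p(w) is identical for both classes, so it can be dropped.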
def testingNB():
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = []
    for postInDoc in listOfPosts:
        trainMat.append(setOfWord2Vec(myVocabList, postInDoc))
    p0V,p1V,pAb = trainNB0(trainMat, listClasses)
    testEntry1 = ['love', 'my', 'dalmation']
    thisDoc1 = np.array(setOfWord2Vec(myVocabList, testEntry1))
    print('{0} classified as : {1}'.format(testEntry1,classifyNB(thisDoc1, p0V, p1V, pAb)))
    testEntry2 = ['stupid', 'garbage']
    thisDoc2 = np.array(setOfWord2Vec(myVocabList, testEntry2))
    print('{0} classified as : {1}'.format(testEntry2,classifyNB(thisDoc2, p0V, p1V, pAb)))
testingNB()
['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    import re
    # the split pattern must not match the empty string, so the book's \W* was changed to \W+
    listOfTokens = re.split(r'\W+', bigString) if bigString else []
    # keep only tokens longer than two characters, lower-cased
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
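A quick sanity check on an arbitrary sample sentence (illustrative input, not from the email corpus):

print(textParse('This book is the best book on Python or M.L. I have ever laid eyes upon.'))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']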
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(26):
        # read() loads the whole file as a single string
        # print(i)  # uncomment to see which file fails to parse
        # if opened with 'rb' instead, the bytes would need decoding (e.g. 'gbk')
        # a 0.txt was added under each directory so the indices match range(26)
        wordList = textParse(open('email\\spam\\%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # ham\6.txt was broken: an 'are' in the middle was garbled bytes
        wordList = textParse(open('email\\ham\\%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    # wrap in list(): in Python 3, range() returns a lazy range object,
    # which does not support del
    trainingSet = list(range(50))
    testSet = []

    # hold out 10 documents at random for testing
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # testSet stores only the document indices
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
spamTest()
the error rate is :  0.1
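Because the 10 test documents are drawn at random, a single run is noisy. The usual fix is to average the error rate over several random splits; a minimal sketch, assuming spamTest is modified to end with return float(errorCount) / len(testSet) instead of only printing it:

def averageSpamTest(numTrials=10):
    # assumes spamTest() returns its error rate (see note above)
    rates = [spamTest() for _ in range(numTrials)]
    print('mean error rate over {0} runs: {1:.3f}'.format(numTrials, sum(rates) / numTrials))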
# quick gbk encoding checks; note that '\0xab' is '\0' followed by 'xab',
# not the single byte 0xab
'ab'.encode('gbk')
'\0xab'.encode('gbk')
b'\x00xab'
# before the fix, chardet reported ham\6.txt at confidence 0.73 (windows-1252 encoding?); garbled bytes in the middle caused the error
import chardet
f = open('email\\ham\\6.txt','rb')
chardet.detect(f.read())
{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
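If a file's encoding is uncertain, one illustrative workaround (not from the book) is to detect it first and then decode leniently:

raw = open('email\\ham\\6.txt', 'rb').read()
guess = chardet.detect(raw)
# fall back to latin-1, which can decode any byte sequence
text = raw.decode(guess['encoding'] or 'latin-1', errors='replace')
wordList = textParse(text)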
print(list(range(50)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]