# 机器学习实战 - 贝叶斯 (Machine Learning in Action: Naive Bayes)

from numpy import *

def loadDataSet():
    """Return a toy corpus of tokenized posts plus their class labels.

    Returns:
        posts: list of six token lists (one per short forum post).
        labels: parallel list of ints; 1 = abusive text, 0 = normal speech.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posts, labels
#创建包含文档中出现的不重复词的列表:使用set数据类型,将词条列表传给
#set构造函数,set就会去除重复元素。操作符'|'用于求两个集合的并集,
#它也是按位或(OR)操作符;在数学符号上,按位或操作与集合求并操作使用相同记号。
def createVocabList(dataSet):
    """Return a list of every distinct word appearing in the documents."""
    vocab = set()
    for doc in dataSet:
        # Accumulate the union of all token sets (set.update == |=).
        vocab.update(doc)
    return list(vocab)

def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 word-presence vector over vocabList.

    Args:
        vocabList: list of vocabulary words (defines vector positions).
        inputSet: iterable of tokens from one document.

    Returns:
        List of ints, same length as vocabList; 1 where the word occurs.
    """
    vec = [0] * len(vocabList)
    for token in inputSet:
        try:
            # Mark the slot at the word's vocabulary index.
            vec[vocabList.index(token)] = 1
        except ValueError:
            # Token not in the vocabulary — report it, same as the original.
            print("the word:%s is not in my Vocabulary!" % token)
    return vec
    
 #trainMatrix文档矩阵, trainCategory是每一篇文档类别标签所构成的向量  
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier from binary word vectors.

    Args:
        trainMatrix: 2-D sequence of 0/1 word-presence vectors, one row
            per document (as produced by setOfWords2Vec).
        trainCategory: label per row; 1 = abusive, 0 = normal.

    Returns:
        p0Vect: per-word log conditional probabilities log p(w|class 0).
        p1Vect: per-word log conditional probabilities log p(w|class 1).
        pAbusive: prior probability p(class 1).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labeled abusive.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so that a
    # single unseen word cannot zero out the product p(w0|c)p(w1|c)...
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Element-wise add this document's word counts to class 1.
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space: multiplying many small probabilities underflows,
    # so store log p(w|c) and let callers add instead of multiply.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

    
# --- demo: build the 0/1 training matrix and train the classifier ---
postingList, classVec = loadDataSet()
a = createVocabList(postingList)
bbb = setOfWords2Vec(a, postingList[0])

print("postinglist的值是:")
print(postingList)  # the raw tokenized documents
print("a的值是:")
print(a)  # vocabulary: unique words across all documents
# Presence vector of the first document against the vocabulary.
print(bbb)
# One presence vector per document -> the training matrix of 0/1 values.
trainMat = [setOfWords2Vec(a, postinDoc) for postinDoc in postingList]
print("trainMat的值是:")
print(trainMat)
# classVec is [0, 1, 0, 1, 0, 1]
print("classVec的值是:")
print(classVec)
p0V, p1V, pAb = trainNB0(trainMat, classVec)

print(p0V)
print(p1V)
print(pAb)

#朴素贝叶斯分类函数
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1=sum(vec2Classify*p1Vec)+log(pClass1)
    p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1)
    if p1>p0:
        return 1
    else:
        return 0
    pass
def testingNB():
    """End-to-end check: train on the toy corpus, classify two test posts.

    Fixes relative to the original: `postinDOc` typo (NameError), the fused
    `array(...)`/`print(...)` line (SyntaxError), and the mis-indented
    top-level call.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, "classified as:", classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as', classifyNB(thisDoc, p0V, p1V, pAb))

testingNB()

打印

vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
postinglist的值是:
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
a的值是:
['maybe', 'stop', 'cute', 'flea', 'how', 'mr', 'worthless', 'steak', 'licks', 'him', 'garbage', 'has', 'not', 'is', 'stupid', 'problems', 'quit', 'I', 'dalmation', 'posting', 'dog', 'love', 'food', 'ate', 'please', 'to', 'so', 'park', 'buying', 'my', 'take', 'help']
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vocablist的值是:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
trainMat的值是:
[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
classVec的值是:
[0, 1, 0, 1, 0, 1]
numWords= 32
pAbusive的值是:
0.5
p1Num的值是:
[2. 2. 1. 1. 1. 1. 3. 1. 1. 2. 2. 1. 2. 1. 4. 1. 2. 1. 1. 2. 3. 1. 2. 1.
1. 2. 1. 2. 2. 1. 2. 1.]
p1Denom的值是:
21.0
++++++
[-2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244
-1.94591015 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244 -1.65822808 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -1.94591015 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244]
[-3.25809654 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936
-3.25809654 -2.56494936 -2.56494936 -2.15948425 -3.25809654 -2.56494936
-3.25809654 -2.56494936 -3.25809654 -2.56494936 -3.25809654 -2.56494936
-2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936
-2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -1.87180218
-3.25809654 -2.56494936]
[-2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244
-1.94591015 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244 -1.65822808 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -1.94591015 -3.04452244 -2.35137526 -3.04452244
-3.04452244 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244
-2.35137526 -3.04452244]
0.5


['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as 1
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值