朴素贝叶斯有两个假设:
1.“朴素”:特征(单词)之间相互独立,即一个词是否出现不受其他词的影响,当然这个假设不太合理
2.“权重相同”:每个特征等同重要,这个假设也不太合理
尽管如此朴素贝叶斯的效果可观
朴素贝叶斯有两种实现方式
1.基于贝努利模型:不考虑词在文档中出现的次数,只考虑出不出现,相当于假设词是等权重的
2.基于多项式模型:考虑词在文档中出现的次数(词袋)也就是词不是等权重的,即不考虑第二个假设
基本模型
1.等权重下的
import numpy as np
##对社区留言本的留言做二分类:侮辱类留言标记为“1”,非侮辱类标记为“0”
##载入训练文本,需要自己根据实际更改,样本在1000以上时可以得到很好的分布
def DataLoader():
    """Return a toy message-board dataset and its labels (1 = abusive, 0 = normal)."""
    # Each sentence is split into a token list; labels align index-by-index.
    sentences = [
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    ]
    textlist = [sentence.split() for sentence in sentences]
    classlabel = [0, 1, 0, 1, 0, 1]
    return textlist, classlabel
##建立无重复的词汇表
def createVocabList(textlist):
    """Collect every distinct token across all documents into a list (no duplicates).

    Order of the returned list is unspecified (set iteration order).
    """
    vocab = set()
    for doc in textlist:
        vocab.update(doc)
    return list(vocab)
##将输入一个文档的词转为类似one-hot向量的向量表示,当出现词汇表里未出现的单词发出提示
def word2Vec(vocabset, textlist_document):
    """Encode one document as a set-of-words (0/1) vector over *vocabset*.

    vocabset          -- vocabulary as a list; vector slots follow its order
    textlist_document -- list of tokens making up one document
    Returns a list of len(vocabset) ints: 1 if the word occurs at all, else 0.
    Out-of-vocabulary words are reported and skipped.
    """
    # Build word -> slot once: calling list.index() inside the loop is O(V)
    # per token, i.e. O(V * N) for the whole document.
    position = {}
    for i, word in enumerate(vocabset):
        position.setdefault(word, i)  # first occurrence wins, like list.index
    finalVec = [0] * len(vocabset)
    for word in textlist_document:
        if word in position:
            finalVec[position[word]] = 1
        else:
            print('the word:{} is not in the vocabset'.format(word))
    return finalVec
##计算条件概率,训练算法
##p(ci|w)=p(w|ci)p(ci)/p(w) w是向量,由词汇表的单词组成的向量,由于假设特征间独立所以有p(w|ci)=p(w1|ci)p(w2|ci)...p(wn|ci)
##最后只要比较p(w|ci)p(ci)大小即可,p(c1)得出就可得p(c0)=1-p(c1)
def NBtrain(trainMatrix, trainLabel):
    """Estimate per-class log word likelihoods and the class-1 prior.

    trainMatrix -- one indicator/count vector per document (all same length)
    trainLabel  -- 0/1 label per document
    Returns (p0Vect, p1Vect, pc1) where pXVect[i] = log p(word_i | class X).
    Laplace smoothing (word counts start at 1, denominators at 2) keeps every
    probability non-zero, so taking logs later never underflows or sees 0.
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pc1 = np.sum(trainLabel) / float(numDocs)
    abusiveCounts, normalCounts = np.ones(numWords), np.ones(numWords)
    abusiveTotal, normalTotal = 2.0, 2.0
    for vec, label in zip(trainMatrix, trainLabel):
        if label == 1:
            abusiveCounts += vec
            abusiveTotal += np.sum(vec)
        else:
            normalCounts += vec
            normalTotal += np.sum(vec)
    p1Vect = np.log(abusiveCounts / abusiveTotal)
    p0Vect = np.log(normalCounts / normalTotal)
    return p0Vect, p1Vect, pc1
def NBclassify(testVec, p0Vect, p1Vect, pc1):
    """Return 1 if class 1's log-posterior strictly beats class 0's, else 0.

    testVec -- numpy vector for the document; pXVect -- log likelihoods;
    pc1 -- prior p(class=1). Comparing log p(w|c) + log p(c) is equivalent to
    comparing the posteriors since the evidence p(w) is common to both.
    """
    logp1 = np.log(pc1) + np.sum(testVec * p1Vect)
    logp0 = np.log(1.0 - pc1) + np.sum(testVec * p0Vect)
    return 1 if logp1 > logp0 else 0
if __name__ == '__main__':
    # Train the set-of-words model on the toy data, then classify one message.
    textlist, classlabel = DataLoader()
    myvocaset = createVocabList(textlist)
    # Encode every training document as a 0/1 vector over the vocabulary.
    trainMatrix = [word2Vec(myvocaset, document) for document in textlist]
    p0Vect, p1Vect, pc1 = NBtrain(trainMatrix, classlabel)
    test = ['stupid', 'my', 'garbage']
    testVec = np.array(word2Vec(myvocaset, test))
    print('{} is classified as : {}'.format(test, NBclassify(testVec, p0Vect, p1Vect, pc1)))
2.不等权重下的,使用词袋
import numpy as np
##唯一区别在这个使用了词袋,即假设词是不等权重的
##对社区留言本的留言做二分类:侮辱类留言标记为“1”,非侮辱类标记为“0”
##载入训练文本,需要自己根据实际更改,样本在1000以上时可以得到很好的分布
def DataLoader():
    """Toy message-board posts plus labels: 1 = abusive, 0 = normal."""
    textlist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classlabel = [0, 1, 0, 1, 0, 1]
    return textlist, classlabel
##建立无重复的词汇表
def createVocabList(textlist):
    """Return the deduplicated vocabulary across all documents (unordered list)."""
    # set.union accepts any number of iterables at once; with no documents it
    # degenerates to the empty set, matching the loop-based version.
    return list(set().union(*textlist))
##将输入的每个文档转为词袋
def bagofwords2Vec(vocabset, textlist_document):
    """Encode one document as a bag-of-words count vector over *vocabset*.

    Unlike the set-of-words model, each slot holds HOW MANY times the word
    occurs, so frequent words carry more weight. Out-of-vocabulary words are
    reported and skipped.
    """
    # Build word -> slot once: list.index() inside the loop is O(V) per token.
    position = {}
    for i, word in enumerate(vocabset):
        position.setdefault(word, i)  # first occurrence wins, like list.index
    finalVec = [0] * len(vocabset)
    for word in textlist_document:
        if word in position:
            finalVec[position[word]] += 1
        else:
            print('the word:{} is not in the vocabset'.format(word))
    return finalVec
##计算条件概率,训练算法
##p(ci|w)=p(w|ci)p(ci)/p(w) w是向量,由词汇表的单词组成的向量,由于假设特征间独立所以有p(w|ci)=p(w1|ci)p(w2|ci)...p(wn|ci)
##最后只要比较p(w|ci)p(ci)大小即可,p(c1)得出就可得p(c0)=1-p(c1)
##输入的是nparray格式
def NBtrain(trainMatrix, trainLabel):
    """Train a Laplace-smoothed naive Bayes model.

    trainMatrix -- one count/indicator vector per document (numpy-compatible)
    trainLabel  -- 0/1 class per document
    Returns (p0Vect, p1Vect, pc1): per-class log word likelihoods and the
    prior p(class=1). Counts start at 1 and denominators at 2 so that no
    probability is zero and the later log-sum never underflows.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pc1 = np.sum(trainLabel) / float(docCount)
    num = [np.ones(vocabSize), np.ones(vocabSize)]  # smoothed word counts per class
    den = [2.0, 2.0]                                # smoothed token totals per class
    for i in range(docCount):
        cls = 1 if trainLabel[i] == 1 else 0
        num[cls] += trainMatrix[i]
        den[cls] += np.sum(trainMatrix[i])
    return np.log(num[0] / den[0]), np.log(num[1] / den[1]), pc1
def NBclassify(testVec, p0Vect, p1Vect, pc1):
    """Return the class (0 or 1) with the larger log-posterior; ties go to 0.

    Works in log space: sum of per-word log likelihoods plus the log prior.
    """
    scores = {
        0: np.sum(testVec * p0Vect) + np.log(1.0 - pc1),
        1: np.sum(testVec * p1Vect) + np.log(pc1),
    }
    return int(scores[1] > scores[0])
if __name__ == '__main__':
    # Train the bag-of-words model on the toy data, then classify one message.
    textlist, classlabel = DataLoader()
    myvocaset = createVocabList(textlist)
    # Encode every training document as a count vector over the vocabulary.
    trainMatrix = [bagofwords2Vec(myvocaset, document) for document in textlist]
    p0Vect, p1Vect, pc1 = NBtrain(trainMatrix, classlabel)
    test = ['stupid', 'my', 'garbage']
    testVec = np.array(bagofwords2Vec(myvocaset, test))
    print('{} is classified as : {}'.format(test, NBclassify(testVec, p0Vect, p1Vect, pc1)))
实例 进行垃圾邮件和有用邮件分类
下载的例子中ham文档的第23个邮件中的第二行需要改为SciFinance is 否则有一个‘?’无法进行编码
1.使用等权重的模型
import naive_bayes
import numpy as np
import re
import random
##进行垃圾邮件和有用邮件分类
##进行分词,这里简单粗略地挑出长度大于2的词
def tokenizer(text):
    """Crude tokenizer: split on runs of non-word characters, drop tokens of
    length <= 2, lowercase the rest."""
    kept = []
    for token in re.split(r'\W+', text):
        if len(token) > 2:
            kept.append(token.lower())
    return kept
##构建训练集和测试集,并进行测试计算错误率
def createTest():
    """Build a spam/ham train/test split, train the set-of-words NB model and
    return the test error rate.

    Reads 25 spam and 25 ham emails from fixed paths (spam = class 1,
    ham = class 0), holds out 10 random documents for testing and trains on
    the remaining 40. Prints and returns the error rate on the held-out set.
    """
    docList = []
    classList = []
    fullText = []  # all tokens in reading order (kept for parity with siblings; unused here)
    for i in range(1, 26):
        # 'with' guarantees the handles are closed; the original leaked them.
        # NOTE(review): relies on the platform default encoding -- if a file has
        # stray bytes, consider open(..., errors='ignore'); confirm against data.
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\spam\%d.txt' % i) as f:
            wordlist_spam = tokenizer(f.read())
        docList.append(wordlist_spam)
        classList.append(1)
        fullText.extend(wordlist_spam)
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\ham\%d.txt' % i) as f:
            wordlist_ham = tokenizer(f.read())
        docList.append(wordlist_ham)
        classList.append(0)
        fullText.extend(wordlist_ham)
    myvocaset = naive_bayes.createVocabList(docList)
    # Hold out 10 of the 50 document indices as the test set. Re-evaluate
    # len(trainset) each pass because deletion shrinks the list.
    trainset = list(range(50))
    testset = []
    for i in range(10):
        # BUG FIX: int(random.uniform(0, len-1)) (almost) never selects the last
        # index, biasing the split; randrange is uniform over 0..len-1.
        randIndex = random.randrange(len(trainset))
        testset.append(trainset[randIndex])
        del trainset[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainset:
        trainMat.append(naive_bayes.word2Vec(myvocaset, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # Fit class-conditional log probabilities and the class-1 prior.
    p0Vect, p1Vect, pc1 = naive_bayes.NBtrain(np.array(trainMat), np.array(trainClass))
    error = 0
    for testIndex in testset:
        testVect = naive_bayes.word2Vec(myvocaset, docList[testIndex])
        if naive_bayes.NBclassify(np.array(testVect), p0Vect, p1Vect, pc1) != classList[testIndex]:
            error += 1
    print('error rate is :{}'.format(float(error) / len(testset)))
    return float(error) / len(testset)
if __name__ == '__main__':
    # Average the error rate over many random train/test splits.
    ROUNDS = 1000
    errorSum = 0
    for _ in range(ROUNDS):
        errorSum += createTest()
    print('average error rate is :{}'.format(errorSum / ROUNDS))
迭代1000次后平均错误率为
2.使用词袋
import bagsofwordfNB
import numpy as np
import re
import random
##进行垃圾邮件和有用邮件分类,使用的是词袋版本
##进行分词,这里简单粗略地挑出长度大于2的词
def tokenizer(text):
    """Split *text* on non-word runs; keep lowercase tokens longer than 2 chars.

    Note: r'\W+' collapses runs of delimiters; r'\W*' behaves differently in
    some environments, so + is the safe choice here.
    """
    return [tok.lower() for tok in re.split(r'\W+', text) if len(tok) > 2]
##构建训练集和测试集,并进行测试计算错误率
def createTest():
    """Spam/ham classification with the bag-of-words NB model.

    Builds a random 40/10 train/test split over 25 spam (class 1) and 25 ham
    (class 0) emails, prints and returns the test error rate.
    """
    docList = []
    classList = []
    fullText = []  # all tokens in reading order (kept for parity; unused here)
    for i in range(1, 26):
        # 'with' guarantees the file handles are closed (original leaked them).
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\spam\%d.txt' % i) as spam_file:
            wordlist_spam = tokenizer(spam_file.read())
        docList.append(wordlist_spam)
        classList.append(1)
        fullText.extend(wordlist_spam)
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\ham\%d.txt' % i) as ham_file:
            wordlist_ham = tokenizer(ham_file.read())
        docList.append(wordlist_ham)
        classList.append(0)
        fullText.extend(wordlist_ham)
    myvocaset = bagsofwordfNB.createVocabList(docList)
    # Hold out 10 of the 50 indices; len(trainset) shrinks as we delete.
    trainset = list(range(50))
    testset = []
    for i in range(10):
        # BUG FIX: int(random.uniform(0, n)) can, in rare cases, return n itself
        # (uniform may include the end point) and raise IndexError; randrange
        # is uniform over 0..n-1.
        randIndex = random.randrange(len(trainset))
        testset.append(trainset[randIndex])
        del trainset[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainset:
        trainMat.append(bagsofwordfNB.bagofwords2Vec(myvocaset, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # Fit class-conditional log probabilities and the class-1 prior.
    p0Vect, p1Vect, pc1 = bagsofwordfNB.NBtrain(np.array(trainMat), np.array(trainClass))
    error = 0
    for testIndex in testset:
        testVect = bagsofwordfNB.bagofwords2Vec(myvocaset, docList[testIndex])
        if bagsofwordfNB.NBclassify(np.array(testVect), p0Vect, p1Vect, pc1) != classList[testIndex]:
            error += 1
    print('error rate is :{}'.format(float(error) / len(testset)))
    return float(error) / len(testset)
if __name__ == '__main__':
    # Average the error rate over many random train/test splits.
    ROUNDS = 1000
    errorSum = 0
    for _ in range(ROUNDS):
        errorSum += createTest()
    print('average error rate is :{}'.format(errorSum / ROUNDS))
迭代1000次后平均错误率为
进阶实验
1.可尝试对词的出现次数做排序,从多到少的顺序排序后去除前30个再计算错误率(为了粗略去除冗余的辅助性单词)
2.使用停用词去除冗余的辅助性单词
3.输出单词的条件概率,通过阈值进行筛选,将得出的结果从高到低进行排序,选取前n个单词,可以得到不同类别的文章对应的相关信息(但其中会含有许多停用词,为了更好的效果需要去除),若是地域广告,可获得其中不同地区人们的倾向
基本模型
import operator
import feedparser
import completeNB2
import bagsofwordfNB
import numpy as np
import random
##注意:此前网址的内容没有读进去(RSS 源抓取失败)
def CalmostWord(vocabList, fulltext):
    """Return the 30 most frequent vocabulary words in *fulltext*.

    Frequencies come from one pass with collections.Counter instead of calling
    fulltext.count() once per vocabulary word, which was
    O(len(vocabList) * len(fulltext)). Words absent from fulltext count as 0,
    and ties preserve the relative order of *vocabList* (sorted is stable),
    matching the original behavior exactly.
    """
    from collections import Counter
    freq = Counter(fulltext)
    ranked = sorted(vocabList, key=lambda word: freq[word], reverse=True)
    return ranked[:30]
def createTest1(feed1, feed0):
    """Train/test the bag-of-words NB model on two RSS feeds.

    feed1, feed0 -- feedparser results; feed1 entries are class 1, feed0 class 0.
    Removes the 30 most frequent words (crude stop-word filter), holds out 20
    random documents, prints the test error rate and returns
    (vocabulary, p0Vect, p1Vect) for later inspection.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from both feeds so classes stay balanced.
    minlen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minlen):
        wordlist = completeNB2.tokenizer(feed1['entries'][i]['summary'])
        docList.append(wordlist)
        classList.append(1)
        fullText.extend(wordlist)
        wordlist = completeNB2.tokenizer(feed0['entries'][i]['summary'])
        docList.append(wordlist)
        classList.append(0)
        # BUG FIX: was fullText.append(wordlist), which nested a whole list
        # inside fullText and corrupted the word-frequency counts below.
        fullText.extend(wordlist)
    myvocablist = bagsofwordfNB.createVocabList(docList)
    # Drop the 30 most frequent words -- mostly auxiliary/stop words.
    for word in CalmostWord(myvocablist, fullText):
        if word in myvocablist:
            myvocablist.remove(word)
    trainset = list(range(minlen * 2))
    testset = []
    for i in range(20):
        # randrange avoids the rare int(uniform(0, n)) == n out-of-range case
        # and stays uniform as deletions shrink trainset.
        randIndex = random.randrange(len(trainset))
        print(len(trainset), randIndex)
        testset.append(trainset[randIndex])
        del trainset[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainset:
        trainMat.append(bagsofwordfNB.bagofwords2Vec(myvocablist, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # Fit class-conditional log probabilities and the class-1 prior.
    p0Vect, p1Vect, pc1 = bagsofwordfNB.NBtrain(np.array(trainMat), np.array(trainClass))
    error = 0
    for testIndex in testset:
        testVect = bagsofwordfNB.bagofwords2Vec(myvocablist, docList[testIndex])
        if bagsofwordfNB.NBclassify(np.array(testVect), p0Vect, p1Vect, pc1) != classList[testIndex]:
            error += 1
    print('error rate is :{}'.format(float(error) / len(testset)))
    return myvocablist, p0Vect, p1Vect
##得出超过一定阈值的条件概率的词,通过去掉高频出现的前30个词后的
def getTopselectedWord(myvocablist, p0V, p1V):
    """Print, per class, the words whose log-probability exceeds -6.0, sorted
    from most to least probable.

    myvocablist -- vocabulary aligned with p0V/p1V
    p0V, p1V    -- per-word log probabilities for class 0 / class 1
    """
    threshold = -6.0
    picked0 = [(myvocablist[i], p0V[i]) for i in range(len(p0V)) if p0V[i] > threshold]
    picked1 = [(myvocablist[i], p1V[i]) for i in range(len(p0V)) if p1V[i] > threshold]
    picked0.sort(key=lambda pair: pair[1], reverse=True)
    picked1.sort(key=lambda pair: pair[1], reverse=True)
    print('p0V---------------------------------------------------------------------------------------------')
    print(picked0)
    for pair in picked0:
        print(pair[0])
    print('p1V----------------------------------------------------------------------------------------------')
    print(picked1)
    for pair in picked1:
        print(pair[0])
if __name__ == '__main__':
    # BUG FIX: the hosts were misspelled 'craigslist.orh', so feedparser could
    # never fetch the feeds -- '.org' is the real domain.
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    print(ny)
    sy = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    myvoca, p0V, p1V = createTest1(ny, sy)
由于rss源读不进去,改为垃圾邮件和有效邮件进行试验
test1去除高频(辅助性的词)版本
import bagsofwordfNB
import numpy as np
import re
import random
import Adervertisement_sentiment
##进行垃圾邮件和有用邮件分类,使用的是词袋版本,去除词频最高的前30个单词后看错误率对比,以及得出两类反映出的隐藏信息不同倾向,与test2空白组对比
##进行分词,这里简单粗略地挑出长度大于2的词
##可以替换成自己的分词程序来获得更好的效果
def tokenizer(text):
    """Very rough word splitter: runs of non-word characters delimit tokens;
    keep only tokens longer than two characters, lowercased. Replace with a
    proper tokenizer for better results."""
    result = []
    for tok in re.split(r'\W+', text):
        if len(tok) > 2:
            result.append(tok.lower())
    return result
##构建训练集和测试集,并进行测试计算错误率
def createTest():
    """Spam/ham bag-of-words NB with the 30 most frequent words removed.

    Reads 25 spam (class 1) and 25 ham (class 0) emails, strips the 30
    highest-frequency words as a crude stop-word filter, trains on a random
    40/10 split, and returns (vocabulary, p0Vect, p1Vect, error_rate) so the
    caller can inspect per-class word probabilities afterwards.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # 'with' closes the handles; the original left the files open.
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\spam\%d.txt' % i) as f:
            wordlist_spam = tokenizer(f.read())
        docList.append(wordlist_spam)
        classList.append(1)
        fullText.extend(wordlist_spam)
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\ham\%d.txt' % i) as f:
            wordlist_ham = tokenizer(f.read())
        docList.append(wordlist_ham)
        classList.append(0)
        fullText.extend(wordlist_ham)
    myvocaset = bagsofwordfNB.createVocabList(docList)
    # Crude stop-word removal: drop the 30 highest-frequency words.
    topWord = Adervertisement_sentiment.CalmostWord(myvocaset, fullText)
    for word in topWord:
        if word in myvocaset:
            myvocaset.remove(word)
    # Hold out 10 of the 50 indices; len(trainset) shrinks as we delete.
    trainset = list(range(50))
    testset = []
    for i in range(10):
        # BUG FIX: int(random.uniform(0, n)) can, rarely, yield n itself and
        # raise IndexError; randrange is uniform over valid indices.
        randIndex = random.randrange(len(trainset))
        testset.append(trainset[randIndex])
        del trainset[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainset:
        trainMat.append(bagsofwordfNB.bagofwords2Vec(myvocaset, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # Fit class-conditional log probabilities and the class-1 prior.
    p0Vect, p1Vect, pc1 = bagsofwordfNB.NBtrain(np.array(trainMat), np.array(trainClass))
    error = 0
    for testIndex in testset:
        testVect = bagsofwordfNB.bagofwords2Vec(myvocaset, docList[testIndex])
        if bagsofwordfNB.NBclassify(np.array(testVect), p0Vect, p1Vect, pc1) != classList[testIndex]:
            error += 1
    print('error rate is :{}'.format(float(error) / len(testset)))
    return myvocaset, p0Vect, p1Vect, float(error) / len(testset)
if __name__ == '__main__':
    # Average the error rate over repeated random splits, then inspect the
    # most probable words of the last trained model.
    ROUNDS = 100
    errorSum = 0
    for _ in range(ROUNDS):
        myvocaset, p0Vect, p1Vect, error = createTest()
        errorSum += error
    print('average error rate is {}'.format(errorSum / ROUNDS))
    Adervertisement_sentiment.getTopselectedWord(myvocaset, p0Vect, p1Vect)
错误率为:
有效邮件类中人名以及办公词语有较大出现倾向
垃圾邮件中推销产品价格,商品名出现倾向大
test2空白对照版本
import bagsofwordfNB
import numpy as np
import re
import random
##进行垃圾邮件和有用邮件分类,使用的是词袋版本
##进行分词,这里简单粗略地挑出长度大于2的词
##可以替换成自己的分词程序来获得更好的效果
def tokenizer(text):
    """Split on non-word runs, lowercase, and keep only tokens longer than two
    characters. Swap in a real tokenizer for better quality."""
    return [piece.lower() for piece in re.split(r'\W+', text) if len(piece) > 2]
##构建训练集和测试集,并进行测试计算错误率
def createTest():
    """Control version: spam/ham bag-of-words NB WITHOUT stop-word removal.

    Reads 25 spam (class 1) and 25 ham (class 0) emails, trains on a random
    40/10 split, prints and returns the test error rate. Serves as the
    baseline against the top-30-words-removed variant.
    """
    docList = []
    classList = []
    fullText = []  # all tokens in reading order (kept for parity; unused here)
    for i in range(1, 26):
        # 'with' guarantees the handles are closed; the original leaked them.
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\spam\%d.txt' % i) as f:
            wordlist_spam = tokenizer(f.read())
        docList.append(wordlist_spam)
        classList.append(1)
        fullText.extend(wordlist_spam)
        with open(r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch04\email\ham\%d.txt' % i) as f:
            wordlist_ham = tokenizer(f.read())
        docList.append(wordlist_ham)
        classList.append(0)
        fullText.extend(wordlist_ham)
    myvocaset = bagsofwordfNB.createVocabList(docList)
    # Hold out 10 of the 50 indices; len(trainset) shrinks as we delete.
    trainset = list(range(50))
    testset = []
    for i in range(10):
        # BUG FIX: int(random.uniform(0, n)) can, rarely, yield n itself and
        # raise IndexError; randrange is uniform over valid indices.
        randIndex = random.randrange(len(trainset))
        testset.append(trainset[randIndex])
        del trainset[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainset:
        trainMat.append(bagsofwordfNB.bagofwords2Vec(myvocaset, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # Fit class-conditional log probabilities and the class-1 prior.
    p0Vect, p1Vect, pc1 = bagsofwordfNB.NBtrain(np.array(trainMat), np.array(trainClass))
    error = 0
    for testIndex in testset:
        testVect = bagsofwordfNB.bagofwords2Vec(myvocaset, docList[testIndex])
        if bagsofwordfNB.NBclassify(np.array(testVect), p0Vect, p1Vect, pc1) != classList[testIndex]:
            error += 1
    print('error rate is :{}'.format(float(error) / len(testset)))
    return float(error) / len(testset)
if __name__ == '__main__':
    # Average the error rate over repeated random train/test splits.
    ROUNDS = 100
    errorSum = 0
    for _ in range(ROUNDS):
        errorSum += createTest()
    print('average error rate is :{}'.format(errorSum / ROUNDS))
错误率为
不管是哪一类,辅助性词出现的频率最多,干扰倾向分析
错误率按道理会下降,上升的原因可能是单单靠去除词频最高的前30个词的做法还远远不够,太粗略,把有用的词也去除了,并且分词也不够精细