# 内容来源于书《机器学习实战》 (Content from the book "Machine Learning in Action")
# -*- coding: utf-8 -*-
'''
<<机器学习实战>>---读书笔记: 第4章 基于概率论的分类而方法:朴素贝叶斯
关键:
1 基于贝叶斯决策理论的分类方法
优点:可处理多分类问题,数据较少仍有效
缺点:对输入数据的准备方式敏感
适用:标称型数据(目标变量只在有限集合中选取,例如真假)。 例如,文档分类
贝叶斯决策理论的核心思想:选择高概率对应类别
贝叶斯概率:引入先验知识和逻辑推理 处理 不确定命题
2 条件概率:
P(A|B)=P(AB) / P(B) = P(B|A) * P(A) / P(B)
使用条件概率来分类:
设p(C1|x,y)表示数据点x,y来自类别C1的概率,那么其实就是求
p(C1|x,y) > p(C2|x,y)的时候,那么属于类别C1
问题转换为求:p(C1|x,y)
p(Ci|x,y)=p(x,y|Ci) * p(Ci)/ p(x,y)
3 使用朴素贝叶斯进行文档分类
朴素贝叶斯的过程
1)收集数据 , 2)准备数据(数值型或布尔型)
3)分析数据:特征较多,使用直方图。 4)训练算法:计算不同独立特征条件概率
5)测试算法:计算错误率, 6)使用算法:例如文档分类
特征独立:特征或者单词出现可能性与他和其他单词相邻没有关系
朴素贝叶斯分类器的假设:每个特征同等重要
4 从词向量计算概率:
设W表示一个向量,由多个数值组成
p(Ci|W) = p(W|Ci) * p(Ci) / p(W)
p(Ci)=类别i中文档总数除以总文档数
因为所有词相互独立
p(W|Ci)=p(W0,W1,...,Wn|C)=p(W0|Ci) * p(W1|Ci) * .... * p(Wn|Ci)
p(W)=p(W0,W1,...,Wn)=p(W0) * p(W1) * ... * p(Wn)
5 训练过程:
朴素贝叶斯分类器训练函数: 需要计算不好类别的整个概率,以及每个单词分别在不好类别和好的类别下的概率,即对于
p(Wi|C1)和p(Wi|C0)的概率,
p(Wi|C1)表示在类别C1下单词Wi的概率=Wi在类别C1出现次数/类别C1中所有单词个数
p(C1)=属于类别C1的文档个数/总的文档个数
输入参数:文档矩阵,文档类别向量
6 验证测试集过程:
根据输入的文本向量,判断属于哪个类别,
需要比较 p(C1|W) 和p(C0|W)大小,即比较的是在文档W出现的情况下,它属于类别1和类别0的概率
p(C1|W)=p(W|C1) * p(C1) / p(W)
p(C0|W)=p(W|C0) * p(C0) / p(W)
即只需要比较p(W|C1) * p(C1) 与 p(W|C0) * p(C0)的大小,取对数后,
即只需要比较 log p(W|C1) + log p(C1) 与 log p(W|C0) + log p(C0) 的大小
log p(W|C1)=sum(W * p1Vec),即文档W乘以类别1的单词对数概率向量,然后累加即为在类别C1下文档W的对数概率
'''
from numpy import *
import codecs
import operator
import feedparser
def loadDataSet():
    """Return a toy corpus: a list of tokenized posts and a parallel label
    vector (1 = abusive, 0 = not abusive)."""
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels
# Build the vocabulary: the list of unique words across all documents.
def createVocabList(dataSet):
    """Return a list of every distinct word appearing in any document of
    dataSet (a list of token lists)."""
    vocab = set()
    for doc in dataSet:
        # set.update is the in-place union with each document's words
        vocab.update(doc)
    return list(vocab)
# Bag-of-words model: counts how many times each vocabulary word occurs.
def bagOfWords2VecMN(vocabList, inputSet):
    """Return a count vector over vocabList for the tokens in inputSet.

    Words not in the vocabulary are silently ignored.
    """
    counts = [0] * len(vocabList)
    for word in inputSet:
        if word not in vocabList:
            continue
        counts[vocabList.index(word)] += 1
    return counts
# Set-of-words model: marks which vocabulary words are present.
def setOfWords2Vec(vocabList, inputSet):
    """Return a 0/1 vector over vocabList: 1 where a word of inputSet
    appears in the vocabulary; unknown words are reported to stdout."""
    # [0] * n builds an n-element zero vector
    presence = [0] * len(vocabList)
    for word in inputSet:
        if word not in vocabList:
            print("word %s is not in vocabulary" % word)
        else:
            presence[vocabList.index(word)] = 1
    return presence
def init_test():
    """Smoke test: build the vocabulary from the toy data and print the
    set-of-words vector of the first post."""
    posts, _labels = loadDataSet()
    vocab = createVocabList(posts)
    print(vocab)
    print(setOfWords2Vec(vocab, posts[0]))
'''
朴素贝叶斯分类器训练函数: 需要计算不好类别的整个概率,以及每个单词分别在不好类别和好的类别下的概率,即对于
p(Wi|C1)和p(Wi|C0)的概率,
p(Wi|C1)表示在类别C1下单词Wi的概率=Wi在类别C1出现次数/类别C1中所有单词个数
p(C1)=属于类别C1的文档个数/总的文档个数
输入参数:文档矩阵,文档类别矩阵
'''
def trainNB0(trainMatrix , trainCategory):
    """Train a naive Bayes classifier.

    Args:
        trainMatrix: document-word matrix; one word-count (or 0/1) vector
            per document, all of the same length.
        trainCategory: 0/1 label per document (1 = abusive/spam class).

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log conditional probabilities
        log p(w|C0) and log p(w|C1) as numpy arrays, and the prior p(C1).
    """
    docNum = len(trainMatrix)
    wordNum = len(trainMatrix[0])
    # Prior p(C1): fraction of documents labelled 1.
    pAbusive = sum(trainCategory) / float(docNum)
    # Laplace smoothing: start counts at 1 and denominators at 2 so an
    # unseen word never drives the product p(W0|Ci)*p(W1|Ci)*... to zero.
    p1Num = ones(wordNum)
    p0Num = ones(wordNum)
    p0Denom = 2.0
    p1Denom = 2.0
    # Accumulate, per class, each word's count and the class's total word count.
    for i in range(docNum):
        if 1 == trainCategory[i]:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space so that multiplying many small probabilities later
    # (done as a sum of logs in classifyNB) does not underflow to 0.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect , p1Vect , pAbusive
'''
关键:
根据输入的文本向量,判断属于哪个类别,
需要比较 p(C1|W) 和p(C0|W)大小,即比较的是在文档W出现的情况下,它属于类别1和类别0的概率
p(C1|W)=p(W|C1) * p(C1) / p(W)
p(C0|W)=p(W|C0) * p(C0) / p(W)
即只需要比较p(W|C1) * p(C1) 与 p(W|C0) * p(C0)的大小,取对数后,
即只需要比较 log p(W|C1) + log p(C1) 与 log p(W|C0) + log p(C0) 的大小
log p(W|C1)=sum(W * p1Vec),即文档W乘以类别1的单词对数概率向量,然后累加即为在类别C1下文档W的对数概率
'''
def classifyNB(inputVec , p0Vec , p1Vec , pClass1):
    """Classify a document word vector with log-domain naive Bayes.

    Args:
        inputVec: numpy word vector (counts or 0/1) of the document.
        p0Vec, p1Vec: per-word log conditional probabilities log p(w|C0),
            log p(w|C1) as produced by trainNB0.
        pClass1: prior probability p(C1).

    Returns:
        1 if the document is more likely class 1, else 0.
    """
    # Compare log p(W|Ci) + log p(Ci). The original code added the raw
    # priors pClass1 / (1 - pClass1) to log-domain likelihoods, mixing
    # probability and log-probability scales; the priors must be logged too.
    p1 = sum(inputVec * p1Vec) + log(pClass1)
    p0 = sum(inputVec * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def trainNB0_test():
    """Train on the toy posts and print the predicted class of two sample
    documents (expected: 0 for the friendly one, 1 for the abusive one)."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    matrix = [setOfWords2Vec(vocab, post) for post in posts]
    p0Vec, p1Vec, pAbusive = trainNB0(matrix, labels)
    for doc in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        docVec = array(setOfWords2Vec(vocab, doc))
        print(classifyNB(docVec, p0Vec, p1Vec, pAbusive))
'''
'''
def textParse(bigString):
    r"""Split raw text into lowercase tokens longer than two characters.

    Bug fix: the original pattern r'\w*' split on runs of *word*
    characters, so the result was the whitespace/punctuation between
    words rather than the words themselves. r'\W+' splits on runs of
    non-word separators, yielding the actual tokens.
    """
    import re
    words = re.split(r'\W+', bigString)
    # Drop short tokens (and the empty strings re.split can produce at
    # the ends) and normalize case.
    return [word.lower() for word in words if len(word) > 2]
# Cross-validation test of the naive Bayes spam classifier
def spamTest():
    """Train and evaluate a set-of-words naive Bayes spam filter.

    Reads 25 spam emails from email/spam/<i>.txt and (best-effort) 25 ham
    emails from email/ham/<i>.txt, holds out 10 randomly chosen documents
    as a test set, trains on the rest and prints the error count and rate.
    """
    docList = []
    classList = []
    fullText = []
    # For each text file: tokenize its contents, append the token list to
    # the document matrix and the full word list, and record its label.
    for i in range(1,26):
        # file.read() returns the whole file as one string
        fileName = 'email/spam/%d.txt' % i
        wordList = textParse(codecs.open(fileName, "r" ).read() )
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        fileName = 'email/ham/%d.txt' % i
        try:
            wordList = textParse(codecs.open(fileName, "r" ).read() )
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        except Exception as ex:
            # Best-effort: a missing or unreadable ham file is reported
            # (as a printed tuple) rather than aborting the whole test.
            info1 = "file: ", fileName
            info2 = " , error: " , ex
            info1 += info2
            print(info1)
    # Build the vocabulary over all loaded documents
    vocabList = createVocabList(docList)
    trainingSet = list(range(50)) # in Python 3 range() is immutable, so convert to a list for del below
    testSet = []
    # Randomly move 10 document indices from the training set to the test set
    for i in range(10):
        randIndex = int(random.uniform(0 , len(trainingSet)))
        # testSet stores document indices, not the documents themselves
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    # NOTE(review): the matching labels stay in classList on purpose —
    # they are looked up by document index below, so nothing else to delete.
    # Training phase
    trainMat = []
    trainClasses = []
    for i in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList ,docList[i] ))
        trainClasses.append(classList[i] )
    p0Vec , p1Vec , pSpam = trainNB0(array(trainMat) , array(trainClasses))
    errorCount = 0
    # Evaluation on the held-out test set
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList , docList[docIndex])
        result = classifyNB(wordVector , p0Vec , p1Vec , pSpam)
        if result != classList[docIndex]:
            errorCount += 1
    print("total error num is: %d , the error rate is: %f " %
        (errorCount , errorCount / len(testSet)))
'''
RSS还是一项很有创造性和实用性的东西。RSS 是用于分发 Web 站点上的内容的摘要的一种简单的 XML 格式
'''
def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words as (word, count) pairs,
    sorted by descending count.

    Performance fix: the original called fullText.count(token) once per
    vocabulary word, O(V*N); a single Counter pass over fullText is O(N).
    Ties keep vocabList order (dict insertion order + stable sort), exactly
    as before.
    """
    from collections import Counter
    counts = Counter(fullText)
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    """Train a bag-of-words naive Bayes classifier on two RSS feeds.

    Args:
        feed1, feed0: parsed feeds (dict-like with an 'entries' list whose
            items carry a 'summary' string). feed1's entries are class 1.

    Returns:
        (vocabList, p0V, p1V): the pruned vocabulary and the per-word log
        conditional probability vectors for each class. Also prints the
        error rate on a random 20-document held-out set.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # Interleave one entry of each feed per iteration, tagging classes 1/0.
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # feed1 (NY) is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Remove the 30 most frequent words: mostly stop words that carry
    # little class information and hurt accuracy.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Bug fix: range() is immutable in Python 3, so `del trainingSet[i]`
    # below raised TypeError; materialize it as a list first (the same
    # fix spamTest already applies).
    trainingSet = list(range(2 * minLen))
    testSet = []
    # Randomly move 20 document indices into the held-out test set.
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # Classify the held-out documents and count mistakes.
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Train on the two feeds and print, per region, the words whose class
    log conditional probability exceeds -6.0, most probable first."""
    vocabList, p0V, p1V = localWords(ny, sf)
    topSF = [(vocabList[i], p) for i, p in enumerate(p0V) if p > -6.0]
    topNY = [(vocabList[i], p) for i, p in enumerate(p1V) if p > -6.0]
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for word, _prob in sorted(topSF, key=lambda pair: pair[1], reverse=True):
        print(word)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for word, _prob in sorted(topNY, key=lambda pair: pair[1], reverse=True):
        print(word)
if __name__ == "__main__":
    # Smoke-test the toy-data helpers, then the spam-filter demo.
    init_test()
    trainNB0_test()
    spamTest()
    # NOTE(review): requires network access, and these craigslist RSS URLs
    # may no longer be served — verify before relying on this demo.
    ny = feedparser.parse("http://newyork.craigslist.org/stp/index.rss")
    sf = feedparser.parse("http://sfbay.craigslist.org/stp/index.rss")
    getTopWords(ny, sf)
    print("done")