这是学习《机器学习算法实战》这本书时写的实战代码,目的是让自己对各个算法有更直观的了解;不能一直不动手写,不管简单还是不简单,都要亲自一行一行地敲一遍。
具体的源码和数据链接:https://pan.baidu.com/s/1G2S2pb5gfBnxGNNTFgTkEA 密码:fov0
下面是主程序 bayes.py,以及在实际操作中遇到的问题。
# -*- coding: utf-8 -*-
# author: Yufeng Song
from numpy import *
import re
import feedparser
def loadDataSet():
    """Return a small hand-made corpus of tokenized posts with class labels.

    Returns:
        A tuple ``(postingList, classVec)``. ``postingList`` is a list of six
        token lists; ``classVec`` holds one label per post, where 1 marks an
        abusive post and 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    """Return a list of the distinct tokens found across all documents.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list with one entry per distinct token. The order is whatever the
        underlying set yields and must not be relied upon.
    """
    # One union over every document instead of rebuilding the set per document.
    return list(set().union(*dataSet))
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: The vocabulary; position ``i`` of the result corresponds to
            ``vocabList[i]``.
        inputSet: The tokens of the document to encode.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in ``inputSet`` at least once, 0 elsewhere. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every token.
    positionOf = {}
    for position, word in enumerate(vocabList):
        positionOf.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is None:
            print('the word:%s is not in my Vocabulary!' % word)
        else:
            returnVec[position] = 1
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: The vocabulary; position ``i`` of the result corresponds to
            ``vocabList[i]``.
        inputSet: The tokens of the document to encode.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in ``inputSet``. Tokens missing from the
        vocabulary are silently ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every token.
    positionOf = {}
    for position, word in enumerate(vocabList):
        positionOf.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def trainNBO(trainMatrix, trainCategory):
    """Estimate per-class log word probabilities for a two-class naive Bayes.

    Args:
        trainMatrix: One row per training document, one column per vocabulary
            word (presence flags or counts). Lists and arrays are accepted.
        trainCategory: One label per document; rows labelled 1 form the
            abusive class, every other row the normal class.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the log-probability of each
        word given the normal class, the log-probability of each word given
        the abusive class, and the sum of the labels divided by the number of
        documents (the abusive-class prior when labels are 0/1).

    Raises:
        IndexError: If ``trainMatrix`` contains no documents.
    """
    trainMatrix = asarray(trainMatrix)
    trainCategory = asarray(trainCategory)

    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])  # raises IndexError on an empty matrix
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    isAbusive = trainCategory == 1
    abusiveDocs = trainMatrix[isAbusive]
    normalDocs = trainMatrix[~isAbusive]

    # Start numerators at 1 and denominators at 2 so that a word never seen
    # in a class does not get probability zero.
    p1Num = ones(numWords) + abusiveDocs.sum(axis=0)
    p0Num = ones(numWords) + normalDocs.sum(axis=0)
    p1Denom = 2.0 + abusiveDocs.sum()
    p0Denom = 2.0 + normalDocs.sum()

    # Work in log space so that later products of many small probabilities
    # become sums.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector with a trained two-class naive Bayes model.

    Args:
        vec2Classify: Word vector of the document to classify (array).
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    # Log scores: sum of the word log-probabilities weighted by the document
    # vector, plus the log prior of the class.
    score1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0
def testingNB():
    """Train on the toy corpus and print the predicted class of two posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNBO(array(trainMat), array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
def textParse(bigString):
    """Split text into lower-cased tokens that are longer than two characters.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lower-cased tokens, in order of appearance, obtained by
        splitting on runs of non-word characters and dropping tokens of
        length 2 or less.
    """
    # ``\W+`` rather than ``\W*``: since Python 3.7 ``re.split`` also splits
    # on empty matches, so ``\W*`` would cut the text into single characters
    # and the length filter below would discard every one of them.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def _readEmail(path):
    """Return the text of the e-mail stored at *path*.

    The file is decoded as gb18030 and undecodable bytes are skipped, so a
    stray byte in a sample file does not abort the run.
    """
    with open(path, encoding='gb18030', errors='ignore') as handle:
        return handle.read()


def spamTest():
    """Run one hold-out evaluation of the spam classifier and print its error rate.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (class 0), holds out 10 randomly chosen
    documents for testing, trains on the remaining 40 using set-of-words
    vectors, and prints the fraction of held-out documents misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Both classes are read the same way so decoding does not depend on
        # the platform's default encoding.
        wordList = textParse(_readEmail('email/spam/%d.txt' % i))
        if i == 1:
            print(wordList)
        docList.append(wordList)
        classList.append(1)

        wordList = textParse(_readEmail('email/ham/%d.txt' % i))
        docList.append(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)

    # Move 10 random document indices from the training set to the test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
def calcMostFreq(vocabList, fullText):
    """Return the 30 vocabulary words that occur most often in the text.

    Args:
        vocabList: The candidate words.
        fullText: The token list in which occurrences are counted.

    Returns:
        A list of at most 30 ``(word, count)`` tuples sorted by descending
        count; words with equal counts keep their ``vocabList`` order.
    """
    from collections import Counter

    # Count every token in one pass instead of rescanning ``fullText`` for
    # each vocabulary word.
    counts = Counter(fullText)
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=lambda pair: pair[1], reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    Entry summaries from ``feed1`` are labelled 1 and those from ``feed0``
    are labelled 0. The 30 most frequent words are removed from the
    vocabulary, up to 20 random documents are held out for testing, the
    model is trained on the rest using bag-of-words vectors, and the
    hold-out error rate is printed.

    Args:
        feed1: Parsed feed for class 1; must provide
            ``feed1['entries'][i]['summary']``.
        feed0: Parsed feed for class 0, same structure.

    Returns:
        A tuple ``(vocabList, p0V, p1V)``: the reduced vocabulary and the
        per-word log-probability vectors for class 0 and class 1.

    Raises:
        ValueError: If either feed has no entries.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from each feed.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for label, feed in ((1, feed1), (0, feed0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    numDocs = len(docList)
    if numDocs < 2:
        raise ValueError('both feeds need at least one entry to train and test')

    vocabList = createVocabList(docList)
    frequentWords = {word for word, _ in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Hold out 20 documents when possible, but always keep at least one
    # document for training so small feeds do not crash the run.
    numTest = min(20, numDocs - 1)
    trainingSet = list(range(numDocs))
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Print the words most indicative of each of the two feeds.

    Trains via ``localWords(ny, sf)`` (so ``ny`` is class 1 and ``sf`` is
    class 0), keeps the words whose log-probability for a class exceeds
    -6.0, and for each class prints the full sorted list followed by up to
    ten top words.

    Args:
        ny: Parsed feed used as class 1.
        sf: Parsed feed used as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            # Pair the word with its class-1 score; using p0V here would rank
            # the NY words by their SF scores.
            topNY.append((vocabList[i], p1V[i]))

    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print(sortedSF)
    print("SF**" * 10)
    # Slice instead of indexing 0..9 so fewer than ten words is not an error.
    for item in sortedSF[:10]:
        print(item[0], end=" ")

    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print(sortedNY)
    print('NF**' * 10)
    for item in sortedNY[:10]:
        print(item[0], end='\t')
if __name__ == '__main__':
    # Download and parse the two Craigslist RSS feeds, then print the words
    # most indicative of each. The other demos in this file can be run by
    # calling testingNB() (toy corpus) or spamTest() (e-mail corpus) instead.
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    getTopWords(ny, sf)