bayes source code

#!/usr/bin/python
# -*- coding:utf-8 -*-

from numpy import *

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 = abusive, 0 = not abusive
    return postingList,classVec

def createVocabList(dataSet):	#build the vocabulary from the data set
	vocabSet = set([])
	for document in dataSet:
		vocabSet = vocabSet | set(document)		#merge the words of this document into the vocabulary
	return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):	#convert an input word list into a vector, given the vocabulary
	returnVec = [0]*len(vocabList)	#the returned vector has one slot per vocabulary word
	for word in inputSet:		#walk through every word in the input list
		if word in vocabList:	#if the word is in the vocabulary
			returnVec[vocabList.index(word)] = 1 	#mark its position as present (1)
		else:
			print "the word :%s is not in my Vocabulary!" % word 	#otherwise report the missing word
	return returnVec	#vector saying, for each vocabulary word, whether it appears in this sample

def bagOfWords2Vec(vocabList, inputSet):	#convert an input word list into a vector, given the vocabulary
	returnVec = [0]*len(vocabList)	#the returned vector has one slot per vocabulary word
	for word in inputSet:		#walk through every word in the input list
		if word in vocabList:	#if the word is in the vocabulary
			returnVec[vocabList.index(word)] += 1 	#increment the occurrence count at its position
		else:
			print "the word :%s is not in my Vocabulary!" % word 	#otherwise report the missing word
	return returnVec	#vector of occurrence counts of each vocabulary word in this sample
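
#A small illustrative example (hypothetical inputs, not from loadDataSet):
#with vocabList = ['dog', 'my', 'stupid'] and inputSet = ['my', 'dog', 'dog'],
#setOfWords2Vec(vocabList, inputSet) returns [1, 1, 0]  (set model: presence only),
#while bagOfWords2Vec(vocabList, inputSet) returns [2, 1, 0]  (bag model: counts).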

def trainNB0(trainMatrix,trainCategory):	#estimate the class-conditional word probabilities and the class prior
	numTrainDocs = len(trainMatrix)
	numWords = len(trainMatrix[0])
	pAbusive = sum(trainCategory)/float(numTrainDocs)	#abusive docs are labeled 1, so the sum is the number of abusive docs
	p0Num = ones(numWords)	#initializing the counts to 0 would give any unseen word probability 0 and zero out the whole product
	p1Num = ones(numWords)	#so initialize to 1 (Laplace-style smoothing)
	p0Denom = 2.0			#(but shouldn't this total really be initialized to the vocabulary size?)
	p1Denom = 2.0
	for i in range(numTrainDocs):
		if trainCategory[i] == 1:
			p1Num += trainMatrix[i]			#every document vector is indexed by the same vocabulary, so the vectors can be added directly
			p1Denom += sum(trainMatrix[i])	#accumulate the total word count for this class
		else:
			p0Num += trainMatrix[i]
			p0Denom += sum(trainMatrix[i])
	p1Vect = log(p1Num/p1Denom)				#log probability of each vocabulary word given the abusive class (log avoids underflow later)
	p0Vect = log(p0Num/p0Denom)
	return p0Vect,p1Vect,pAbusive
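
#In probability terms, trainNB0 estimates, with the +1/+2 smoothing above,
#	p1Vect[j] = log( (count of word j in class-1 docs + 1) / (total words in class-1 docs + 2) )
#and likewise for p0Vect; pAbusive is the prior probability p(class = 1).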

def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
	p1 = sum(vec2Classify * p1Vec) + log(pClass1)		#element-wise multiply and sum works because both vectors follow the vocabulary's word order
	p0 = sum(vec2Classify * p0Vec) + log(1.0-pClass1)
	if p1>p0:
		return 1
	else:
		return 0
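
#classifyNB scores each class in log space: since
#	log( p(c) * prod_j p(w_j|c)^x_j ) = log p(c) + sum_j x_j * log p(w_j|c),
#multiplying the document vector by the log-probability vector element-wise and
#summing, then adding the log prior, compares the classes without numeric underflow.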

def testingNB():
	listOPosts,listClasses = loadDataSet()		#listOPosts holds the documents, listClasses their class labels
	myVocabList = createVocabList(listOPosts)
	trainMat = []	#training matrix of document vectors, built to make the computation convenient
	for postinDoc in listOPosts:
		trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
	p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
	testEntry = ['love','my','dalmation']
	thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
	print testEntry, 'classified as: ', classifyNB(thisDoc,p0V,p1V,pAb)
	testEntry = ['stupid','garbage']
	thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
	print testEntry, 'classified as: ', classifyNB(thisDoc,p0V,p1V,pAb)

def textParse(bigString):	#split a piece of text into a word list (drop tokens of two characters or fewer, lowercase the rest)
	import re
	listOfTokens = re.split(r'\W+',bigString)	#split on runs of non-word characters ('\W+' avoids the empty tokens produced by '\W*')
	return [tok.lower() for tok in listOfTokens if len(tok)>2]

def spamTest():		#since the 10 test messages are picked at random, the test can be run several times and the error rates averaged (see the sketch after this function)
	docList = []
	classList = []
	fullText = []
	for i in range(1,26):	#there are 25 spam and 25 ham emails
		wordList = textParse(open('email/spam/%d.txt' %i).read())	#parse the file at this path
		docList.append(wordList)	#add the word list to docList
		fullText.extend(wordList)	#extend fullText with the words
		classList.append(1)			#record the corresponding class label
		wordList = textParse(open('email/ham/%d.txt' %i).read())
		docList.append(wordList)
		fullText.extend(wordList)
		classList.append(0)
	vocabList = createVocabList(docList)	#build the vocabulary
	trainingSet = range(50)		#indices of all 50 samples (range returns a list in Python 2, so del below works)
	testSet = []
	for i in range(10):		#hold out 10 samples for testing
		randIndex = int(random.uniform(0,len(trainingSet)))		#pick a random index from the remaining training set
		testSet.append(trainingSet[randIndex])
		del(trainingSet[randIndex])		#remove the chosen index so the same sample cannot be picked twice
	trainMat = []
	trainClasses=[]
	for docIndex in trainingSet:
		trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))	#vectorize the training sample and add it to the training matrix
		trainClasses.append(classList[docIndex])	#record its class label
	p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
	errorCount = 0
	for docIndex in testSet:
		wordVector = setOfWords2Vec(vocabList, docList[docIndex])	#vectorize the test sample
		if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
			errorCount = errorCount+1
			print docList[docIndex]
	print "the error rate is: ", float(errorCount)/len(testSet)

def calcMostFreq(vocabList,fullText):	#return the 30 most frequent words in fullText (all words from the documents, repeats included)
	import operator
	freqDict={}
	for token in vocabList:
		freqDict[token] = fullText.count(token)
	sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
	return sortedFreq[:30]		#the 30 most frequent (word, count) pairs

def localWords(feed1,feed0):
	import feedparser
	docList = []
	classList = []
	fullText = []
	minLen = min(len(feed1['entries']),len(feed0['entries']))
	for i in range(minLen):
		wordList = textParse(feed1['entries'][i]['summary'])	#parse the entry's summary text
		docList.append(wordList)	#add the word list to docList
		fullText.extend(wordList)	#extend fullText with the words
		classList.append(1)			#record the corresponding class label
		wordList = textParse(feed0['entries'][i]['summary'])
		docList.append(wordList)
		fullText.extend(wordList)
		classList.append(0)
	vocabList = createVocabList(docList)
	top30Words = calcMostFreq(vocabList,fullText)
	for pairW in top30Words:		#remove the 30 most frequent words from the vocabulary; some words in later document vectors may then not be found in it
		if pairW[0] in vocabList:
			vocabList.remove(pairW[0])
	trainingSet = range(2*minLen)
	testSet = []
	for i in range(20):
		randIndex = int(random.uniform(0,len(trainingSet)))
		testSet.append(trainingSet[randIndex])
		del(trainingSet[randIndex])
	trainMat = []
	trainClasses = []
	for docIndex in trainingSet:
		trainMat.append(bagOfWords2Vec(vocabList,docList[docIndex]))
		trainClasses.append(classList[docIndex])
	p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
	errorCount = 0
	for docIndex in testSet:
		wordVector = bagOfWords2Vec(vocabList,docList[docIndex])
		if classifyNB(array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
			errorCount = errorCount + 1
	print "the error rate is: ",float(errorCount)/len(testSet)
	return vocabList,p0V,p1V

def getTopWords(ny,sf):
	import operator
	vocabList,p0V,p1V = localWords(ny,sf)
	topNY = []
	topSF = []
	for i in range(len(p0V)):
		if p0V[i]>-5.0:		#keep words whose log conditional probability is above the threshold
			topSF.append((vocabList[i],p0V[i]))
		if p1V[i]>-5.0:
			topNY.append((vocabList[i],p1V[i]))
	sortedSF = sorted(topSF,key = lambda pair:pair[1], reverse = True)	#sort by probability, descending
	print "SFSFSFSFSFSFSFSF"
	for item in sortedSF:	#print the words
		print item[0]
	sortedNY = sorted(topNY, key = lambda pair:pair[1],reverse = True)
	print "NYNYNYNYNYNYNYNY"
	for item in sortedNY:
		print item[0]
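
A minimal driver for trying the functions above (a sketch; it assumes the email/spam and email/ham directories used by spamTest sit next to the script, and it skips localWords/getTopWords because those need live RSS feeds):

if __name__ == '__main__':
	testingNB()		#classify the two toy posts built from loadDataSet
	spamTest()		#one random train/test split over the 50 emails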


Plotting library matplotlib test

#!/usr/bin/python
# -*- coding:utf-8 -*-

from numpy import *
import matplotlib
import matplotlib.pyplot as plt

x = arange(-5.0,5.0,0.01)
y = 2*pow(x,2)
s = log(y)

fig = plt.figure()
ax = fig.add_subplot(211)	#the first two digits are the rows and columns of the subplot grid; the third selects which subplot (here: 2 rows, 1 column, subplot 1)
ax.set_xlabel('x')	#set the axis labels
ax.set_ylabel('y')
ax.plot(x,y)

xcord1 = []
xcord2 = []
ycord1 = []
ycord2 = []
for i in range(1000):
	[r1,r2] = random.standard_normal(2)		#generate a random coordinate pair (two standard-normal values)
	myClass = random.uniform(0,1)		#uniform random number in [0, 1)
	if myClass>0.5:
		x1 = r1 + 0.9
		x2 = r2+x1-0.9
		xcord1.append(x1)
		xcord2.append(x2)
	else:
		y1 = r1+5.0
		y2 = r2+y1-5.0
		ycord1.append(y1)
		ycord2.append(y2)
ax = fig.add_subplot(212)	
ax.scatter(xcord1,xcord2, marker='o',s = 90)		#set the marker symbol and size
ax.scatter(ycord1,ycord2, marker='^',s = 50,c='red')	#set the marker color
plt.show()
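
For reference, the same two-panel layout can also be created in a single call (a sketch using matplotlib's pyplot.subplots helper; it reuses the x, y and the scatter coordinates generated above):

fig, (ax1, ax2) = plt.subplots(2, 1)	#2 rows, 1 column of axes
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.plot(x, y)
ax2.scatter(xcord1, xcord2, marker='o', s=90)
ax2.scatter(ycord1, ycord2, marker='^', s=50, c='red')
plt.show()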

