#!/usr/bin/python
# -*- coding:utf-8 -*-
from numpy import *
def loadDataSet():
    """Return a toy forum dataset: (list of tokenized posts, class labels).

    Labels: 1 = abusive post, 0 = normal post.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = not abusive
    return posts, labels
def createVocabList(dataSet):
    """Build the vocabulary list: every unique token seen across all documents.

    dataSet is a list of token lists; the returned list's order is whatever
    set iteration yields (callers only rely on membership and indices).
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # fold each document's tokens into the vocabulary
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a list of tokens into a binary presence vector over vocabList.

    Returns a list of len(vocabList) ints: 1 at index i if vocabList[i]
    occurs in inputSet (set-of-words model — multiplicity is ignored),
    else 0. Words missing from the vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # mark the word's vocabulary slot as present
            returnVec[vocabList.index(word)] = 1
        else:
            # print() form works on both Python 2 and 3 (was a Py2 print statement)
            print("the word :%s is not in my Vocabulary!" % word)
    return returnVec
def bagOfWords2Vec(vocabList, inputSet):
    """Convert a list of tokens into a count vector over vocabList.

    Returns a list of len(vocabList) ints: element i is the number of times
    vocabList[i] occurs in inputSet (bag-of-words model, unlike
    setOfWords2Vec which only records presence). Words missing from the
    vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # increment this word's occurrence count
            returnVec[vocabList.index(word)] += 1
        else:
            # print() form works on both Python 2 and 3 (was a Py2 print statement)
            print("the word :%s is not in my Vocabulary!" % word)
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate Naive Bayes parameters from vectorized training docs.

    trainMatrix: 2-D array, one word-count/presence vector per document.
    trainCategory: array of 0/1 labels (1 = abusive).
    Returns (p0Vect, p1Vect, pAbusive): per-word log conditional
    probabilities for each class and the prior P(class = 1).
    """
    docCount = len(trainMatrix)
    wordCount = len(trainMatrix[0])
    # labels are 0/1, so their sum is the number of abusive documents
    pAbusive = sum(trainCategory) / float(docCount)
    # Laplace smoothing: start counts at 1 (and denominators at 2) so that a
    # word unseen in one class does not zero out the whole product
    p0Num, p1Num = ones(wordCount), ones(wordCount)
    p0Denom, p1Denom = 2.0, 2.0
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec           # per-word counts for the abusive class
            p1Denom += sum(docVec)    # total words in the abusive class
        else:
            p0Num += docVec
            p0Denom += sum(docVec)
    # log-probabilities avoid underflow when many small factors are multiplied
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a vectorized document: 1 (abusive) or 0 (normal).

    Compares log P(class|doc) for both classes; the dot product works
    because vec2Classify and the log-probability vectors share the same
    vocabulary ordering.
    """
    logP1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logP0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logP1 > logP0 else 0
def testingNB():
    """Smoke test: train on the toy dataset and classify two sample posts.

    Prints each test entry with its predicted class (1 = abusive).
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # vectorize every training post against the shared vocabulary
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # print() form works on both Python 2 and 3 (was a Py2 print statement)
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
def textParse(bigString):
    """Tokenize a string: split on non-word runs, lowercase, drop tokens <= 2 chars.

    Fix: the original pattern r'\W*' can match the empty string, which makes
    re.split misbehave (ValueError / per-character splits on modern Python);
    r'\W+' splits on runs of one-or-more non-word characters as intended.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out evaluation of the NB spam filter on the email corpus.

    Expects 25 spam emails in email/spam/1..25.txt and 25 ham emails in
    email/ham/1..25.txt. Ten random documents are held out for testing;
    because the split is random, averaging over several runs gives a more
    stable error estimate. Prints misclassified docs and the error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # 25 spam and 25 ham emails, numbered 1..25
        # NOTE(review): files are read with the default encoding; some corpus
        # files are known to contain non-ASCII bytes — confirm encoding if
        # this raises on your platform.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # spam is labeled 1
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # ham is labeled 0
    vocabList = createVocabList(docList)
    # list() is required so del works (range() is a lazy sequence on Python 3);
    # deriving the size from docList also removes the hard-coded 50
    trainingSet = list(range(len(docList)))
    testSet = []
    for i in range(10):  # hold out 10 random documents for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # deleting the index prevents duplicate picks
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount = errorCount + 1
            print(docList[docIndex])  # show the misclassified document
    print("the error rate is: ", float(errorCount) / len(testSet))
def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocab words as (word, count) pairs.

    fullText is the concatenation of all document tokens (duplicates kept),
    so list.count gives each word's corpus frequency.
    Fix: dict.iteritems() is Python-2-only; dict.items() works on both.
    """
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # top 30 by descending frequency
def localWords(feed1, feed0):
    """Train/test NB on two RSS feeds (feed1 = class 1, feed0 = class 0).

    feed1/feed0 are parsed-feed dicts (feedparser-style, with an 'entries'
    list whose items carry a 'summary' string). Holds out 20 random entries
    for testing, prints the error rate, and returns (vocabList, p0V, p1V).
    The unused local `import feedparser` was removed — the feeds arrive
    already parsed.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    # drop the 30 most frequent words (stop-word removal); later document
    # vectors may then contain words absent from the trimmed vocabulary
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # list() is required so del works (range() is a lazy sequence on Python 3)
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):  # hold out 20 random entries for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # deleting the index prevents duplicate picks
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount = errorCount + 1
    print("the error rate is: ", float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Train on two feeds and print each class's characteristic words.

    Words whose class-conditional log-probability exceeds -5.0 are printed
    in descending probability order, first for sf (class 0), then ny
    (class 1). The unused local `import operator` was removed (sorting
    uses a lambda key).
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -5.0:  # keep only words above the probability threshold
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -5.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    # print() form works on both Python 2 and 3 (was a Py2 print statement)
    print("SFSFSFSFSFSFSFSF")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NYNYNYNYNYNYNYNY")
    for item in sortedNY:
        print(item[0])
# --- matplotlib plotting demo (appended test script) ---
#!/usr/bin/python
# -*- coding:utf-8 -*-
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
# Demo script: plot y = 2x^2 in the top panel and two random point clouds
# in the bottom panel.
# Fix: removed the unused `s = log(y)` — it was never read and emitted a
# divide-by-zero warning at x = 0 where y = 0.
x = arange(-5.0, 5.0, 0.01)
y = 2 * pow(x, 2)
fig = plt.figure()
ax = fig.add_subplot(211)  # grid of 2 rows x 1 column; this is panel 1
ax.set_xlabel('x')  # axis labels
ax.set_ylabel('y')
ax.plot(x, y)
xcord1 = []
xcord2 = []
ycord1 = []
ycord2 = []
for i in range(1000):
    [r1, r2] = random.standard_normal(2)  # random coordinates (numpy.random)
    myClass = random.uniform(0, 1)  # uniform draw in [0, 1) picks the cloud
    if myClass > 0.5:
        x1 = r1 + 0.9
        x2 = r2 + x1 - 0.9
        xcord1.append(x1)
        xcord2.append(x2)
    else:
        y1 = r1 + 5.0
        y2 = r2 + y1 - 5.0
        ycord1.append(y1)
        ycord2.append(y2)
ax = fig.add_subplot(212)  # panel 2
ax.scatter(xcord1, xcord2, marker='o', s=90)  # marker style and size
ax.scatter(ycord1, ycord2, marker='^', s=50, c='red')  # marker color
plt.show()