jupyter notebook
Machine Learning Basics
from numpy import *
random.rand(4,4)
randMat = mat(random.rand(4,4))   # mat converts an array into a matrix
invRandMat = randMat.I            # .I gives the matrix inverse
invRandMat*randMat                # matrix multiplication (should be close to the identity)
eye(4)                            # 4x4 identity matrix
K-Nearest Neighbors (kNN)
from numpy import *
import operator
def createDataSet():
    group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A','A','B','B']
    return group, labels
import kNN
group,labels=kNN.createDataSet()
def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]                   # shape gives the array dimensions; [0] is the row count
    diffMat = tile(inX, (dataSetSize,1)) - dataSet   # tile repeats inX into a matrix; inX is the vector to classify (broadcasting / vectorized arithmetic)
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5
    sortedDistIndicies = distances.argsort()         # argsort returns the indices that would sort the array ascending
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1   # dict.get with a default value; setdefault is a similar method
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)   # sort the dict items by value with operator.itemgetter; sorted(iterable[, cmp[, key[, reverse]]])
    return sortedClassCount[0][0]
kNN.classify0([0,0],group,labels,3)
def file2matrix(filename):
    fr = open(filename)
    numberOfLines = len(fr.readlines())    # number of lines in the file
    returnMat = zeros((numberOfLines,3))   # prepare matrix to return
    classLabelVector = []                  # prepare labels to return
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector
import kNN
import matplotlib
from numpy import *
import matplotlib.pyplot as plt
datingDataMat,datingLabels=kNN.file2matrix('datingTestSet2.txt')
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))
ax.axis([-2,25,-0.2,2.0])
plt.xlabel('Percentage of Time Spent Playing Video Games')
plt.ylabel('Liters of Ice Cream Consumed Per Week')
plt.show()
Data preparation: normalizing values
def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)              # axis 0 means down the columns, axis 1 across the rows
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))   # zero matrix with the same shape as the input
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m,1))    # broadcasting/repetition: subtract each column's minimum
    normDataSet = normDataSet/tile(ranges, (m,1))   # element-wise division; note that in some packages / means matrix division
    return normDataSet, ranges, minVals
normMat,ranges,minVals=kNN.autoNorm(datingDataMat)
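Since NumPy broadcasts automatically, the tile calls above are not strictly needed; a minimal equivalent sketch (my own, not the book's code):
def autoNormBroadcast(dataSet):
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    return (dataSet - minVals)/ranges, ranges, minVals   # broadcasting applies the row vectors to every row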
Validating the classifier as a complete algorithm
def datingClassTest():
    hoRatio = 0.50   # hold-out ratio: fraction of the data reserved for testing
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)   # normalized data
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)   # number of test vectors
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:], datingLabels[numTestVecs:m], 3)   # the first argument is the test vector; the latter two are the training set
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
    print errorCount
Building a complete, usable system
def classifyPerson():
    resultList = ['not at all','in small doses','in large doses']
    percentTats = float(raw_input("percentage of time spent playing video games?"))
    ffMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCream = float(raw_input("liters of ice cream consumed per year?"))
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat,ranges,minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles,percentTats,iceCream])
    classifierResult = classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print "you will probably like this person:", resultList[classifierResult-1]
Converting an image into a test vector
def img2vector(filename):
    returnVect = zeros((1,1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect
Recognizing handwritten digits with kNN
from os import listdir   # listdir is used to enumerate the data directories below
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')   # list the training-set directory (os.listdir)
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]        # file names look like 0_0.txt; take 0_0
        classNumStr = int(fileStr.split('_')[0])   # take the first 0 of 0_0: the digit this file represents (the label)
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')           # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]        # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)   # first argument is the test vector, the latter two the training set
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
        if (classifierResult != classNumStr): errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount/float(mTest))
Decision Trees
Computing the entropy
from math import log   # log is used below
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)   # dataSet is a list of lists
    labelCounts = {}
    for featVec in dataSet:     # count the unique labels and their occurrences
        currentLabel = featVec[-1]   # the label is the last column
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1   # tally each class
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries   # probability of each class
        shannonEnt -= prob * log(prob,2)            # entropy, log base 2
    return shannonEnt
Creating the dataset
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing','flippers']
    #change to discrete values
    return dataSet, labels
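A quick sanity check of calcShannonEnt on this toy set (my own worked example): with two 'yes' and three 'no' labels the entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971.
myDat, labels = createDataSet()
print calcShannonEnt(myDat)   # about 0.9709505944546686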
Splitting the dataset
def splitDataSet(dataSet, axis, value):   # dataSet is a list of lists
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:        # keep the rows whose column `axis` equals value
            reducedFeatVec = featVec[:axis]           # everything before feature axis
            reducedFeatVec.extend(featVec[axis+1:])   # attach everything after it, so the new row drops column axis and keeps the rest
            retDataSet.append(reducedFeatVec)         # note append vs extend: append adds the element as-is (even a list), extend unpacks a list
    return retDataSet
Choosing the best way to split the dataset
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1       # the last column is the label; the rest are features
    baseEntropy = calcShannonEnt(dataSet)   # overall entropy here, not the conditional entropy
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):   # iterate over all the features
        featList = [example[i] for example in dataSet]   # all values of feature i
        uniqueVals = set(featList)   # distinct values of feature i; building a set is the fastest way to get a list's unique elements
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)   # split on feature i == value, looping over values and summing the pieces
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)   # conditional entropy (labels in the last column); a perfect split can drive it to 0
        infoGain = baseEntropy - newEntropy   # information gain
        if (infoGain > bestInfoGain):   # compare this to the best gain so far
            bestInfoGain = infoGain     # if better than current best, set to best
            bestFeature = i
    return bestFeature   # returns an integer
Majority vote:
import operator
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)   # note this sorting idiom
    return sortedClassCount[0][0]
Recursive tree construction
def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]   # the last column holds the class labels; collect them all
    if classList.count(classList[0]) == len(classList):   # stop splitting when all of the classes are equal
        return classList[0]
    if len(dataSet[0]) == 1:   # all features used up but the set is still impure: fall back to majority vote (only the label column remains)
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]   # attach the feature's name
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])   # remove the feature that was just used
    featValues = [example[bestFeat] for example in dataSet]   # all values of the chosen feature
    uniqueVals = set(featValues)   # the distinct values
    for value in uniqueVals:
        subLabels = labels[:]   # copy all of labels, so trees don't mess up existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)   # grow the tree under [bestFeatLabel][value]; value is a discrete feature value in this loop
    return myTree
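Running createTree on the toy dataset above should reproduce the book's nested-dict tree:
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
print myTree   # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}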
Plotting the tree
Getting the number of leaf nodes and the tree depth
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':   # test whether the node is a dictionary; if not it is a leaf
            numLeafs += getNumLeafs(secondDict[key])
        else: numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':   # test whether the node is a dictionary; if not it is a leaf
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else: thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)   # no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False)   # ticks for demo purposes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0
    plotTree(inTree, (0.5,1.0), '')
    plt.show()
def plotTree(myTree, parentPt, nodeTxt):   # the first key tells you what feature was split on
    numLeafs = getNumLeafs(myTree)   # this determines the x width of this tree
    depth = getTreeDepth(myTree)
    firstStr = myTree.keys()[0]   # the text label for this node
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':   # dictionaries are subtrees; everything else is a leaf
            plotTree(secondDict[key], cntrPt, str(key))   # recursion
        else:   # it's a leaf node: print the leaf node
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
    #if you do get a dictionary you know it's a tree, and the first element will be another dict
Testing the algorithm: classifying with the decision tree
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)   # which feature label the first split corresponds to
    key = testVec[featIndex]                 # map that label to an index into the test vector
    valueOfFeat = secondDict[key]            # the subtree or leaf for this feature value
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
trees.classify(myTree, labels, [0,1])   # pass in a test vector; returns the concrete class
Storing the decision tree
def storeTree(inputTree,filename):
    import pickle
    fw = open(filename,'w')
    pickle.dump(inputTree,fw)
    fw.close()
def grabTree(filename):
    import pickle
    fr = open(filename)
    return pickle.load(fr)
Predicting contact lens type with a decision tree
fr = open(r'C:\Users\merit\Downloads\next step\MLiA_SourceCode\machinelearninginaction\Ch03\lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age','prescript','astigmatic','tearRate']
lensesTree = trees.createTree(lenses, lensesLabels)
treePlotter.createPlot(lensesTree)
Naive Bayes
Building word vectors from text
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]   # 1 is abusive, 0 not
    return postingList,classVec
def createVocabList(dataSet):
    vocabSet = set([])   # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)   # union of the two sets: merge into a vocabulary of unique words
    return list(vocabSet)
Building document vectors (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)   # a vector as long as the vocabulary
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1   # mark with 1 if the word is in the vocabulary
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
Bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
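A tiny illustration (my own example) of how the two models differ on a repeated word:
vocab = ['dog', 'stupid', 'food']
doc = ['stupid', 'dog', 'stupid']
print setOfWords2Vec(vocab, doc)     # [1, 1, 0]  -- presence only
print bagOfWords2VecMN(vocab, doc)   # [1, 2, 0]  -- occurrence counts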
Computing probabilities from word vectors
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))   # note: append here, not extend
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)   # compute p(c_i)
    p0Num = ones(numWords); p1Num = ones(numWords)      # changed to ones(): NumPy makes the vector arithmetic easy; initializing counts to 1 is Laplace smoothing
    p0Denom = 2.0; p1Denom = 2.0                        # changed to 2.0: initialize the denominators to match
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]          # vectorized accumulation
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)   # changed to log(): since log(a*b) = log a + log b, working in logs avoids underflow when multiplying many small p(w_i|c) values
    p0Vect = log(p0Num/p0Denom)   # changed to log()
    return p0Vect,p1Vect,pAbusive
Classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)   # element-wise mult; adding logs multiplies probabilities, combining the input's p(w|c) terms with the trained estimates
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
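Written out, the rule compares the two unnormalized log posteriors
log p(c=1 | x) ∝ log p(c=1) + Σ_i x_i · log p(w_i | c=1)
(and similarly for c=0); multiplying vec2Classify by the log-probability vector picks out, and in the bag model weights by count, exactly the words present.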
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
Example: filtering spam with naive Bayes
import re
mySent = 'This book is the best book on Python or M.L, I have ever laid eyes upon.'
regEx = re.compile('\\W*')   # note that \W differs from \w: \W matches anything that is not a word character, \w the opposite
listOfTokens = regEx.split(mySent)
[tok.lower() for tok in listOfTokens if len(tok) > 0]   # drop empty strings, lowercase the rest
Text parsing and a full test function
def textParse(bigString):   # input is a big string, output is a word list
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())   # read words out of the spam messages
        docList.append(wordList)
        fullText.extend(wordList)   # note the difference between append and extend
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())    # read words out of the ham messages
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)    # create vocabulary
    trainingSet = range(50); testSet = []   # create test set
    for i in range(10):   # i itself is unused below; this only controls the number of draws
        randIndex = int(random.uniform(0,len(trainingSet)))   # a random real in [0, len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])   # note how the test and training sets are sampled randomly without replacement
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:   # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))   # bag-of-words model
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))   # conditional probability vectors for the training set, as arrays
    errorCount = 0
    for docIndex in testSet:   # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])   # bag-of-words again; since classifyNB computes vec2Classify * p1Vec, a word occurring more than once gets extra weight -- is that a bug, should this be the set-of-words model?
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:   # convert to array
            errorCount += 1
            print "classification error",docList[docIndex]
    print 'the error rate is: ',float(errorCount)/len(testSet)   # report the error
    #return vocabList,fullText
Logistic Regression
Gradient ascent
def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])   # set X0 to 1 so the model is a single product w.x with no separate bias b
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat
def sigmoid(inX):   # logistic function used by the training routines below
    return 1.0/(1 + exp(-inX))
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)   # convert to NumPy matrix (a matrix is a kind of array)
    labelMat = mat(classLabels).transpose()   # convert to NumPy matrix and transpose
    m,n = shape(dataMatrix)   # matrix dimensions
    alpha = 0.001
    maxCycles = 500
    weights = ones((n,1))   # initialize all weights to 1
    for k in range(maxCycles):   # heavy on matrix operations
        h = sigmoid(dataMatrix*weights)   # matrix multiplication, not element-wise: all samples participate at once
        error = (labelMat - h)   # vector subtraction
        weights = weights + alpha * dataMatrix.transpose() * error   # matrix mult; this is the gradient of the log-likelihood with respect to the weights (see http://blog.csdn.net/bbbeoy/article/details/72578478)
    return weights
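For reference, the batch update implemented above is, in matrix form (α the step size, σ the sigmoid):
w ← w + α · Xᵀ (y − σ(Xw))
which is gradient ascent on the log-likelihood of the logistic model.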
Plotting the decision boundary
def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat,labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0]-weights[1]*x)/weights[2]   # the boundary where the sigmoid argument is zero; solves for x2 in terms of x1
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
Stochastic gradient ascent
def stocGradAscent0(dataMatrix, classLabels):
    m,n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)    # initialize to all ones
    for i in range(m):   # now the loop runs over samples, in sample order
        h = sigmoid(sum(dataMatrix[i]*weights))   # one sample at a time
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]   # a single sample, not the whole matrix
    return weights
Improved stochastic gradient ascent
def stocGradAscent1(dataMatrix, classLabels, numIter=150):   # configurable number of passes
    m,n = shape(dataMatrix)
    weights = ones(n)   # initialize to all ones
    for j in range(numIter):
        dataIndex = range(m)
        for i in range(m):
            alpha = 4/(1.0+j+i)+0.0001   # alpha decreases with iteration but never reaches 0, thanks to the constant
            randIndex = int(random.uniform(0,len(dataIndex)))   # pick the samples in random order
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights
Example: predicting the mortality rate of sick horses
Preparing the data: handling missing values
Testing with logistic regression
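colicTest below relies on classifyVector, which these notes never list; a minimal version matching the book's helper thresholds the sigmoid output at 0.5:
def classifyVector(inX, weights):   # predict 1.0 if sigmoid(w.x) > 0.5, else 0.0
    prob = sigmoid(sum(inX*weights))
    if prob > 0.5: return 1.0
    else: return 0.0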
def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):   # convert each string field to float one by one; could this be batched, e.g. with map plus a lambda?
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print "the error rate of this test is: %f" % errorRate
    return errorRate
def multiTest():
    numTests = 10; errorSum = 0.0   # run 10 tests: the improved stochastic gradient ascent samples randomly, so each run's weights differ (iterations, step size)
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests))
Support Vector Machines
Simplified SMO
def loadDataSet(fileName):
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat,labelMat
def selectJrand(i,m):
    j = i   # we want to select any j not equal to i
    while (j==i):
        j = int(random.uniform(0,m))   # keep drawing in a while loop until j != i
    return j
def clipAlpha(aj,H,L):   # clamp aj into the range [L, H]
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
    b = 0; m,n = shape(dataMatrix)
    alphas = mat(zeros((m,1)))   # matrix of zeros
    iter = 0
    while (iter < maxIter):
        alphaPairsChanged = 0   # tracks whether any pair was optimized this pass
        for i in range(m):
            fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b   # predicted class (see http://blog.csdn.net/bbbeoy/article/details/72598473)
            Ei = fXi - float(labelMat[i])   # check whether this example violates the KKT conditions
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):   # the error exceeds the tolerance in either direction and alpha can still move; clipAlpha bounds it later
                j = selectJrand(i,m)   # pick the second alpha
                fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b   # same prediction for j
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy()
                if (labelMat[i] != labelMat[j]):   # when the two labels differ, the feasible range is computed this way
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L==H: print "L==H"; continue
                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T   # eta = 2*K12 - K11 - K22, i.e. minus the objective's second derivative
                if eta >= 0: print "eta>=0"; continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j],H,L)
                if (abs(alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; continue
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])   # update i by the same amount as j, in the opposite direction
                b1 = b - Ei - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
                b2 = b - Ej - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
                if (0 < alphas[i]) and (C > alphas[i]): b = b1   # update b (http://blog.csdn.net/bbbeoy/article/details/72469136)
                elif (0 < alphas[j]) and (C > alphas[j]): b = b2
                else: b = (b1 + b2)/2.0
                alphaPairsChanged += 1
                print "iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
        if (alphaPairsChanged == 0): iter += 1
        else: iter = 0
        print "iteration number: %d" % iter
    return b,alphas
b,alphas = svmMLiA.smoSimple(dataArr,labelArr,0.6, 0.001,40)
Counting the support vectors:
for i in range(100):
    if alphas[i] > 0.0: print dataArr[i],labelArr[i]
Full Platt SMO algorithm
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):   # full Platt SMO
    oS = optStruct(mat(dataMatIn),mat(classLabels).transpose(),C,toler, kTup)
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:   # go over all
            for i in range(oS.m):
                alphaPairsChanged += innerL(i,oS)
                print "fullSet, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
            iter += 1
        else:   # go over non-bound (railed) alphas
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i,oS)
                print "non-bound, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
            iter += 1
        if entireSet: entireSet = False   # toggle entire set loop
        elif (alphaPairsChanged == 0): entireSet = True
        print "iteration number: %d" % iter
    return oS.b,oS.alphas
class optStruct:
    def __init__(self,dataMatIn, classLabels, C, toler, kTup):   # initialize the structure with the parameters
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2)))   # first column is a valid flag
        self.K = mat(zeros((self.m,self.m)))   # kernel matrix after the kernel transform
        for i in range(self.m):                # column i holds the kernel of sample i against the whole set
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)
def calcEk(oS, k):
    fXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)   # same as the linear version, written via the precomputed kernel column
    Ek = fXk - float(oS.labelMat[k])
    return Ek
def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i, oS, Ei)   # this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy()
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L==H: print "L==H"; return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]   # changed for kernel
        if eta >= 0: print "eta>=0"; return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS, j)   # added for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])   # update i by the same amount as j, in the opposite direction
        updateEk(oS, i)   # added for the Ecache
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0
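innerL calls selectJ and updateEk, which never made it into these notes; sketches consistent with the eCache fields above (reproduced from the book's versions as I remember them):
def selectJ(i, oS, Ei):   # pick j to maximize |Ei - Ej| (second-choice heuristic)
    maxK = -1; maxDeltaE = 0; Ej = 0
    oS.eCache[i] = [1,Ei]   # mark Ei as valid
    validEcacheList = nonzero(oS.eCache[:,0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:
            if k == i: continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k; maxDeltaE = deltaE; Ej = Ek
        return maxK, Ej
    else:   # no valid cache entries yet: fall back to a random j
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
    return j, Ej
def updateEk(oS, k):   # recompute Ek and store it in the cache
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1,Ek]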
def calcWs(alphas,dataArr,classLabels):   # recover w for classification; the final score still needs + b
    X = mat(dataArr); labelMat = mat(classLabels).transpose()
    m,n = shape(X)
    w = zeros((n,1))
    for i in range(m):
        w += multiply(alphas[i]*labelMat[i],X[i,:].T)
    return w
ws = calcWs(alphas, dataArr, labelArr)
datMat = mat(dataArr)
datMat[2]*mat(ws)+b   # raw decision value for sample 2
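To read that value as a class label, take its sign and compare it with the stored label:
print sign(datMat[2]*mat(ws) + b), labelArr[2]   # predicted vs. true label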
Kernel transform functions
def testRbf(k1=1.3):
    dataArr,labelArr = loadDataSet('testSetRBF.txt')
    b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))   # C=200 important
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A>0)[0]
    sVs = datMat[svInd]   # get matrix of only support vectors
    labelSV = labelMat[svInd]
    print "there are %d Support Vectors" % shape(sVs)[0]
    m,n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
        predict = kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr,labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
        predict = kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount)/m)
def kernelTrans(X, A, kTup):   # calc the kernel or transform data to a higher dimensional space
    m,n = shape(X)
    K = mat(zeros((m,1)))
    if kTup[0]=='lin': K = X * A.T   # linear kernel
    elif kTup[0]=='rbf':
        for j in range(m):   # Gaussian kernel, one entry per training row
            deltaRow = X[j,:] - A
            K[j] = deltaRow*deltaRow.T   # squared distance of each row to A
        K = exp(K/(-1*kTup[1]**2))   # division in NumPy is element-wise, not matrix division as in MATLAB; note the denominator is sigma^2 here -- why not 2*sigma^2?
    else: raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K
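On the question in the comment above: the conventional Gaussian (RBF) kernel is
k(x, y) = exp( -||x - y||² / (2σ²) )
while the code divides by σ² only (kTup[1] = σ), so its σ is effectively √2 times the textbook one; the kernel is still valid, only the bandwidth parametrization differs.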
Revisiting the handwritten digit recognition system
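testDigits below calls loadImages, which is not listed in these notes; a sketch consistent with the earlier img2vector, assuming the book's two-class setup that labels the digit 9 as -1 and everything else as +1:
from os import listdir
def loadImages(dirName):
    hwLabels = []
    trainingFileList = listdir(dirName)
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])   # digit encoded in the file name, e.g. 9_45.txt
        if classNumStr == 9: hwLabels.append(-1)   # binary task: 9 vs. not-9
        else: hwLabels.append(1)
        trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels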
def testDigits(kTup=('rbf', 10)):
    dataArr,labelArr = loadImages('trainingDigits')
    b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A>0)[0]   # nonzero converts the boolean array into integer index arrays; taking [0] gives the row indices, so this picks out the support vectors (alphas > 0)
    sVs = datMat[svInd]   # .A exposes the matrix's underlying array: indexing it yields a plain value [i] rather than the nested [[i]] a matrix gives; often combined with flatten
    labelSV = labelMat[svInd]   # matrix subset
    print "there are %d Support Vectors" % shape(sVs)[0]
    m,n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
        predict = kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr,labelArr = loadImages('testDigits')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
        predict = kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount)/m)
Improving classification performance with AdaBoost
from numpy import *
import matplotlib.pyplot as plt
datMat = matrix([[ 1. , 2.1],
                 [ 1.5, 1.6],
                 [ 1.3, 1. ],
                 [ 1. , 1. ],
                 [ 2. , 1. ]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
xcord0 = []; ycord0 = []
xcord1 = []; ycord1 = []
for i in range(len(classLabels)):
    if classLabels[i]==1.0:
        xcord1.append(datMat[i,0]); ycord1.append(datMat[i,1])
    else:
        xcord0.append(datMat[i,0]); ycord0.append(datMat[i,1])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord0,ycord0, marker='s', s=90)
ax.scatter(xcord1,ycord1, marker='o', s=50, c='red')
plt.title('decision stump test data')
plt.show()
Decision stump generator
def buildStump(dataArr,classLabels,D):   # a weak, single-split classifier
    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
    m,n = shape(dataMatrix)
    numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m,1)))
    minError = inf   # init error sum to +infinity
    for i in range(n):   # loop over all dimensions (columns)
        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max()
        stepSize = (rangeMax-rangeMin)/numSteps
        for j in range(-1,int(numSteps)+1):   # step across the current dimension, going slightly past both ends, which is why both 'lt' and 'gt' are needed
            for inequal in ['lt', 'gt']:   # go over less than and greater than
                threshVal = (rangeMin + float(j) * stepSize)   # start just below the feature's minimum and move up step by step
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)   # call stump classify with i, j, lessThan
                errArr = mat(ones((m,1)))
                errArr[predictedVals == labelMat] = 0   # note the vectorized error computation
                weightedError = D.T*errArr   # total error weighted by D; this weighted error rate is what makes AdaBoost special -- it reduces to the plain error rate when all weights are equal
                #print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):   # classify the data with a single threshold; the caller scans thresholds to find the split
    retArray = ones((shape(dataMatrix)[0],1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0   # classify by threshold
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
    return retArray
The full AdaBoost algorithm:
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m,1))/m)   # init D to all equal
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)   # build stump
        #print "D:",D.T
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))   # calc alpha; max(error, eps) guards against division by zero when error=0. Each weak classifier gets its own weight (they need not sum to 1)
        bestStump['alpha'] = alpha   # store the weight with the classifier
        weakClassArr.append(bestStump)   # store stump params in array
        #print "classEst: ",classEst.T
        expon = multiply(-1*alpha*mat(classLabels).T,classEst)   # exponent for the D update; this neatly produces -alpha for correct and +alpha for wrong predictions in one expression
        D = multiply(D,exp(expon))   # calc new D for next iteration
        D = D/D.sum()
        #calc training error of all classifiers, if this is 0 quit for loop early (use break)
        aggClassEst += alpha*classEst   # running class-estimate total accumulated over all stumps
        #print "aggClassEst: ",aggClassEst.T
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))   # error of the running aggregate, thresholded through sign; note the use of multiply
        errorRate = aggErrors.sum()/m
        print "total error: ",errorRate
        if errorRate == 0.0: break
    return weakClassArr,aggClassEst
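The two update rules implemented above, written out (ε the weighted error, y_i ∈ {−1,+1}):
α = ½ · ln( (1 − ε) / ε )
D_i ← D_i · exp( −α · y_i · h(x_i) ) / Z
so correctly classified points have their weight shrunk by e^(−α) and misclassified ones grown by e^(+α), with Z normalizing D to sum to 1.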
Testing the algorithm: classifying with AdaBoost
def adaClassify(datToClass,classifierArr):
    dataMatrix = mat(datToClass)   # mirrors the aggClassEst logic at the end of adaBoostTrainDS
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m,1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\
                                 classifierArr[i]['thresh'],\
                                 classifierArr[i]['ineq'])   # run the sample through each trained stump; alpha is that stump's weight from training
        aggClassEst += classifierArr[i]['alpha']*classEst
        print aggClassEst
    return sign(aggClassEst)
Plotting the ROC curve and computing the AUC
def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0,1.0)   # cursor
    ySum = 0.0        # variable to calculate AUC
    numPosClas = sum(array(classLabels)==1.0)   # note this way of counting the positives
    yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas)   # true-positive and false-positive step sizes
    sortedIndicies = predStrengths.argsort()   # get sorted index (ascending, smallest first)
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    #loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0; delY = yStep
        else:
            delX = xStep; delY = 0
            ySum += cur[1]
        #draw line from cur to (cur[0]-delX,cur[1]-delY)
        ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b')   # plot([x1,x2],[y1,y2]) draws between corresponding points
        cur = (cur[0]-delX,cur[1]-delY)
    ax.plot([0,1],[0,1],'b--')
    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0,1,0,1])
    plt.show()
    print "the Area Under the Curve is: ",ySum*xStep
Predicting numeric values with regression
Ordinary linear regression
def loadDataSet(fileName):   # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1   # get number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
def standRegres(xArr,yArr):
    xMat = mat(xArr); yMat = mat(yArr).T   # note that the data's x0 column is 1, so w0 acts as the intercept
    xTx = xMat.T*xMat
    if linalg.det(xTx) == 0.0:   # check whether the determinant is zero
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T*yMat)   # solve for w; with the linalg module this can also be written ws = linalg.solve(xTx, xMat.T*yMat)
    return ws
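standRegres is just the normal equations; in matrix form the solution it computes is
ŵ = (XᵀX)⁻¹ Xᵀ y
which requires XᵀX to be invertible, hence the determinant check.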
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat*ws
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0])   # note the flatten + .A idiom: flatten the matrix and take its base array to get a 1-D array, handy when plotting matrices
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy*ws
ax.plot(xCopy[:,1],yHat)
plt.show()
corrcoef(yHat.T,yMat)   # correlation coefficient in NumPy
Locally weighted linear regression
def lwlrTest(testArr,xArr,yArr,k=1.0):   # loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)   # predict each point separately
    return yHat
def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))   # diagonal weight matrix: keeping the weights on a diagonal, rather than in an ordinary matrix, makes the later matrix algebra clean
    for j in range(m):   # the next two lines create the weights matrix
        diffMat = testPoint - xMat[j,:]   # kernel weight
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))   # the squared distance plays the role of |x - xj|
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
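For each query point x, lwlr solves the weighted normal equations
ŵ(x) = (Xᵀ W X)⁻¹ Xᵀ W y,   W_jj = exp( −||x − x_j||² / (2k²) )
so nearby training points dominate the fit, and k controls how fast their influence decays.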
xMat = mat(xArr)
srtInd = xMat[:,1].argsort(0)   # sort along axis 0 (down the rows); returns the indices, here in matrix form
xSort = xMat[srtInd][:,0,:]     # note this tricky usage: fancy indexing with an index array copies the data and adds a dimension, hence the extra [:,0,:]
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:,1],yHat[srtInd])
ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).flatten().A[0], s=2, c='red')
plt.show()
Predicting the age of an abalone
def rssError(yArr,yHatArr):   # yArr and yHatArr both need to be arrays
    return ((yArr-yHatArr)**2).sum()
Ridge regression
def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat,0)   # mean along axis 0 (per column)
    yMat = yMat - yMean    # mean-center y to eliminate the X0 term; note y is only centered, not variance-scaled
    #regularize X's
    xMeans = mean(xMat,0)   # calc mean then subtract it off
    xVar = var(xMat,0)      # calc variance of each Xi then divide by it
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))   # lambda should vary on an exponential scale
        wMat[i,:] = ws.T   # store the weights for each lambda
    return wMat
def ridgeRegres(xMat,yMat,lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam   # add lambda down the diagonal
    if linalg.det(denom) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = denom.I * (xMat.T*yMat)
    return ws
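Ridge regression shrinks the ordinary least-squares solution by adding λI before inverting:
ŵ = (XᵀX + λI)⁻¹ Xᵀ y
Since λ > 0 makes the matrix strictly positive definite, the inverse always exists; the determinant check above only matters when λ is 0 or negligible.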
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)   # the x axis is effectively log(lambda)
plt.show()
Lasso
Forward stagewise regression
def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean   # can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n = shape(xMat)
    returnMat = zeros((numIt,n))   # records the weights at every iteration
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print ws.T
        lowestError = inf   # +infinity
        for j in range(n):
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest   # since ws starts at all zeros, untouched coordinates contribute nothing
                rssE = rssError(yMat.A,yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:] = ws.T
    return returnMat
def regularize(xMat):   # regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat,0)   # calc mean then subtract it off
    inVar = var(inMat,0)      # calc variance of Xi then divide by it
    inMat = (inMat - inMeans)/inVar
    return inMat
Example: predicting LEGO set prices
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10)   # avoid making too many API calls in a short time
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice)
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print 'problem with item %d' % i
def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
Building the model
shape(lgX)
lgX1 = mat(ones((58,5)))   # set X0 to 1; the other columns are copied over
lgX1[:,1:5] = mat(lgX)
ws = regression.standRegres(lgX1,lgY)
Cross-validation
def crossValidation(xArr,yArr,numVal=10):
    m = len(yArr)
    indexList = range(m)
    errorMat = zeros((numVal,30))   # error matrix: 30 columns, numVal rows
    for i in range(numVal):
        trainX = []; trainY = []   # containers for the training and test sets
        testX = []; testY = []
        random.shuffle(indexList)   # shuffle the index list
        for j in range(m):   # create the training set from the first 90% of values in indexList
            if j < m*0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)   # get 30 weight vectors from ridge
        for k in range(30):   # loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX = mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain   # regularize the test set with the training-set parameters -- standardize the test data using statistics computed on the training data
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)   # test ridge results and store
            errorMat[i,k] = rssError(yEst.T.A,array(testY))   # note the transpose followed by .A to get the base array
            #print errorMat[i,k]
    meanErrors = mean(errorMat,0)   # average performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]   # note the nonzero idiom
    #can unregularize to get model
    #when we regularized we wrote Xreg = (x-meanX)/var(x)
    #we can now write in terms of x not Xreg: x*w/var(x) - meanX/var(x) + meanY   # undo the standardization
    xMat = mat(xArr); yMat = mat(yArr).T
    meanX = mean(xMat,0); varX = var(xMat,0)
    unReg = bestWeights/varX   # note the element-wise pattern: array against matrix
    print "the best model from Ridge Regression is:\n",unReg
    print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)
Tree Regression
def loadDataSet(fileName):   # general function to parse tab-delimited floats
    dataMat = []   # assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float,curLine)   # map all elements to float(): map applies the function to each element
        dataMat.append(fltLine)
    return dataMat
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):   # assume dataSet is a NumPy mat so we can use array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)   # choose the best split
    if feat == None: return val   # if the splitting hit a stop condition return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
def regLeaf(dataSet):   # returns the value used for each leaf
    return mean(dataSet[:,-1])
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):   # note that leafType is bound to the function regLeaf and invoked through the parameter name -- functions are first-class values in Python
    tolS = ops[0]; tolN = ops[1]   # tolS: minimum allowed error reduction; tolN: minimum number of samples per split
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:   # exit cond 1; note the tolist()[0] idiom for a one-column matrix, and the set for deduplication
        return None, leafType(dataSet)
    m,n = shape(dataSet)
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):   # for each feature (note the range: the last column is the target)
        for splitVal in set(dataSet[:,featIndex].T.tolist()[0]):   # for each value of that feature
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < tolS:   # quit if the error reduction is too small
        return None, leafType(dataSet)   # exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):   # exit cond 3
        return None, leafType(dataSet)
    return bestIndex,bestValue   # returns the best feature to split on and the value used for that split
def regErr(dataSet):
    return var(dataSet[:,-1]) * shape(dataSet)[0]   # total squared error = variance * number of samples
def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1
Tree pruning
def isTree(obj):
    return (type(obj).__name__=='dict')   # a subtree is stored as a dict
def getMean(tree):
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0
def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree)   # if we have no test data collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])):   # if the branches are not trees try to prune them
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    #if they are now both leafs, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2)) +\
            sum(power(rSet[:,-1] - tree['right'],2))   # error without merging
        treeMean = (tree['left']+tree['right'])/2.0    # the merged value is the plain average of the two leaves -- is that appropriate?
        errorMerge = sum(power(testData[:,-1] - treeMean,2))   # error after merging
        if errorMerge < errorNoMerge:
            print "merging"
            return treeMean
        else: return tree
    else: return tree
Model trees
def modelErr(dataSet):   # fit a regression at each leaf and compute its error
    ws,X,Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat,2))   # vectorized
def linearSolve(dataSet):   # helper function used in two places
    m,n = shape(dataSet)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))   # create a copy of the data with 1 in the 0th position: as usual the bias b becomes the constant X0=1 column, initialized to 1 and never changed
    X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]   # and strip out Y
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws,X,Y
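Using these hooks with createTree also needs a modelLeaf counterpart to regLeaf, returning the fitted weights at each leaf (the book's version, reproduced here for completeness):
def modelLeaf(dataSet):   # the leaf stores the regression coefficients instead of a mean
    ws,X,Y = linearSolve(dataSet)
    return ws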
Comparing tree regression with standard regression
def createForeCast(tree, testData, modelEval=regTreeEval):
    m = len(testData)
    yHat = mat(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat
def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
        else: return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
        else: return modelEval(tree['right'], inData)
def regTreeEval(model, inDat):
    return float(model)
def modelTreeEval(model, inDat):
    n = shape(inDat)[1]
    X = mat(ones((1,n+1)))
    X[:,1:n+1] = inDat
    return float(X*model)
Building a GUI with Tkinter
from Tkinter import *
from numpy import *
import regTrees
root = Tk()
def reDraw(tolS,tolN):
    pass
def drawNewTree():
    pass
Label(root, text="tolN").grid(row=1, column=0)
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
Label(root, text="tolS").grid(row=2, column=0)
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0, '1.0')
Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)
reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt'))
reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
reDraw(1.0, 10)
root.mainloop()
Grouping unlabeled data with k-means clustering
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))   # matrix assigning data points to centroids: column 0 is the cluster index, column 1 the squared distance (SE) to the centroid
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):   # for each data point assign it to the closest centroid
            minDist = inf; minIndex = -1   # common idiom: start from sentinel values (inf, -1) and improve them
            for j in range(k):
                distJI = distMeas(centroids[j,:],dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i,0] != minIndex: clusterChanged = True   # if no assignment changed, the algorithm has converged and the loop can stop
            clusterAssment[i,:] = minIndex,minDist**2   # note the double assignment in one statement
        print centroids
        for cent in range(k):   # recalculate centroids (note this is still inside the while loop)
            ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]   # get all the points in this cluster: matrix filtering first builds a True/False array, then nonzero returns the indices of the True entries
            centroids[cent,:] = mean(ptsInClust, axis=0)   # assign centroid to the column-wise mean
    return centroids, clusterAssment
def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k,n)))   # create centroid mat
    for j in range(n):   # create random cluster centers within the bounds of each dimension
        minJ = min(dataSet[:,j])
        rangeJ = float(max(dataSet[:,j]) - minJ)
        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))   # scalar plus k random numbers gives a column; random.rand fills an array with uniform samples from [0, 1)
    return centroids
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))   # Euclidean distance
Improving cluster performance with post-processing
Bisecting k-means
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]   # start with the centroid of the whole dataset
    centList = [centroid0]   # create a list with one centroid
    for j in range(m):   # calc initial error
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
    while (len(centList) < k):   # keep splitting until there are k centroids
        lowestSSE = inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]   # get the data points currently in cluster i
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)   # call plain k-means to split it in two
            sseSplit = sum(splitClustAss[:,1])   # compare the SSE to the current minimum
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print "sseSplit, and notSplit: ",sseSplit,sseNotSplit
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()   # note the deep copy
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)   # these two lines relabel the two halves: the split cluster keeps its old index bestCentToSplit, and len(centList) becomes the brand-new index (indices start at 0)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print 'the bestCentToSplit is: ',bestCentToSplit
        print 'the len of bestClustAss is: ', len(bestClustAss)
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]   # replace one centroid with the two best centroids: first overwrite the old one, then append the new one
        centList.append(bestNewCents[1,:].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:] = bestClustAss   # reassign new clusters and SSE: update the part that was split
    return mat(centList), clusterAssment
Clustering points on a map
import urllib
import json
def geoGrab(stAddress, city):
    apiStem = 'http://where.yahooapis.com/geocode?'   # create a dict and constants for the geocoder
    params = {}
    params['flags'] = 'J'   # JSON return type; the request parameters live in a dict
    params['appid'] = 'aaa0VN6k'
    params['location'] = '%s %s' % (stAddress, city)   # note this formatting idiom
    url_params = urllib.urlencode(params)
    yahooApi = apiStem + url_params   #print url_params
    print yahooApi
    c = urllib.urlopen(yahooApi)
    return json.loads(c.read())
from time import sleep
def massPlaceFind(fileName):
    fw = open('places.txt', 'w')
    for line in open(fileName).readlines():   # open the file and pull out the arguments
        line = line.strip()
        lineArr = line.split('\t')
        retDict = geoGrab(lineArr[1], lineArr[2])
        if retDict['ResultSet']['Error'] == 0:
            lat = float(retDict['ResultSet']['Results'][0]['latitude'])
            lng = float(retDict['ResultSet']['Results'][0]['longitude'])
            print "%s\t%f\t%f" % (lineArr[0], lat, lng)
            fw.write('%s\t%f\t%f\n' % (line, lat, lng))
        else: print "error fetching"
        sleep(1)   # delay so the API is not hit too frequently
    fw.close()
Computing surface distance from longitude and latitude
def distSLC(vecA, vecB):   # spherical law of cosines; vec[0,0] is longitude, vec[0,1] latitude, in degrees
    a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180)
    b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * \
        cos(pi * (vecB[0,0]-vecA[0,0]) /180)
    return arccos(a + b)*6371.0   # 6371 km is the Earth's radius; pi is imported with numpy
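This is the spherical law of cosines, with latitudes φ and longitudes λ converted from degrees to radians:
d = R · arccos( sin φ₁ · sin φ₂ + cos φ₁ · cos φ₂ · cos(λ₂ − λ₁) ),   R ≈ 6371 km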
Plotting the clustered results
import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(numClust=5):
    datList = []
    for line in open('places.txt').readlines():
        lineArr = line.split('\t')
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1,0.1,0.8,0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', \
                      'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')   # read the image into a matrix
    ax0.imshow(imgP)
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
    plt.show()
Association analysis with Apriori
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:   # membership check; note the brackets -- C1 holds single-item lists
                C1.append([item])
    C1.sort()
    return map(frozenset, C1)   # wrap each inner list in a frozenset (an immutable set) so the itemsets can be used as dict keys
def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):   # is the candidate a subset of this transaction?
                if not ssCnt.has_key(can): ssCnt[can] = 1   # has_key checks key membership; `can not in ssCnt` would also work
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0,key)   # insert at the front of the list; key is a frozenset
        supportData[key] = support   # store the support of every candidate
    return retList, supportData
def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = map(set, dataSet)
    L1, supportData = scanD(D, C1, minSupport)   # itemsets above the support threshold, plus the support of every candidate
    L = [L1]
    k = 2   # k starts at 2; the k-2 offsets below are just index bookkeeping
    while (len(L[k-2]) > 0):   # L is a list of lists
        Ck = aprioriGen(L[k-2], k)   # generate candidate combinations (a list of lists); each level combines the previous level's itemsets, layer by layer
        Lk, supK = scanD(D, Ck, minSupport)   # scan DB to get Lk
        supportData.update(supK)   # merge the new supports with update (dicts have no append)
        L.append(Lk)   # Lk is a nested list like [[1,2],[0,1]]
        k += 1
    return L, supportData
def aprioriGen(Lk, k):   # creates Ck; note how combinations of itemsets are generated
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]   # compare the first k-2 elements; e.g. when building [0,1,2] from [0,1],[0,2],[1,2] this skips redundant merges
            L1.sort(); L2.sort()
            if L1==L2:   # if the first k-2 elements are equal, add the union to retList; [1,3] and [3,5] must not merge into [1,3,5], because [1,5] would also have to meet the threshold
                retList.append(Lk[i] | Lk[j])   # set union via |; one advantage of working with sets
    return retList
Mining association rules from frequent itemsets
def generateRules(L, supportData, minConf=0.7):   # supportData is a dict coming from scanD
    bigRuleList = []
    for i in range(1, len(L)):   # only get the sets with two or more items, hence not starting at 0
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]   # note the frozenset usage
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)   # called for its side effects here; the return value is unused
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []   # create new list to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]   # calc confidence; note the set difference
        if conf >= minConf:
            print freqSet-conseq,'-->',conseq,'conf:',conf
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)   # H holds the items that may appear on the right-hand side of a rule; prunedH keeps the survivors for the next round
    return prunedH
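The quantity computed above is the usual rule confidence, with freqSet = A ∪ B and conseq = B:
conf(A → B) = support(A ∪ B) / support(A)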
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
m = len(H[0])
if (len(freqSet) > (m + 1)): #try further merging
Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidate consequents by merging
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf) #keep only the consequents that meet the confidence threshold; this pruning is the key step that controls the recursion
if (len(Hmp1) > 1): #need at least two sets to merge
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
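Generating rules from the frequent itemsets found above (a sketch):
L, suppData = apriori(loadDataSet(), minSupport=0.5)
rules = generateRules(L, suppData, minConf=0.7)
#each qualifying rule is printed as antecedent --> consequent with its confidence, e.g. frozenset([1]) --> frozenset([3]) conf: 1.0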
Example: finding patterns in congressional voting
from time import sleep
from votesmart import votesmart
votesmart.apikey = 'get your api key first' #replace with your own Votesmart API key
def getActionIds():
actionIdList = []; billTitleList = []
fr = open('recent20bills.txt')
for line in fr.readlines():
billNum = int(line.split('\t')[0])
try: #try-except block guards against API failures
billDetail = votesmart.votes.getBill(billNum) #api call
for action in billDetail.actions:
if action.level == 'House' and \
(action.stage == 'Passage' or action.stage == 'Amendment Vote'):
actionId = int(action.actionId)
print 'bill: %d has actionId: %d' % (billNum, actionId)
actionIdList.append(actionId)
billTitleList.append(line.strip().split('\t')[1])
except:
print "problem getting bill %d" % billNum
sleep(1) #delay to be polite to the API server
return actionIdList, billTitleList
Poisonous mushroom example
mushDatSet=[line.split() for line in open('mushroom.dat').readlines()] #note this one-line idiom for loading a whitespace-delimited dataset
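Mining the mushroom data for itemsets that co-occur with the poisonous label '2' (a sketch following the book's session):
L, suppData = apriori(mushDatSet, minSupport=0.3)
for item in L[1]:
    if item.intersection('2'): print item #frequent 2-itemsets containing the poisonous feature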
Finding frequent itemsets efficiently with FP-growth
class treeNode:
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
self.parent = parentNode #needs to be updated
self.children = {} #child subtrees are stored in a dict
def inc(self, numOccur):
self.count += numOccur
def disp(self, ind=1):
print ' '*ind, self.name, ' ', self.count #' '*ind prints ind spaces, indenting by depth
for child in self.children.values(): #iterate over the dict's values
child.disp(ind+1)
Building the FP-tree
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet): #format the data: each transaction becomes a frozenset key mapped to a count of 1
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
headerTable = {} #header table: item -> count, later reformatted to item -> [count, node-link]
#go over dataSet twice
for trans in dataSet:#first pass counts frequency of occurrence
for item in trans:
headerTable[item] = headerTable.get(item, 0) + dataSet[trans] #get returns the running count, or 0 if item is absent; this assumes each item appears at most once per transaction (e.g. [x,x,y] would be miscounted), since frequent-pattern mining only cares about presence; clean the data beforehand
for k in headerTable.keys(): #remove items not meeting minSup
if headerTable[k] < minSup:
del(headerTable[k]) #dict entries are removed with del
freqItemSet = set(headerTable.keys()) #keys() extracts all the keys from the dict
#print 'freqItemSet: ',freqItemSet
if len(freqItemSet) == 0: return None, None #if no items meet min support, get out; note that Python signals the absence of a value with None
for k in headerTable:
headerTable[k] = [headerTable[k], None] #reformat headerTable to hold [count, node-link]; None is a placeholder for the link to be filled in later
#print 'headerTable: ',headerTable
retTree = treeNode('Null Set', 1, None) #create tree
for tranSet, count in dataSet.items(): #go through the dataset a 2nd time; note that items() yields (key, value) tuples
localD = {}
for item in tranSet: #put transaction items in order
if item in freqItemSet:
localD[item] = headerTable[item][0]
if len(localD) > 0:
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)] #sorting idiom for a dict: take localD.items(), order the (item, count) pairs by count p[1] descending, then keep just the item names v[0] in a list comprehension
updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset
return retTree, headerTable #return tree and header table
def updateTree(items, inTree, headerTable, count):
if items[0] in inTree.children:#check if orderedItems[0] in retTree.children
inTree.children[items[0]].inc(count) #increment the count
else: #add items[0] to inTree.children
inTree.children[items[0]] = treeNode(items[0], count, inTree) #create the node and hang it on the tree
if headerTable[items[0]][1] == None: #update header table
headerTable[items[0]][1] = inTree.children[items[0]] #the new node becomes the head of this item's node-link chain
else:
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:#call updateTree() with the remaining ordered items, one level further down the tree
updateTree(items[1::], inTree.children[items[0]], headerTable, count) #items[1::] drops the first item; inTree.children[items[0]] is the node just updated
def updateHeader(nodeToTest, targetNode): #this version does not use recursion
while (nodeToTest.nodeLink != None): #Do not use recursion to traverse a linked list!
nodeToTest = nodeToTest.nodeLink #walk to the end of the node-link chain before appending
nodeToTest.nodeLink = targetNode
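Building the tree on the sample transactions with minSup=3 (a sketch):
simpDat = loadSimpDat()
initSet = createInitSet(simpDat)
myFPtree, myHeaderTab = createTree(initSet, 3)
myFPtree.disp() #indentation shows depth; the root prints as 'Null Set 1'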
Mining frequent patterns from an FP-tree
def findPrefixPath(basePat, treeNode): #treeNode comes from header table
condPats = {}
while treeNode != None:
prefixPath = []
ascendTree(treeNode, prefixPath)
if len(prefixPath) > 1:
condPats[frozenset(prefixPath[1:])] = treeNode.count
treeNode = treeNode.nodeLink #follow the node link to the next occurrence of this item
return condPats
def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
if leafNode.parent != None:
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath) #recurse upward toward the root
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])] #(sort header table) same sorting idiom as before; here p[1] is the [count, link] pair, which Py2 compares element-wise
for basePat in bigL: #start from bottom of header table
newFreqSet = preFix.copy() #note the copy: the prefix from the previous level must not be mutated
newFreqSet.add(basePat) #extend the prefix into a new frequent itemset
#print 'finalFrequent Item: ',newFreqSet #append to set
freqItemList.append(newFreqSet)
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
#print 'condPattBases :',basePat, condPattBases
#2. construct cond FP-tree from cond. pattern base
myCondTree, myHead = createTree(condPattBases, minSup)
#print 'head from conditional tree: ', myHead
if myHead != None: #3. mine cond. FP-tree
#print 'conditional tree for: ',newFreqSet
#myCondTree.disp(1)
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
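Mining the tree built above (a sketch):
print findPrefixPath('x', myHeaderTab['x'][1]) #conditional pattern base for 'x', e.g. {frozenset(['z']): 3}
freqItems = []
mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
print freqItems #every itemset with support >= 3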
Finding co-occurring words in a Twitter feed
import twitter
from time import sleep
import re
def textParse(bigString):
urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) #re.sub does the regex replacement in one call; the pattern could also be precompiled with re.compile, which differs slightly from writing it inline
listOfTokens = re.split(r'\W*', urlsRemoved) #note re.split: the text is split on runs of non-word characters
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
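A quick sanity check of textParse (the input string here is made up for illustration):
print textParse('Check it out at http://example.com/foo NOW')
#expected: ['check', 'out', 'now']; the URL is removed and tokens of length <= 2 are dropped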
def getLotsOfTweets(searchStr):
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN_KEY = ''
ACCESS_TOKEN_SECRET = ''
api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET,
access_token_key=ACCESS_TOKEN_KEY,
access_token_secret=ACCESS_TOKEN_SECRET)
#you can get 1500 results 15 pages * 100 per page
resultsPages = []
for i in range(1,15):
print "fetching page %d" % i
searchResults = api.GetSearch(searchStr, per_page=100, page=i)
resultsPages.append(searchResults)
sleep(6)
return resultsPages
def mineTweets(tweetArr, minSup=5):
parsedList = []
for i in range(14):
for j in range(100):
parsedList.append(textParse(tweetArr[i][j].text))
initSet = createInitSet(parsedList)
myFPtree, myHeaderTab = createTree(initSet, minSup) #build the tree first
myFreqList = []
mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) #then mine it for frequent patterns
return myFreqList
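Usage following the book's session (a sketch; requires valid credentials in getLotsOfTweets, and the search term is just an example):
lotsOtweets = getLotsOfTweets('RIMM')
listOfTerms = mineTweets(lotsOtweets, 20)
print len(listOfTerms) #number of itemsets appearing in 20 or more tweets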
Simplifying data with PCA
from numpy import *
def loadDataSet(fileName, delim='\t'):
fr = open(fileName)
stringArr = [line.strip().split(delim) for line in fr.readlines()] #strip removes whitespace from both ends of the line
datArr = [map(float,line) for line in stringArr] #map converts every field in each row to float
return mat(datArr)
def pca(dataMat, topNfeat=9999999):
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals #remove mean
covMat = cov(meanRemoved, rowvar=0) #covariance matrix; rowvar=0 treats each column as a variable
eigVals,eigVects = linalg.eig(mat(covMat)) #compute eigenvalues and eigenvectors
eigValInd = argsort(eigVals) #sort, sort goes smallest to largest
eigValInd = eigValInd[:-(topNfeat+1):-1] #cut off unwanted dimensions
redEigVects = eigVects[:,eigValInd] #reorganize eig vects largest to smallest, selecting by column
lowDDataMat = meanRemoved * redEigVects#transform data into new dimensions: a plain matrix product projects the mean-removed data into the new space
reconMat = (lowDDataMat * redEigVects.T) + meanVals #map the low-dimensional data back to the original space; no explicit inverse is needed because the eigenvector matrix is orthonormal, so its transpose is its inverse
return lowDDataMat, reconMat
import matplotlib.pyplot as plt
dataMat=loadDataSet('testSet.txt')
lowDMat,reconMat=pca(dataMat,1) #keep a single principal component
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^',s=90)
ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0], marker='o',s=50,c='red')
plt.show()
Dimensionality reduction on semiconductor data with PCA
Replacing NaN values with the mean
def replaceNanWithMean():
datMat = loadDataSet('secom.data', ' ')
numFeat = shape(datMat)[1]
for i in range(numFeat):
meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #mean of the values that are not NaN; note that '~' negates the boolean mask element-wise
datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean
return datMat #many of the resulting eigenvalues are zero, which means those features are copies of other features and carry no extra information
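To choose how many components to keep, inspect how much variance the top eigenvalues explain (a sketch; eig may return the eigenvalues unsorted, hence the explicit sort):
dataMat = replaceNanWithMean()
meanRemoved = dataMat - mean(dataMat, axis=0)
covMat = cov(meanRemoved, rowvar=0)
eigVals, eigVects = linalg.eig(mat(covMat))
eigVals = sort(eigVals)[::-1] #descending
print sum(eigVals[:6])/sum(eigVals) #fraction of total variance captured by the top 6 components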
Simplifying data with SVD
def loadExData():
return[[0, 0, 0, 2, 2],
[0, 0, 0, 3, 3],
[0, 0, 0, 1, 1],
[1, 1, 1, 0, 0],
[2, 2, 2, 0, 0],
[5, 5, 5, 0, 0],
[1, 1, 1, 0, 0]]
import svdRec
from numpy import *
Data=svdRec.loadExData()
U,Sigma,VT=linalg.svd(Data)
print "U:", U
print "Sigma:", Sigma
print "VT:", VT
Sig3=mat([[Sigma[0],0,0],[0,Sigma[1],0],[0,0,Sigma[2]]])
U[:,:3]*Sig3*VT[:3,:] #reconstruct an approximation of the original Data from only the top 3 singular values
A collaborative-filtering recommendation engine
Distance measures converted to similarities
from numpy import linalg as la
def ecludSim(inA,inB):
return 1.0/(1.0 + la.norm(inA - inB)) #la.norm computes the Euclidean norm; the distance is mapped into a (0,1] similarity
def pearsSim(inA,inB):
if len(inA) < 3 : return 1.0
return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1]
def cosSim(inA,inB):
num = float(inA.T*inB)
denom = la.norm(inA)*la.norm(inB)
return 0.5+0.5*(num/denom)
myMat=mat(loadExData()) #build the demo ratings matrix, then fill in a few ratings
myMat[0,1]=myMat[0,0]=myMat[1,0]=myMat[2,0]=4
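Comparing the three similarity measures on two columns of myMat (a sketch):
print ecludSim(myMat[:,0], myMat[:,4])
print cosSim(myMat[:,0], myMat[:,4])
print pearsSim(myMat[:,0], myMat[:,4]) #all three return values in the 0-1 range, so they are interchangeable in standEst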
def standEst(dataMat, user, simMeas, item):
n = shape(dataMat)[1]
simTotal = 0.0; ratSimTotal = 0.0
for j in range(n):
userRating = dataMat[user,j]
if userRating == 0: continue #continue skips to the next iteration; unlike break it does not leave the loop
overLap = nonzero(logical_and(dataMat[:,item].A>0,
dataMat[:,j].A>0))[0] #logical_and compares the two columns element-wise to find users who rated both items; note the .A and nonzero usage
if len(overLap) == 0: similarity = 0
else: similarity = simMeas(dataMat[overLap,item], \
dataMat[overLap,j])
print 'the %d and %d similarity is: %f' % (item, j, similarity)
simTotal += similarity
ratSimTotal += similarity * userRating
if simTotal == 0: return 0
else: return ratSimTotal/simTotal #weight each rating by its similarity, then normalize so the estimate stays within the 1-5 rating range
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
unratedItems = nonzero(dataMat[user,:].A==0)[1]#find unrated items
if len(unratedItems) == 0: return 'you rated everything'
itemScores = []
for item in unratedItems:
estimatedScore = estMethod(dataMat, user, simMeas, item) #estimate a score for each unrated item
itemScores.append((item, estimatedScore)) #collect (item, score) tuples
return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N] #sort by estimated score, descending, and return the top N; note the compact idiom
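Recommending for a user with the default cosine similarity (a sketch; the printed similarity lines come from standEst):
print recommend(myMat, 2) #a list of (item, estimatedScore) tuples for the top N unrated items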
def svdEst(dataMat, user, simMeas, item):
n = shape(dataMat)[1]
simTotal = 0.0; ratSimTotal = 0.0
U,Sigma,VT = la.svd(dataMat) #singular value decomposition
Sig4 = mat(eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix; note the construction: identity matrix times the top 4 singular values
xformedItems = dataMat.T * U[:,:4] * Sig4.I #map the items into the low-dimensional space: transpose first, then multiply by U and the inverse of Sig4
for j in range(n):
userRating = dataMat[user,j]
if userRating == 0 or j==item: continue
similarity = simMeas(xformedItems[item,:].T,\
xformedItems[j,:].T)
print 'the %d and %d similarity is: %f' % (item, j, similarity)
simTotal += similarity
ratSimTotal += similarity * userRating
if simTotal == 0: return 0
else: return ratSimTotal/simTotal
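The SVD-based estimator plugs in through the estMethod argument (a sketch):
print recommend(myMat, 1, estMethod=svdEst) #same interface, but similarities are computed in the 4-dimensional SVD space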
SVD-based image compression
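imgCompress below calls printMat, which these notes never define; a minimal sketch consistent with how it is called (prints 1 where a value exceeds thresh, else 0, for a 32x32 matrix):
def printMat(inMat, thresh=0.8):
    for i in range(32):
        for k in range(32):
            if float(inMat[i,k]) > thresh: print 1,
            else: print 0,
        print ''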
def imgCompress(numSV=3, thresh=0.8):
myl = []
for line in open('0_5.txt').readlines():
newRow = []
for i in range(32):
newRow.append(int(line[i]))
myl.append(newRow)
myMat = mat(myl)
print "****original matrix******"
printMat(myMat, thresh)
U,Sigma,VT = la.svd(myMat)
SigRecon = mat(zeros((numSV, numSV)))
for k in range(numSV):#construct diagonal matrix from vector
SigRecon[k,k] = Sigma[k]
reconMat = U[:,:numSV]*SigRecon*VT[:numSV,:]
print "****reconstructed matrix using %d singular values******" % numSV
printMat(reconMat, thresh)
Big data and MapReduce
On Linux the mapper and reducer are chained through pipes: cat inputFile.txt|python mapper.py|sort|python reducer.py>outputFile.txt; note this style of execution.
import sys
from numpy import mat, mean, power
def read_input(file):
for line in file:
yield line.rstrip()
input = read_input(sys.stdin)#a generator over the input lines from sys.stdin
input = [float(line) for line in input] #overwrite with floats
numInputs = len(input)
input = mat(input)
sqInput = power(input,2)
#output size, mean, mean(square values)
print "%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)) #calc mean of columns
print >> sys.stderr, "report: still alive" #注意stderr的使用方式 print >>
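The pipeline above also needs a reducer, which these notes do not show; a sketch that combines the per-mapper count/mean/mean-of-squares triples into global statistics:
import sys

def read_input(file):
    for line in file:
        yield line.rstrip()

input = read_input(sys.stdin) #each mapper emitted: count\tmean\tmeanOfSquares
mapperOut = [line.split('\t') for line in input]
cumN = 0.0; cumVal = 0.0; cumSumSq = 0.0
for instance in mapperOut:
    nj = float(instance[0])
    cumN += nj
    cumVal += nj*float(instance[1]) #weighted sum of the per-mapper means
    cumSumSq += nj*float(instance[2]) #weighted sum of the per-mapper mean squares
meanVal = cumVal/cumN
varVal = cumSumSq/cumN - meanVal*meanVal #variance = E[x^2] - (E[x])^2
print "%d\t%f\t%f" % (cumN, meanVal, varVal) #global count, mean, variance
print >> sys.stderr, "report: still alive"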
Walkthrough of a MapReduce script in mrjob
The Pegasos algorithm for distributed SVM