Machine Learning in Action: Study Notes


Machine learning basics

from numpy import *
random.rand(4,4)                 # a random 4x4 array
randMat=mat(random.rand(4,4))    # mat() converts an array into a matrix
invrandMat=randMat.I             # .I gives the matrix inverse
invrandMat*randMat               # matrix multiplication (should be close to the identity)
eye(4)                           # create a 4x4 identity matrix

k-Nearest Neighbors (kNN)

from numpy import *
import operator 
def createDataSet():
    group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A','A','B','B']
    return group, labels

import kNN
group,labels=kNN.createDataSet()


def classify0(inX, dataSet, labels, k): 
    dataSetSize = dataSet.shape[0]  # shape gives the array's dimensions; [0] is the number of rows
    diffMat = tile(inX, (dataSetSize,1)) - dataSet    # tile repeats inX into a matrix the size of dataSet; inX is the vector to classify (broadcast / whole-matrix arithmetic)
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5
    sortedDistIndicies = distances.argsort()      # argsort returns the indices that would sort the array in ascending order
    classCount={}          
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1   # dict.get with a default of 0; setdefault is a similar method
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)   # sort the dict entries by count, descending, with operator.itemgetter; sorted(iterable[, cmp[, key[, reverse]]])
    return sortedClassCount[0][0]

kNN.classify0([0,0],group,labels,3)


def file2matrix(filename):
    fr = open(filename)
    numberOfLines = len(fr.readlines())         # count the number of lines in the file
    returnMat = zeros((numberOfLines,3))        #prepare matrix to return
    classLabelVector = []                       #prepare labels return   
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat,classLabelVector

import kNN
import matplotlib
from numpy import *
import matplotlib.pyplot as plt
datingDataMat,datingLabels=kNN.file2matrix('datingTestSet2.txt')

fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))
ax.axis([-2,25,-0.2,2.0])
plt.xlabel('Percentage of Time Spent Playing Video Games')
plt.ylabel('Liters of Ice Cream Consumed Per Week')
plt.show()


Preparing the data: normalizing numeric values

def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)    # axis 0 runs down the columns, axis 1 across the rows
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))  # an all-zero array with the same shape as dataSet
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m,1))  # broadcast/tile: subtract each column's minimum
    normDataSet = normDataSet/tile(ranges, (m,1))   # divide by each column's range; note that in some packages / means matrix division
    return normDataSet, ranges, minVals

normMat,ranges,minVals=kNN.autoNorm(datingDataMat)


Testing the classifier on the full algorithm

def datingClassTest():
    hoRatio = 0.50      # hold-out ratio: fraction of the data held back for testing
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)   # normalized data
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)  # number of test vectors
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3) # the first argument is the test sample; the last two describe the training set
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
    print errorCount

Building a complete usable system
def classifyPerson():
    resultList = ['not at all','in small doses','in large doses']
    percentTats = float(raw_input("percentage of time spent playing video games?"))
    ffMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCream = float(raw_input("liters of ice cream consumed per year?"))
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat,ranges,minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles,percentTats,iceCream])
    classifierResult = classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print "you will probably like this person:", resultList[classifierResult-1]

Converting images to test vectors
def img2vector(filename):
    returnVect = zeros((1,1024))  
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect

Recognizing handwritten digits with kNN
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')           # list the training files; listdir comes from the os module (from os import listdir)
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]     # file names look like 0_0.txt; keep the '0_0' part
        classNumStr = int(fileStr.split('_')[0])   # the digit before the underscore is the class label for this file
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')        #iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]     #take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) # the first argument is the test vector, the last two are the training data
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
        if (classifierResult != classNumStr): errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount/float(mTest))


Decision trees

Computing the entropy
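The function below computes the Shannon entropy of the class labels (the last column of each record): H = -sum_i p(x_i) * log2 p(x_i).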
from math import log    # the book's trees.py imports log from the math module

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)      # dataSet is a list of records
    labelCounts = {}
    for featVec in dataSet:              # count how often each class label occurs
        currentLabel = featVec[-1]   # the class label is the last column
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1   # running tally per class label
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries     # probability of each class
        shannonEnt -= prob * log(prob,2) #log base 2: accumulate the entropy
    return shannonEnt

Creating the data set
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing','flippers']
    #change to discrete values
    return dataSet, labels

Splitting the data set

def splitDataSet(dataSet, axis, value):  # dataSet is a list of lists
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:  # keep records whose value in column axis equals value
            reducedFeatVec = featVec[:axis]     # everything before the axis feature
            reducedFeatVec.extend(featVec[axis+1:])   # plus everything after it, so the new record contains every column except axis
            retDataSet.append(reducedFeatVec)   # note append vs extend: append keeps a nested list intact, extend unpacks it
    return retDataSet

Choosing the best way to split the data set
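chooseBestFeatureToSplit picks the feature whose split maximizes the information gain: gain(D, feature) = H(D) - sum_v |D_v|/|D| * H(D_v), where D_v is the subset of records with the feature equal to value v.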
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1     # the last column is the label; the rest are features
    baseEntropy = calcShannonEnt(dataSet) # entropy of the whole set (not a conditional entropy)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of feature i
        uniqueVals = set(featList)       # distinct values of feature i; building a set is the fastest way to get the unique elements of a list
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)   # subset where feature i equals value; loop over the values and sum their contributions
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)     # conditional entropy (the last column is the label); a perfect split drives this toward 0
        infoGain = baseEntropy - newEntropy     # information gain
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer

Majority vote:
import operator 
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0           
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)  # note the sorted(...) / itemgetter idiom
    return sortedClassCount[0][0]




Building the decision tree recursively
def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]         # the last column holds the class labels; collect all of them
    if classList.count(classList[0]) == len(classList):  # are all the class labels identical?
        return classList[0]#stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1: # all features are used up but the set is still mixed: fall back to a majority vote (only the label column is left)
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]  # the chosen feature's name
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])  # drop the feature that has just been used
    featValues = [example[bestFeat] for example in dataSet]    # all values of the chosen feature
    uniqueVals = set(featValues) # its distinct values
    for value in uniqueVals:
        subLabels = labels[:]       #copy all of labels, so trees don't mess up existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)  # the subtree is stored under [bestFeatLabel][value]; value is one discrete feature value from the for loop
    return myTree                            

Plotting the tree

Getting the number of leaf nodes and the depth
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes
            numLeafs += getNumLeafs(secondDict[key])
        else:   numLeafs +=1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:   thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth
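
createPlot and plotTree below rely on plotNode, plotMidText and the node-style constants from the book's treePlotter.py, which these notes do not copy; a minimal sketch (assuming matplotlib's annotate API and that plt has been imported as elsewhere in these notes):

import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")   # box style for internal (decision) nodes
leafNode = dict(boxstyle="round4", fc="0.8")         # box style for leaf nodes
arrow_args = dict(arrowstyle="<-")                   # arrow drawn from child back toward its parent

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    # draw one annotated node with an arrow pointing at its parent
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def plotMidText(cntrPt, parentPt, txtString):
    # write the edge label halfway between the child and its parent
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)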

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    #no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses 
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
    plotTree(inTree, (0.5,1.0), '')
    plt.show()

def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on
    numLeafs = getNumLeafs(myTree)  #this determines the x width of this tree
    depth = getTreeDepth(myTree)
    firstStr = myTree.keys()[0]     #the text label for this node should be this
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes   
            plotTree(secondDict[key],cntrPt,str(key))        #recursion
        else:   #it's a leaf node print the leaf node
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
#if you do get a dictonary you know it's a tree, and the first element will be another dict


Testing the algorithm: classifying with the decision tree
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]   
    secondDict = inputTree[firstStr]            
    featIndex = featLabels.index(firstStr)             # index of the feature the root node splits on
    key = testVec[featIndex]           # the test vector's value for that feature
    valueOfFeat = secondDict[key]          # the subtree or leaf reached for that value
    if isinstance(valueOfFeat, dict): 
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
trees.classify(myTree, labels, [0,1]) # pass in a test vector; returns the concrete class

Persisting the decision tree
def storeTree(inputTree,filename):
    import pickle
    fw = open(filename,'w')
    pickle.dump(inputTree,fw)
    fw.close()
    
def grabTree(filename):
    import pickle
    fr = open(filename)
    return pickle.load(fr)


Example: predicting contact lens type with a decision tree
fr=open(r'C:\Users\merit\Downloads\next step\MLiA_SourceCode\machinelearninginaction\Ch03\lenses.txt')
lenses=[inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels=['age','prescript','astigmatic','tearRate']
lensesTree=trees.createTree(lenses,lensesLabels)
treePlotter.createPlot(lensesTree)


Naive Bayes
Building word vectors from text
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec


def createVocabList(dataSet):
    vocabSet = set([])               #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)         #union of the two sets: build a vocabulary of unique words
    return list(vocabSet)

Building document vectors (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):   
    returnVec = [0]*len(vocabList)          # a vector as long as the vocabulary
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1        # mark the word's slot with 1 if it appears in the vocabulary
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
Bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

Computing probabilities from word vectors
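The loop below assumes listOPosts and myVocabList already exist; a minimal setup sketch using the functions above (the bayes module name mirrors the book's file):

import bayes
listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)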



trainMat=[]
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))  # note: append here, not extend


def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # the class prior p(c1) for the abusive class
    p0Num = ones(numWords); p1Num = ones(numWords)      #change to ones(): counts start at 1 (Laplace smoothing); NumPy arrays keep the arithmetic vectorized
    p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0: matching initialization of the denominators
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]                 # vectorized count update
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)          #change to log(): log(a*b) = log a + log b, so summing logs avoids underflow when many small p(w_i|c) would otherwise be multiplied
    p0Vect = log(p0Num/p0Denom)          #change to log(): same trick for class 0
    return p0Vect,p1Vect,pAbusive


Classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):          
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    #element-wise mult: sum the log-probabilities of the words that appear, then add the log prior, i.e. log p(w|c) + log p(c)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0


def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)


Example: filtering spam with naive Bayes

import re
mySent='This book is the best book on Python or M.L, I have ever laid eyes upon.'
regEx=re.compile('\\W*')   # note: \W matches anything that is NOT a word character (letter/digit), the opposite of \w
listofTokens=regEx.split(mySent)
[tok.lower() for tok in listofTokens if len(tok)>0]  # drop empty tokens and lowercase the rest

Text parsing and the full test function
def textParse(bigString):    #input is big string, #output is word list
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] 


def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())  # read the words out of each spam message
        docList.append(wordList)
        fullText.extend(wordList) # note the difference between append and extend
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read()) # read the words out of each ham message
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    trainingSet = range(50); testSet=[]           #create test set
    for i in range(10):  # i is not used below; this simply repeats 10 times
        randIndex = int(random.uniform(0,len(trainingSet))) # a random float in [0, len(trainingSet)), truncated to an index
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # this is how the random test/training split is drawn
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))  # bag-of-words model
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))  # conditional-probability vectors for the training set, as arrays
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])   # bag-of-words again; since classifyNB computes vec2Classify * p1Vec, a count > 1 simply weights that word more, so a set-of-words vector would also work
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]: # convert to an array
            errorCount += 1
            print "classification error",docList[docIndex]
    print 'the error rate is: ',float(errorCount)/len(testSet)        # report the error rate
    #return vocabList,fullText

Logistic regression
Gradient ascent

def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])   # set X0 to 1.0 so the bias term folds into the weight vector (pure w.x form, no separate constant b)
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat
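
gradAscent and the stochastic variants below call sigmoid, which these notes never copy; the book's one-liner:

def sigmoid(inX):
    return 1.0/(1+exp(-inX))   # works element-wise on NumPy arrays/matrices as well as on scalars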




def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)             #convert to NumPy matrix (a matrix is a 2-D specialization of an array)
    labelMat = mat(classLabels).transpose() #convert to NumPy matrix and transpose into a column vector
    m,n = shape(dataMatrix)     # matrix dimensions
    alpha = 0.001
    maxCycles = 500
    weights = ones((n,1))   # initialize every weight to 1
    for k in range(maxCycles):              #heavy on matrix operations
        h = sigmoid(dataMatrix*weights)     #matrix mult (not element-wise): every sample takes part in each update
        error = (labelMat - h)              #vector subtraction
        weights = weights + alpha * dataMatrix.transpose()* error #matrix mult: the gradient is the vector of partial derivatives of the log-likelihood w.r.t. the weights; see http://blog.csdn.net/bbbeoy/article/details/72578478
    return weights


Plotting the decision boundary

def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat,labelMat=loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0] 
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i])== 1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0]-weights[1]*x)/weights[2]       # the boundary is where the sigmoid argument is 0: w0 + w1*x1 + w2*x2 = 0, solved for x2 in terms of x1
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2');
    plt.show()

Stochastic gradient ascent

def stocGradAscent0(dataMatrix, classLabels):
    m,n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)   #initialize to all ones
    for i in range(m):               # note: iterate over the samples, in order
        h = sigmoid(sum(dataMatrix[i]*weights))    # one sample at a time
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]    # update from a single sample, no matrix algebra
    return weights

Improved stochastic gradient ascent
def stocGradAscent1(dataMatrix, classLabels, numIter=150): # numIter sets the number of passes over the data
    m,n = shape(dataMatrix)
    weights = ones(n)   #initialize to all ones
    for j in range(numIter):
        dataIndex = range(m)
        for i in range(m):
            alpha = 4/(1.0+j+i)+0.0001    # alpha decreases with the iteration but never reaches 0, thanks to the constant term
            randIndex = int(random.uniform(0,len(dataIndex)))# pick samples in random order
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights

Example: predicting horse colic mortality

Dealing with missing values in the data
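The book's recipe is to replace each missing feature value with 0, since a zero feature leaves the stochastic update weights + alpha * error * dataMatrix[i] untouched, and to drop any record whose class label is missing. A minimal preprocessing sketch, assuming the raw UCI file marks missing entries with '?' (the files shipped with the book are already cleaned this way):

def cleanColicLine(line):
    # hypothetical helper: map '?' to 0.0 and every other field to float
    return [0.0 if tok == '?' else float(tok) for tok in line.strip().split()]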

Testing with logistic regression
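colicTest below relies on classifyVector, which is not copied here; the book's version thresholds the sigmoid output at 0.5:

def classifyVector(inX, weights):
    prob = sigmoid(sum(inX*weights))   # weighted sum pushed through the sigmoid
    if prob > 0.5: return 1.0
    else: return 0.0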

def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr =[]
        for i in range(21):  # convert each string field to a float one by one; this could also be vectorized, e.g. with map(float, currLine[:21])
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr =[]
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print "the error rate of this test is: %f" % errorRate
    return errorRate


def multiTest():
    numTests = 10; errorSum=0.0         # run the test 10 times: the improved stochastic gradient ascent is randomized, so each run learns slightly different weights (iteration count and step size also matter)
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests))
        

Support vector machines
Simplified SMO
def loadDataSet(fileName):
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat,labelMat



def selectJrand(i,m):
    j=i #we want to select any J not equal to i
    while (j==i):
        j = int(random.uniform(0,m))   # keep drawing until j != i, hence the while loop
    return j

def clipAlpha(aj,H,L):    # clip aj into the interval [L, H]
    if aj > H: 
        aj = H
    if L > aj:
        aj = L
    return aj


def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
    b = 0; m,n = shape(dataMatrix)
    alphas = mat(zeros((m,1)))  # an m-by-1 matrix of zeros
    iter = 0
    while (iter < maxIter):
        alphaPairsChanged = 0       # tracks whether any alpha pair gets optimized on this pass
        for i in range(m):
            fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b   # predicted value for sample i; see http://blog.csdn.net/bbbeoy/article/details/72598473
            Ei = fXi - float(labelMat[i])#if checks if an example violates KKT conditions
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):# the error exceeds the tolerance in either direction and this alpha can still move (clipAlpha enforces the box constraint later)
                j = selectJrand(i,m)   # pick a second, different alpha at random
                fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b  # predicted value for sample j
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy(); 
                if (labelMat[i] != labelMat[j]):     # when the two labels differ, the feasible range for alpha j is computed this way
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L==H: print "L==H"; continue
                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T  # eta = 2*K12 - K11 - K22, the negative of the usual K11 + K22 - 2*K12 (related to the objective's second derivative), hence the subtraction and the eta >= 0 check below
                if eta >= 0: print "eta>=0"; continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j],H,L)
                if (abs(alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; continue
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])#update i by the same amount as j
                                                                        #the update is in the oppostie direction
                b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
                b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
                if (0 < alphas[i]) and (C > alphas[i]): b = b1  # update b; see http://blog.csdn.net/bbbeoy/article/details/72469136
                elif (0 < alphas[j]) and (C > alphas[j]): b = b2
                else: b = (b1 + b2)/2.0
                alphaPairsChanged += 1
                print "iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
        if (alphaPairsChanged == 0): iter += 1
        else: iter = 0
        print "iteration number: %d" % iter
    return b,alphas


b,alphas=svmMLiA.smoSimple(dataArr,labelArr,0.6,0.001,40)

Counting the support vectors:

for i in range(100):
    if alphas[i]>0.0: print dataArr[i],labelArr[i]


The full Platt SMO algorithm
def smoP(dataMatIn, classLabels, C, toler, maxIter,kTup=('lin', 0)):    #full Platt SMO
    oS = optStruct(mat(dataMatIn),mat(classLabels).transpose(),C,toler, kTup)
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:   #go over all
            for i in range(oS.m):        
                alphaPairsChanged += innerL(i,oS)
                print "fullSet, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
            iter += 1
        else:#go over non-bound (railed) alphas
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i,oS)
                print "non-bound, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)
            iter += 1
        if entireSet: entireSet = False #toggle entire set loop
        elif (alphaPairsChanged == 0): entireSet = True  
        print "iteration number: %d" % iter
    return oS.b,oS.alphas


class optStruct:
    def __init__(self,dataMatIn, classLabels, C, toler, kTup):  # Initialize the structure with the parameters 
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2))) #first column is valid flag
        self.K = mat(zeros((self.m,self.m)))
        for i in range(self.m):
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)

def calcEk(oS, k):
    fXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek
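
innerL below also needs selectJ (the second-choice heuristic built on the error cache eCache) and updateEk, neither of which is copied in these notes; a sketch following the book's approach:

def selectJ(i, oS, Ei):   # pick the second alpha so that |Ei - Ej| is as large as possible
    maxK = -1; maxDeltaE = 0; Ej = 0
    oS.eCache[i] = [1,Ei]  # mark Ei as valid in the error cache
    validEcacheList = nonzero(oS.eCache[:,0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:   # scan the cached errors for the largest step
            if k == i: continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k; maxDeltaE = deltaE; Ej = Ek
        return maxK, Ej
    else:   # first pass: no cached errors yet, so fall back to a random choice
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
    return j, Ej

def updateEk(oS, k):   # recompute and cache Ek after an alpha changes
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1,Ek]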



def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i, oS, Ei) #this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L==H: print "L==H"; return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j] #changed for kernel
        if eta >= 0: print "eta>=0"; return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS, j) #added this for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])#update i by the same amount as j
        updateEk(oS, i) #added this for the Ecache                    #the update is in the oppostie direction
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0

def calcWs(alphas,dataArr,classLabels):  # recover the weight vector w from the alphas; classification still needs + b afterwards
    X = mat(dataArr); labelMat = mat(classLabels).transpose()
    m,n = shape(X)
    w = zeros((n,1))
    for i in range(m):
        w += multiply(alphas[i]*labelMat[i],X[i,:].T)
    return w

datMat[2]*mat(ws)+b   # classify one sample (assuming datMat = mat(dataArr)); compare the sign of this value with labelArr[2]

Kernel functions
def testRbf(k1=1.3):
    dataArr,labelArr = loadDataSet('testSetRBF.txt')
    b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) #C=200 important
    datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd=nonzero(alphas.A>0)[0]
    sVs=datMat[svInd] #get matrix of only support vectors
    labelSV = labelMat[svInd];
    print "there are %d Support Vectors" % shape(sVs)[0]
    m,n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict)!=sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr,labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict)!=sign(labelArr[i]): errorCount += 1    
    print "the test error rate is: %f" % (float(errorCount)/m)    





class optStruct:
    def __init__(self,dataMatIn, classLabels, C, toler, kTup):  # Initialize the structure with the parameters 
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2))) #first column is valid flag
        self.K = mat(zeros((self.m,self.m)))   # m-by-m matrix of kernel-transformed data
        for i in range(self.m):  # column i holds the kernel values between sample i and every other sample
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)
        


def kernelTrans(X, A, kTup):   #calc the kernel or transform data to a higher dimensional space
    m,n = shape(X)
    K = mat(zeros((m,1)))
    if kTup[0]=='lin': K = X * A.T   #linear kernel
    elif kTup[0]=='rbf':
        for j in range(m):   # Gaussian (RBF) kernel: K(x_j, A) = exp(-||x_j - A||^2 / sigma^2) with the book's convention
            deltaRow = X[j,:] - A
            K[j] = deltaRow*deltaRow.T  # squared distance between sample j and A
        K = exp(K/(-1*kTup[1]**2)) #divide in NumPy is element-wise, not matrix division as in MATLAB; note the book divides by sigma^2 rather than the more common 2*sigma^2
    else: raise NameError('Houston We Have a Problem -- \
    That Kernel is not recognized')
    return K





def calcEk(oS, k):
    fXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)   # same as the linear version but written with the precomputed kernel column K[:,k] in place of X*X[k,:].T
    Ek = fXk - float(oS.labelMat[k])
    return Ek


def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i, oS, Ei) #this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L==H: print "L==H"; return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j] #changed for kernel
        if eta >= 0: print "eta>=0"; return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS, j) #added this for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])#update i by the same amount as j
        updateEk(oS, i) #added this for the Ecache                    #the update is in the oppostie direction
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0


The handwriting recognition problem revisited
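testDigits below calls loadImages, which these notes do not copy; a sketch following the book, reusing img2vector from the kNN chapter. The SVM here is binary, so digit 9 is labelled -1 and every other digit +1:

from os import listdir

def loadImages(dirName):
    hwLabels = []
    trainingFileList = listdir(dirName)         # one 32x32 text image per file, named like 9_45.txt
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]     # strip the .txt extension
        classNumStr = int(fileStr.split('_')[0])
        if classNumStr == 9: hwLabels.append(-1)
        else: hwLabels.append(1)
        trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels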







def testDigits(kTup=('rbf', 10)):
    dataArr,labelArr = loadImages('trainingDigits')
    b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd=nonzero(alphas.A>0)[0]   # nonzero turns the boolean mask (alphas > 0) into integer indices; these rows are the support vectors
    sVs=datMat[svInd]    # .A exposes the matrix's underlying array so the comparison gives a flat boolean mask; indexing datMat with the integer array keeps only the support-vector rows
    labelSV = labelMat[svInd]; # the matching labels (a sub-matrix)
    print "there are %d Support Vectors" % shape(sVs)[0]
    m,n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict)!=sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr,labelArr = loadImages('testDigits')
    errorCount = 0
    datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
        if sign(predict)!=sign(labelArr[i]): errorCount += 1    
    print "the test error rate is: %f" % (float(errorCount)/m) 



Improving classification performance with AdaBoost



datMat = matrix([[ 1. ,  2.1],
        [ 1.5,  1.6],
        [ 1.3,  1. ],
        [ 1. ,  1. ],
        [ 2. ,  1. ]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]


import matplotlib.pyplot as plt   # needed for the scatter plot below

xcord0 = []
ycord0 = []
xcord1 = []
ycord1 = []
markers =[]
colors =[]

for i in range(len(classLabels)):
    if classLabels[i]==1.0:
        xcord1.append(datMat[i,0]), ycord1.append(datMat[i,1])
    else:
        xcord0.append(datMat[i,0]), ycord0.append(datMat[i,1])
fig = plt.figure()
ax = fig.add_subplot(111)       
ax.scatter(xcord0,ycord0, marker='s', s=90)
ax.scatter(xcord1,ycord1, marker='o', s=50, c='red')
plt.title('decision stump test data')
plt.show()

Generating a decision stump


def buildStump(dataArr,classLabels,D): # a weak, single-split (decision stump) classifier
    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
    m,n = shape(dataMatrix)
    numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m,1)))
    minError = inf #init error sum, to +infinity
    for i in range(n):#loop over all dimensions (columns)
        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();
        stepSize = (rangeMax-rangeMin)/numSteps
        for j in range(-1,int(numSteps)+1):#loop over the value range of the current dimension, stepping a little beyond both ends, which is why both 'lt' and 'gt' are tried
            for inequal in ['lt', 'gt']: #go over less than and greater than
                threshVal = (rangeMin + float(j) * stepSize)   # thresholds start just below the feature's minimum and step upward
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)#call stump classify with i, j, lessThan
                errArr = mat(ones((m,1)))
                errArr[predictedVals == labelMat] = 0   # vectorized: zero out the correctly classified samples
                weightedError = D.T*errArr  #calc total error multiplied by D: AdaBoost uses a weighted error rate, which reduces to the ordinary error rate when all weights are equal
                #print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst



def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):#just classify the data: a single-split classifier, evaluated by scanning thresholds
    retArray = ones((shape(dataMatrix)[0],1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0   # classify by thresholding the chosen column
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
    return retArray


The full AdaBoost algorithm:


def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m,1))/m)   #init D to all equal
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)#build Stump
        #print "D:",D.T
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))#calc alpha, throw in max(error,eps) to account for error=0; alpha is this weak classifier's weight (the alphas need not sum to 1)
        bestStump['alpha'] = alpha   # store the weight with the stump
        weakClassArr.append(bestStump)                  #store Stump Params in Array
        #print "classEst: ",classEst.T
        expon = multiply(-1*alpha*mat(classLabels).T,classEst) #exponent for D calc; one vectorized expression covers both the correctly and the incorrectly classified cases
        D = multiply(D,exp(expon))                              #Calc New D for next iteration
        D = D/D.sum()
        #calc training error of all classifiers, if this is 0 quit for loop early (use break)
        aggClassEst += alpha*classEst   # running, weighted class estimate for every point
        #print "aggClassEst: ",aggClassEst.T
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))   # errors of the aggregate classifier so far, thresholded with sign(); note the use of multiply
        errorRate = aggErrors.sum()/m
        print "total error: ",errorRate
        if errorRate == 0.0: break
    return weakClassArr,aggClassEst

Testing the algorithm: classification with AdaBoost

def adaClassify(datToClass,classifierArr):
    dataMatrix = mat(datToClass)#do stuff similar to last aggClassEst in adaBoostTrainDS
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m,1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\
                                 classifierArr[i]['thresh'],\
                                 classifierArr[i]['ineq'])#call stump classify: run the sample through each of the trained stumps; alpha below is that stump's weight
        aggClassEst += classifierArr[i]['alpha']*classEst
        print aggClassEst
    return sign(aggClassEst)
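
A short usage sketch on the toy datMat/classLabels defined above (note that adaBoostTrainDS as written returns two values):

classifierArray, aggClassEst = adaBoostTrainDS(datMat, classLabels, 30)
adaClassify([0,0], classifierArray)          # predicted class for one point, e.g. matrix([[-1.]])
adaClassify([[5,5],[0,0]], classifierArray)  # several points at once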




Plotting the ROC curve and computing the AUC


def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0,1.0) #cursor
    ySum = 0.0 #variable to calculate AUC
    numPosClas = sum(array(classLabels)==1.0)  # number of positive examples
    yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas)  # step sizes along the true-positive and false-positive axes
    sortedIndicies = predStrengths.argsort()#get sorted index (indices in ascending order of prediction strength)
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    #loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0; delY = yStep;
        else:
            delX = xStep; delY = 0;
            ySum += cur[1]
        #draw line from cur to (cur[0]-delX,cur[1]-delY)
        ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b')   # plot([x1,x2],[y1,y2]) draws the segment between the two points
        cur = (cur[0]-delX,cur[1]-delY)
    ax.plot([0,1],[0,1],'b--')
    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0,1,0,1])
    plt.show()
    print "the Area Under the Curve is: ",ySum*xStep

Predicting numeric values with regression

Ordinary linear regression
def loadDataSet(fileName):      #general function to parse tab -delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields 
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr =[]
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat


def standRegres(xArr,yArr):
    xMat = mat(xArr); yMat = mat(yArr).T   # note: the data already carries a constant column x0 = 1
    xTx = xMat.T*xMat
    if linalg.det(xTx) == 0.0:      # a zero determinant means the matrix cannot be inverted
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T*yMat)   # solve the normal equations; linalg.solve(xTx, xMat.T*yMat) would also work
    return ws


xMat=mat(xArr)
yMat=mat(yArr)
yHat=xMat*ws

import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.flatten().A[0])  ## flatten() plus .A turns the matrix column into a 1-D array, which is what scatter expects

xCopy=xMat.copy()
xCopy.sort(0)
yHat=xCopy*ws
ax.plot(xCopy[:,1],yHat)
plt.show()

corrcoef(yHat.T,yMat)  # correlation coefficient, computed with NumPy

Locally weighted linear regression (LWLR)
def lwlrTest(testArr,xArr,yArr,k=1.0):  #loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)   # predict each point separately
    return yHat


def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))                      # identity matrix: the per-sample weights sit on the diagonal, which keeps the algebra below simple
    for j in range(m):                      #next 2 lines create weights matrix
        diffMat = testPoint - xMat[j,:]     # kernel weight for sample j
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))   # Gaussian kernel of the squared distance to the test point
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


xMat=mat(xArr)
srtInd=xMat[:,1].argsort(0)  # argsort along axis 0 (down the column) returns the sorting indices as a column matrix
xSort=xMat[srtInd][:,0,:]  # indexing with an index matrix copies rows and adds a dimension, hence the extra [:,0,:]
import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(111)
ax.plot(xSort[:,1],yHat[srtInd])
ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).flatten().A[0],s=2,c='red')
plt.show()


Example: predicting the age of abalone
def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
    return ((yArr-yHatArr)**2).sum()
Ridge regression
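Ridge regression adds an L2 penalty, so the weights become w = (X^T X + lambda*I)^(-1) X^T y; the lambda*I term also makes the matrix invertible when X^T X is singular (e.g. more features than samples).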

def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)   # mean taken down the column (axis 0)
    yMat = yMat - yMean     #to eliminate X0 take mean off of Y (centering)
    #regularize X's
    xMeans = mean(xMat,0)   #calc mean then subtract it off
    xVar = var(xMat,0)      #calc variance of Xi then divide by it (column-wise)
    xMat = (xMat - xMeans)/xVar             # note: y is only centered, not divided by its variance
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))              # lambda should vary on an exponential scale
        wMat[i,:]=ws.T        # store the weights found for each lambda
    return wMat



def ridgeRegres(xMat,yMat,lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam                  # add lambda times the identity matrix
    if linalg.det(denom) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = denom.I * (xMat.T*yMat)
    return ws


import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(111)
ax.plot(ridgeWeights)   # the x-axis here corresponds to log(lambda)
plt.show()

Lasso
Forward stagewise regression

def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n=shape(xMat)
    returnMat = zeros((numIt,n)) # record the weights found at each iteration
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print ws.T
        lowestError = inf;   # positive infinity
        for j in range(n):
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest    # wsTest is the current ws with a single coefficient nudged by eps*sign
                rssE = rssError(yMat.A,yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat


def regularize(xMat):#regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat,0)   #calc mean then subtract it off
    inVar = var(inMat,0)      #calc variance of Xi then divide by it
    inMat = (inMat - inMeans)/inVar
    return inMat

Example: predicting LEGO set prices
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10) # pause so we do not make too many API calls in a short time
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if  sellingPrice > origPrc * 0.5:
                    print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice)
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print 'problem with item %d' % i
    
def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)

Building the model
shape(lgX)
lgX1=mat(ones((58,5)))         # column 0 is the constant X0 = 1; the remaining columns are copied from lgX
lgX1[:,1:5]=mat(lgX)

ws=regression.standRegres(lgX1,lgY)

Cross-validation
def crossValidation(xArr,yArr,numVal=10):
    m = len(yArr)                           
    indexList = range(m)
    errorMat = zeros((numVal,30))#create error mat 30columns numVal rows
    for i in range(numVal):
        trainX=[]; trainY=[]        # containers for the training and test splits
        testX = []; testY = []
        random.shuffle(indexList)   # shuffle the index list in place
        for j in range(m):#create training set based on first 90% of values in indexList
            if j < m*0.9: 
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)    #get 30 weight vectors from ridge
        for k in range(30):#loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX=mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params: the test set must be standardized with the training set's mean and variance
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)#test ridge results and store
            errorMat[i,k]=rssError(yEst.T.A,array(testY))  # note the .T.A: transpose, then take the underlying array
            #print errorMat[i,k]
    meanErrors = mean(errorMat,0)#calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]  # nonzero picks the row whose mean error is the minimum
    #can unregularize to get model
    #when we regularized we wrote Xreg = (x-meanX)/var(x)
    #we can now write in terms of x not Xreg:  x*w/var(x) - meanX/var(x) + meanY   (the standardization has to be undone)
    xMat = mat(xArr); yMat=mat(yArr).T
    meanX = mean(xMat,0); varX = var(xMat,0)
    unReg = bestWeights/varX   # element-wise division; the array is broadcast against the matrix row
    print "the best model from Ridge Regression is:\n",unReg
    print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)

Tree-based regression

def loadDataSet(fileName):      #general function to parse tab -delimited floats
    dataMat = []                #assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float,curLine) #map all elements to float(): map applies the conversion to every element of the line
        dataMat.append(fltLine)
    return dataMat



def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):#assume dataSet is NumPy Mat so we can array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)#choose the best split
    if feat == None: return val #if the splitting hit a stop condition return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree    



def regLeaf(dataSet):#returns the value used for each leaf
    return mean(dataSet[:,-1])


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):    # note: leafType and errType are functions passed in as arguments (regLeaf / regErr by default) and called inside
    tolS = ops[0]; tolN = ops[1]   # tolS: minimum error reduction required; tolN: minimum number of samples in a split
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #exit cond 1: tolist()[0] flattens the one-row matrix into a list, and the set removes duplicates
        return None, leafType(dataSet)
    m,n = shape(dataSet)
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):  # every feature except the last (target) column
        for splitVal in set(dataSet[:,featIndex].T.tolist()[0]): # every distinct value of that feature
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS: 
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < tolS:          # quit if the error reduction is too small
        return None, leafType(dataSet) #exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  #exit cond 3
        return None, leafType(dataSet)
    return bestIndex,bestValue#returns the best feature to split on
                              #and the value used for that split




def regErr(dataSet):
    return var(dataSet[:,-1]) * shape(dataSet)[0]   # total squared error = variance of the target * number of samples




def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1


Tree pruning
def isTree(obj):
    return (type(obj).__name__=='dict')   # a subtree is stored as a dict, a leaf is a plain value

def getMean(tree):
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0
    
def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree) #if we have no test data collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])):#if the branches are not trees try to prune them
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] =  prune(tree['right'], rSet)
    #if they are now both leafs, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2)) +\
            sum(power(rSet[:,-1] - tree['right'],2))                 # test error if the two leaves are kept
        treeMean = (tree['left']+tree['right'])/2.0     # merged leaf value: simply the average of the two leaves (the book's choice)
        errorMerge = sum(power(testData[:,-1] - treeMean,2))  # test error if they are merged
        if errorMerge < errorNoMerge: 
            print "merging"
            return treeMean
        else: return tree
    else: return tree


Model trees

def modelErr(dataSet):           # fit a linear model on this leaf's data and return its squared error
    ws,X,Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat,2))   # vectorized squared error


def linearSolve(dataSet):   #helper function used in two places
    m,n = shape(dataSet)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))        #create a copy of data with 1 in 0th position: the constant term becomes the weight of X0 = 1, which never changes
    X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]    #and strip out Y
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)               
    return ws,X,Y
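
To actually grow a model tree, createTree also needs a leaf function that returns the fitted coefficients; the book pairs modelErr with modelLeaf:

def modelLeaf(dataSet):   # a model-tree leaf stores the linear model's weights
    ws,X,Y = linearSolve(dataSet)
    return ws

Usage: createTree(trainMat, modelLeaf, modelErr, (1,10)).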

Comparing tree regression with standard regression

def createForeCast(tree, testData, modelEval=regTreeEval):
    m=len(testData)
    yHat = mat(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat



def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
        else: return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
        else: return modelEval(tree['right'], inData)
        

def regTreeEval(model, inDat):
    return float(model)

def modelTreeEval(model, inDat):
    n = shape(inDat)[1]
    X = mat(ones((1,n+1)))
    X[:,1:n+1]=inDat
    return float(X*model)
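
A usage sketch of the comparison, following the book's Chapter 9 example (the bikeSpeedVsIq file names come from the book's data set):

trainMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt'))
testMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt'))
# regression tree
myTree = regTrees.createTree(trainMat, ops=(1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0])
print corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]   # correlation between predictions and targets
# model tree
myTree = regTrees.createTree(trainMat, regTrees.modelLeaf, regTrees.modelErr, (1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0], regTrees.modelTreeEval)
print corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]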

Building a GUI with Tkinter

from Tkinter import *
from numpy import *
import regTrees

root = Tk()

def reDraw(tols,tolN):
    pass
def drawNewTree():
    pass


Label(root, text="tolN").grid(row=1, column=0)
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
Label(root, text="tolS").grid(row=2, column=0)
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0, '1.0')
Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)

reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt'))
reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
reDraw(1.0, 10)

root.mainloop()
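The two stubs above are placeholders. A minimal sketch of how drawNewTree could read the entry boxes and trigger a redraw (the getInputs helper is introduced here only for illustration):

def getInputs():                                  # read tolN / tolS from the entry boxes, with defaults on bad input
    try: tolN = int(tolNentry.get())
    except:
        tolN = 10
        print "enter Integer for tolN"
        tolNentry.delete(0, END); tolNentry.insert(0, '10')
    try: tolS = float(tolSentry.get())
    except:
        tolS = 1.0
        print "enter Float for tolS"
        tolSentry.delete(0, END); tolSentry.insert(0, '1.0')
    return tolN, tolS

def drawNewTree():
    tolN, tolS = getInputs()                      # rebuild the tree with the current parameters and redraw
    reDraw(tolS, tolN)

Roughly speaking, reDraw would then rebuild the tree with regTrees.createTree (using the model-tree leaf/error functions when the "Model Tree" checkbox is set), forecast over reDraw.testDat with createForeCast, and plot the result on a matplotlib figure embedded in the Tk window.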

Grouping unlabeled data with k-means

def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):   # distEclud and randCent (below) must exist before this def runs, since default arguments are evaluated at definition time
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))#create mat to assign data points to a centroid:
                                      #column 0 holds the cluster index, column 1 the squared distance (SE) to that centroid
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):#for each data point assign it to the closest centroid
            minDist = inf; minIndex = -1   # start from sentinel values: inf for the best distance, -1 for the best index
            for j in range(k):
                distJI = distMeas(centroids[j,:],dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i,0] != minIndex: clusterChanged = True  # if any point changes cluster, keep iterating
            clusterAssment[i,:] = minIndex,minDist**2  # assign the index and the squared distance in one statement
        print centroids
        for cent in range(k):#recalculate centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]#get all the points in this cluster: a boolean filter on the matrix, nonzero returns the indices of the True entries
            centroids[cent,:] = mean(ptsInClust, axis=0) #assign centroid to the column-wise mean (note this is still inside the while loop)
    return centroids, clusterAssment 




def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k,n)))#create centroid mat 
    for j in range(n):#create random cluster centers, within the bounds of each dimension
        minJ = min(dataSet[:,j]) 
        rangeJ = float(max(dataSet[:,j]) - minJ)
        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))  # scalar minimum plus range times k uniform samples from [0, 1), filling one column of the centroid matrix at a time
    return centroids



def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2))) # Euclidean distance
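A quick usage sketch, assuming a loadDataSet helper (tab-separated floats, like the one in the PCA section below) and a testSet.txt sample file; both are assumptions for illustration:

datMat = mat(loadDataSet('testSet.txt'))         # placeholder file name
myCentroids, clustAssing = kMeans(datMat, 4)     # ask for four clusters
print myCentroids                                # final centroid positions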

Improving cluster performance with post-processing
Bisecting k-means
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]  # start from the centroid of the entire data set
    centList =[centroid0] #create a list with one centroid
    for j in range(m):#calc initial Error
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
    while (len(centList) < k):  # keep splitting until we have k centroids
        lowestSSE = inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]#get the data points currently in cluster i
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)  # split this cluster in two with ordinary k-means
            sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print "sseSplit, and notSplit: ",sseSplit,sseNotSplit
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy() # copy() so the best split is not overwritten by later candidate splits
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to a brand-new cluster index (len(centList), since indices start at 0)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit  # cluster 0 keeps the index of the cluster that was split
        print 'the bestCentToSplit is: ',bestCentToSplit
        print 'the len of bestClustAss is: ', len(bestClustAss)
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#replace the split centroid with one of the two new centroids, then append the other
        centList.append(bestNewCents[1,:].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#reassign cluster indices and SSE for the points that were just split
    return mat(centList), clusterAssment
Clustering points on a map
import urllib
import json
def geoGrab(stAddress, city):
    apiStem = 'http://where.yahooapis.com/geocode?'  #create a dict and constants for the geocoder
    params = {}
    params['flags'] = 'J'#JSON return type; the request parameters go in a dict
    params['appid'] = 'aaa0VN6k'
    params['location'] = '%s %s' % (stAddress, city)   # build the location string with % formatting
    url_params = urllib.urlencode(params)
    yahooApi = apiStem + url_params      #print url_params
    print yahooApi
    c=urllib.urlopen(yahooApi)
    return json.loads(c.read())



from time import sleep
def massPlaceFind(fileName):
    fw = open('places.txt', 'w')
    for line in open(fileName).readlines():      # read the file and pull the address fields out of each line
        line = line.strip()
        lineArr = line.split('\t')
        retDict = geoGrab(lineArr[1], lineArr[2])
        if retDict['ResultSet']['Error'] == 0:
            lat = float(retDict['ResultSet']['Results'][0]['latitude'])
            lng = float(retDict['ResultSet']['Results'][0]['longitude'])
            print "%s\t%f\t%f" % (lineArr[0], lat, lng)
            fw.write('%s\t%f\t%f\n' % (line, lat, lng))
        else: print "error fetching"
        sleep(1)  # be polite: do not hit the API too frequently
    fw.close()


Computing great-circle distance from latitude and longitude
def distSLC(vecA, vecB):#Spherical Law of Cosines
    a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180)
    b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * \
                      cos(pi * (vecB[0,0]-vecA[0,0]) /180)
    return arccos(a + b)*6371.0 #pi is imported with numpy


Plotting the clusters
import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(numClust=5):
    datList = []
    for line in open('places.txt').readlines():
        lineArr = line.split('\t')
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect=[0.1,0.1,0.8,0.8]
    scatterMarkers=['s', 'o', '^', '8', 'p', \
                    'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0=fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')  # load the map image as an array
    ax0.imshow(imgP)
    ax1=fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
    plt.show()



Association analysis with Apriori
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:      # membership test on a list of lists: C1 stores each item wrapped in its own single-element list
                C1.append([item])

    C1.sort()
    return map(frozenset, C1)#use frozenset on every candidate so the itemsets
                            #can be used as keys in a dict




def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):     # is the candidate a subset of this transaction?
                if not ssCnt.has_key(can): ssCnt[can]=1       # count the candidate; has_key is the Python 2 idiom (could also write 'can not in ssCnt')
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0,key)               # insert at the front of the list; key is a frozenset
        supportData[key] = support             # store the support of every candidate, frequent or not
    return retList, supportData




def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = map(set, dataSet)
    L1, supportData = scanD(D, C1, minSupport)   # frequent 1-itemsets plus the support of every candidate
    L = [L1]
    k = 2         # k starts at 2: L[k-2] is always the most recently found list of frequent itemsets
    while (len(L[k-2]) > 0):       # L is a list of lists of frozensets
        Ck = aprioriGen(L[k-2], k)    # generate candidate k-itemsets from the frequent (k-1)-itemsets, one level at a time
        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
        supportData.update(supK)      # dicts have no append; update merges in the new support values
        L.append(Lk)     # Lk is itself a list of frozensets
        k += 1
    return L, supportData




def aprioriGen(Lk, k): #creates Ck by joining frequent (k-1)-itemsets
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]    # compare only the first k-2 elements, e.g. {0,1} and {0,2} join to {0,1,2} without generating duplicates
            L1.sort(); L2.sort()
            if L1==L2: #if the first k-2 elements are equal, take the union; {1,3} and {3,5} are not joined, because {1,5} would also have to be frequent for {1,3,5} to be
                retList.append(Lk[i] | Lk[j]) #set union with |, one of the advantages of working with sets
    return retList
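A quick run on the toy transactions above (minSupport=0.5 keeps itemsets that appear in at least half the transactions):

dataSet = loadDataSet()
L, suppData = apriori(dataSet, minSupport=0.5)
print L                      # frequent itemsets grouped by size: L[0] holds 1-itemsets, L[1] 2-itemsets, ...
print suppData               # support of every candidate that was scanned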

Mining association rules from frequent itemsets

def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    bigRuleList = []
    for i in range(1, len(L)):#only look at itemsets with two or more items, so start from index 1 rather than 0
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]        # one-item frozensets that can appear on the right-hand side of a rule
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)  # called for its side effect on bigRuleList; the return value is ignored here
    return bigRuleList         

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] #create new list to return   
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence; the set difference gives the left-hand side of the rule
        if conf >= minConf: 
            print freqSet-conseq,'-->',conseq,'conf:',conf
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)    # keep the consequents that passed the confidence threshold for the next round of merging
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging  
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidate consequents by merging
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)  # keep only the consequents that meet the confidence threshold; this is what bounds the recursion
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
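Rules can then be mined from the frequent itemsets of the toy data above:

L, suppData = apriori(loadDataSet(), minSupport=0.5)
rules = generateRules(L, suppData, minConf=0.7)
print rules                  # each rule is a (antecedent, consequent, confidence) tuple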


Example: finding patterns in congressional voting
from time import sleep
from votesmart import votesmart
votesmart.apikey = 'get your api key first'   # replace with your own Votesmart API key
def getActionIds():
    actionIdList = []; billTitleList = []
    fr = open('recent20bills.txt') 
    for line in fr.readlines():
        billNum = int(line.split('\t')[0])
        try:                     # try/except: skip bills that the API cannot return
            billDetail = votesmart.votes.getBill(billNum) #api call
            for action in billDetail.actions:
                if action.level == 'House' and \
                (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
                    actionId = int(action.actionId)
                    print 'bill: %d has actionId: %d' % (billNum, actionId)
                    actionIdList.append(actionId)
                    billTitleList.append(line.strip().split('\t')[1])
        except:
            print "problem getting bill %d" % billNum
        sleep(1)                                      #delay to be polite to the API
    return actionIdList, billTitleList

Poisonous mushroom example
mushDatSet=[line.split() for line in open('mushroom.dat').readlines()] # one transaction per line, items separated by whitespace
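A sketch of how the frequent itemsets can be used on this data: in mushroom.dat the first feature encodes edibility, with '2' meaning poisonous, so one can scan the frequent 2-itemsets for that code (the minSupport value here is just illustrative):

L, suppData = apriori(mushDatSet, minSupport=0.3)
for item in L[1]:                    # frequent 2-itemsets
    if item.intersection('2'):       # keep those containing the 'poisonous' feature value '2'
        print item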

Finding frequent itemsets efficiently with FP-growth
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode      #needs to be updated
        self.children = {}  # child nodes are stored in a dict keyed by item name
    
    def inc(self, numOccur):
        self.count += numOccur
        
    def disp(self, ind=1):
        print '  '*ind, self.name, ' ', self.count               # '  '*ind indents the output by tree depth
        for child in self.children.values():   # recurse over the children stored in the dict
            child.disp(ind+1)



Building the FP-tree

def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):  # format the data as {frozenset(transaction): count}
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict



def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
    headerTable = {}  # header table: item -> count (later reformatted to item -> [count, node link])
    #go over dataSet twice
    for trans in dataSet:#first pass counts frequency of occurance
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]  # get returns 0 for unseen items; this assumes an item appears at most once per transaction (FP-growth only cares about presence, not multiplicity), which the data preparation should guarantee
    for k in headerTable.keys():  #remove items not meeting minSup
        if headerTable[k] < minSup: 
            del(headerTable[k])          # del removes the infrequent items from the header table
    freqItemSet = set(headerTable.keys())     # the surviving keys are the frequent single items
    #print 'freqItemSet: ',freqItemSet
    if len(freqItemSet) == 0: return None, None  #if no items meet min support -->get out (both return values are None)
    for k in headerTable:
        headerTable[k] = [headerTable[k], None] #reformat headerTable entries to [count, node link]; the link starts as None and is filled in as the tree is built
    #print 'headerTable: ',headerTable
    retTree = treeNode('Null Set', 1, None) #create tree
    for tranSet, count in dataSet.items():  #go through the data set a 2nd time; items() yields (transaction, count) pairs
        localD = {}
        for item in tranSet:  #put transaction items in order
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]  # sort this transaction's frequent items by their global count, descending, then keep only the item names
            updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset
    return retTree, headerTable #return tree and header table



def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:#check if orderedItems[0] in retTree.children
        inTree.children[items[0]].inc(count) #increment the count of the existing child
    else:   #add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)      # create a new child node
        if headerTable[items[0]][1] == None: #update header table 
            headerTable[items[0]][1] = inTree.children[items[0]]    # first occurrence of this item: point the header table's node link at the new node
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:#call updateTree() with the remaining ordered items, one level further down the tree
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)    # items[1::] drops the item just added; recurse on the child that was just updated

def updateHeader(nodeToTest, targetNode):   #this version does not use recursion
    while (nodeToTest.nodeLink != None):    #Do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink     # walk to the end of the node-link chain before appending
    nodeToTest.nodeLink = targetNode
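Putting the pieces together on the toy transactions above, with a minimum support of 3:

simpDat = loadSimpDat()
initSet = createInitSet(simpDat)
myFPtree, myHeaderTab = createTree(initSet, 3)   # build the FP-tree
myFPtree.disp()                                  # print the tree, indented by depth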



Mining frequent patterns from an FP-tree

def findPrefixPath(basePat, treeNode): #treeNode comes from header table
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1: 
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink       # follow the node link to the next occurrence of this item
    return condPats



def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)      # recurse upward until the root is reached



def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])] #(sort header-table items by their count, ascending; p[1] is the [count, node link] pair)
    for basePat in bigL:  #start from bottom of header table
        newFreqSet = preFix.copy()  # copy the prefix so each branch builds its own itemset
        newFreqSet.add(basePat)  # add the current item to form a new frequent itemset
        #print 'finalFrequent Item: ',newFreqSet    #append to set
        freqItemList.append(newFreqSet)  
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        #print 'condPattBases :',basePat, condPattBases
        #2. construct cond FP-tree from cond. pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        #print 'head from conditional tree: ', myHead
        if myHead != None: #3. mine cond. FP-tree
            #print 'conditional tree for: ',newFreqSet
            #myCondTree.disp(1)            
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
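With the tree and header table built above, the conditional pattern bases and the full set of frequent itemsets can be extracted:

print findPrefixPath('x', myHeaderTab['x'][1])           # conditional pattern base for item 'x'
freqItems = []
mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)   # start from an empty prefix
print freqItems                                          # every itemset with support >= 3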



Finding frequently co-occurring words in a Twitter feed

import twitter
from time import sleep
import re

def textParse(bigString):
    urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)    # strip URLs with re.sub (the pattern could also be precompiled with re.compile)
    listOfTokens = re.split(r'\W*', urlsRemoved)         # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def getLotsOfTweets(searchStr):
    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    ACCESS_TOKEN_KEY = ''
    ACCESS_TOKEN_SECRET = ''
    api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET,
                      access_token_key=ACCESS_TOKEN_KEY, 
                      access_token_secret=ACCESS_TOKEN_SECRET)
    #you can get 1500 results 15 pages * 100 per page
    resultsPages = []
    for i in range(1,15):
        print "fetching page %d" % i
        searchResults = api.GetSearch(searchStr, per_page=100, page=i)
        resultsPages.append(searchResults)
        sleep(6)
    return resultsPages

def mineTweets(tweetArr, minSup=5):
    parsedList = []
    for i in range(14):
        for j in range(100):
            parsedList.append(textParse(tweetArr[i][j].text))
    initSet = createInitSet(parsedList)
    myFPtree, myHeaderTab = createTree(initSet, minSup)          # build the FP-tree first
    myFreqList = [] 
    mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)  # then mine it for frequent patterns
    return myFreqList

Simplifying data with PCA



from numpy import *

def loadDataSet(fileName, delim='\t'):
    fr = open(fileName)
    stringArr = [line.strip().split(delim) for line in fr.readlines()]         #strip removes leading/trailing whitespace
    datArr = [map(float,line) for line in stringArr]               # convert every field to a float with map
    return mat(datArr)


def pca(dataMat, topNfeat=9999999):
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals #remove mean
    covMat = cov(meanRemoved, rowvar=0)            # covariance matrix; rowvar=0 treats each column as a variable
    eigVals,eigVects = linalg.eig(mat(covMat))        # eigenvalues and eigenvectors of the covariance matrix
    eigValInd = argsort(eigVals)            #sort, sort goes smallest to largest
    eigValInd = eigValInd[:-(topNfeat+1):-1]  #cut off unwanted dimensions
    redEigVects = eigVects[:,eigValInd]       #reorganize the eigenvectors (stored as columns) from largest to smallest eigenvalue
    lowDDataMat = meanRemoved * redEigVects#transform the centered data into the new, lower-dimensional space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals  # map back to the original space and add the mean back; the eigenvector columns are orthonormal, so the transpose plays the role of the inverse here
    return lowDDataMat, reconMat
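The plotting snippet below assumes the data has already been loaded and reduced to one principal component; a minimal setup sketch (testSet.txt is a placeholder for any tab-separated two-dimensional sample file):

dataMat = loadDataSet('testSet.txt')      # placeholder file name
lowDMat, reconMat = pca(dataMat, 1)       # keep a single principal component
print shape(lowDMat)                      # (m, 1): the data projected onto that component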



import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^',s=90)
ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0], marker='o',s=50,c='red')
plt.show()

Reducing the dimensionality of semiconductor data with PCA
Replace NaN values with the feature mean
def replaceNanWithMean(): 
    datMat = loadDataSet('secom.data', ' ')
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN; ~ negates the boolean mask
        datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal  #set NaN values to the feature mean
    return datMat  # (many eigenvalues of the resulting covariance matrix turn out to be zero: those dimensions duplicate other features and carry no extra information)
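To see how many principal components actually matter for the secom data, one can inspect the eigenvalue spectrum of the cleaned matrix; a minimal sketch:

dataMat = replaceNanWithMean()
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
covMat = cov(meanRemoved, rowvar=0)
eigVals, eigVects = linalg.eig(mat(covMat))
print eigVals            # a large fraction are (numerically) zero, so most dimensions are redundant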

Simplifying data with SVD

def loadExData():
    return[[0, 0, 0, 2, 2],
           [0, 0, 0, 3, 3],
           [0, 0, 0, 1, 1],
           [1, 1, 1, 0, 0],
           [2, 2, 2, 0, 0],
           [5, 5, 5, 0, 0],
           [1, 1, 1, 0, 0]]



import svdRec
from numpy import *
Data=svdRec.loadExData()
U,Sigma,VT=linalg.svd(Data)
print "U:", U
print "Sigma:", Sigma
print "VT:", VT
Sig3=mat([[Sigma[0],0,0],[0,Sigma[1],0],[0,0,Sigma[2]]])   # diagonal matrix built from the top 3 singular values
print U[:,:3]*Sig3*VT[:3,:]      # rank-3 reconstruction of the original matrix


A collaborative-filtering recommendation engine
Turning distance into similarity

from numpy import linalg as la
def ecludSim(inA,inB):
    return 1.0/(1.0 + la.norm(inA - inB))         # la.norm computes the vector norm, i.e. the Euclidean distance

def pearsSim(inA,inB):
    if len(inA) < 3 : return 1.0
    return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1]       

def cosSim(inA,inB):
    num = float(inA.T*inB)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5+0.5*(num/denom)

myMat = mat(loadExData())        # build the ratings matrix from the toy data above
myMat[0,1]=myMat[0,0]=myMat[1,0]=myMat[2,0]=4

def standEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0: continue   # continue skips to the next item (unlike break, it does not leave the loop)
        # logical_and compares the two columns element-wise; nonzero returns the row indices where both items were rated
        overLap = nonzero(logical_and(dataMat[:,item].A>0, \
                                      dataMat[:,j].A>0))[0]
        if len(overLap) == 0: similarity = 0
        else: similarity = simMeas(dataMat[overLap,item], \
                                   dataMat[overLap,j])
        print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal  # normalize by the total similarity so the estimate stays on the 1-5 rating scale (a similarity-weighted average)

def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    unratedItems = nonzero(dataMat[user,:].A==0)[1]#find the items this user has not rated
    if len(unratedItems) == 0: return 'you rated everything'
    itemScores = []
    for item in unratedItems:
        estimatedScore = estMethod(dataMat, user, simMeas, item)  # estimate a score for each unrated item
        itemScores.append((item, estimatedScore)) 
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]  # sort by estimated score, descending, and return the top N

def svdEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    U,Sigma,VT = la.svd(dataMat)     # singular value decomposition
    Sig4 = mat(eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix built from the top 4 singular values (identity times the values)
    xformedItems = dataMat.T * U[:,:4] * Sig4.I  #create transformed items: map the item columns into the 4-dimensional SVD space (transpose first, then multiply)
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0 or j==item: continue
        similarity = simMeas(xformedItems[item,:].T,\
                             xformedItems[j,:].T)
        print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal
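Using the myMat set up above, the two estimation methods can be compared directly (a minimal sketch; the exact output depends on the toy ratings):

print recommend(myMat, 2)                        # cosine similarity with standEst (the defaults)
print recommend(myMat, 2, estMethod=svdEst)      # the SVD-based estimate for the same user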


SVD-based image compression

def imgCompress(numSV=3, thresh=0.8):
    myl = []
    for line in open('0_5.txt').readlines():
        newRow = []
        for i in range(32):
            newRow.append(int(line[i]))
        myl.append(newRow)
    myMat = mat(myl)
    print "****original matrix******"
    printMat(myMat, thresh)
    U,Sigma,VT = la.svd(myMat)
    SigRecon = mat(zeros((numSV, numSV)))
    for k in range(numSV):#construct diagonal matrix from vector
        SigRecon[k,k] = Sigma[k]
    reconMat = U[:,:numSV]*SigRecon*VT[:numSV,:]
    print "****reconstructed matrix using %d singular values******" % numSV
    printMat(reconMat, thresh)
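imgCompress relies on a printMat helper that is not shown in these notes; a minimal sketch that prints a 32x32 matrix as 0/1 characters by thresholding each entry:

def printMat(inMat, thresh=0.8):
    for i in range(32):
        for k in range(32):
            if float(inMat[i,k]) > thresh:       # treat values above the threshold as ink
                print 1,
            else:
                print 0,
        print ''                                 # newline after each row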


Big data and MapReduce
On Linux, run: cat inputFile.txt | python mapper.py | sort | python reducer.py > outputFile.txt   (this shell pipeline emulates the map-sort-reduce flow)

import sys
from numpy import mat, mean, power

def read_input(file):
    for line in file:
        yield line.rstrip()
        
input = read_input(sys.stdin)#creates a generator over the input lines read from sys.stdin
input = [float(line) for line in input] #overwrite with floats
numInputs = len(input)
input = mat(input)
sqInput = power(input,2)

#output size, mean, mean(square values)
print "%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)) #calc mean of columns
print >> sys.stderr, "report: still alive"    #注意stderr的使用方式 print >>
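The shell pipeline above also needs the reducer. A minimal sketch that combines the per-mapper (size, mean, mean-of-squares) triples into global values, assuming the tab-separated output format of the mapper shown above:

import sys

def read_input(file):
    for line in file:
        yield line.rstrip()

mapperOut = [line.split('\t') for line in read_input(sys.stdin)]  # one "n  mean  meanOfSquares" line per mapper
cumN = 0.0; cumVal = 0.0; cumSumSq = 0.0
for instance in mapperOut:
    nj = float(instance[0])
    cumN += nj
    cumVal += nj * float(instance[1])       # n_j * mean_j recovers the chunk's raw sum
    cumSumSq += nj * float(instance[2])     # n_j * mean(x^2)_j recovers the chunk's raw sum of squares
print "%d\t%f\t%f" % (cumN, cumVal/cumN, cumSumSq/cumN)   # global size, mean, mean of squares
print >> sys.stderr, "report: still alive"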

Dissecting a MapReduce job with mrjob
Pegasos: an SVM algorithm for distributed computing
