机器学习实战(9) ——树回归(python实现)

这是学习《机器学习实战》这本书时写的代码实战,目的是让自己对各个算法有更直观的了解,不能一直不动手写。不管简单还是不简单,都亲自一行一行地敲一遍。

具体的源码和数据链接:https://pan.baidu.com/s/1G2S2pb5gfBnxGNNTFgTkEA 密码:fov0

这是第九章的代码和自己做的测试(ressTree.py)。这章代码坑较多,写得真的心累啊。

# -*- coding: utf-8 -*-
# author: Yufeng Song
from numpy import *
def loadDataSet(fileName):
    """Parse a tab-delimited text file of numbers into rows of floats.

    Args:
        fileName: Path of the file to read. Every non-blank line is expected
            to hold tab-separated numeric fields.

    Returns:
        A list containing one list of floats per non-blank line, in file
        order. Other functions in this file treat the last column as the
        target value.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    dataMat = []
    # The context manager guarantees the file is closed, even when a field
    # fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue  # ignore blank lines such as a trailing newline
            # Store the parsed numbers, not the raw strings, so the result
            # can be turned into a numeric matrix directly.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` by comparing one column to a threshold.

    Args:
        dataSet: 2-D NumPy matrix/array whose rows are samples.
        feature: Index of the column to compare.
        value: Threshold the column is compared against.

    Returns:
        A pair ``(mat0, mat1)``: ``mat0`` holds the rows whose ``feature``
        column is strictly greater than ``value``; ``mat1`` holds the rows
        where it is less than or equal to ``value``.
    """
    column = dataSet[:, feature]
    # nonzero() returns one index array per dimension; element 0 is the
    # array of row indices where the comparison holds.
    rows_above = nonzero(column > value)[0]
    rows_at_or_below = nonzero(column <= value)[0]
    return dataSet[rows_above, :], dataSet[rows_at_or_below, :]

def regLeaf(dataSet):
    """Return the value stored in a regression-tree leaf.

    The leaf value is the mean of the last column of ``dataSet``.
    """
    targets = dataSet[:, -1]
    return mean(targets)

def regErr(dataSet):
    """Return the total squared error of the last column around its mean.

    Computed as the variance of the last column multiplied by the number
    of rows in ``dataSet``.
    """
    row_count = shape(dataSet)[0]
    target_variance = var(dataSet[:, -1])
    return target_variance * row_count

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the (feature, value) split that minimises the total error.

    Args:
        dataSet: NumPy matrix whose last column is the target. A matrix is
            expected because the matrix-only ``.A`` attribute is used.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: Pair ``(tolS, tolN)``: the minimum error reduction required to
            accept a split, and the minimum number of rows allowed on each
            side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when one of the stop conditions is hit.
    """
    tolS, tolN = ops[0], ops[1]
    # Exit condition 1: all target values are identical, nothing to split.
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    n = shape(dataSet)[1]
    # The best split is the one with the lowest summed error of both halves.
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n-1):
        # Flatten the column to a plain list first: the values must be
        # hashable scalars before they can be de-duplicated with set().
        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Exit condition 2: the error reduction (S - bestS) is below tolerance.
    # This also covers "no admissible split found", where bestS is still inf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Exit condition 3: one side of the chosen split is too small.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a tree by repeatedly choosing the best binary split.

    Args:
        dataSet: NumPy matrix of samples (array-style filtering is applied
            to it); it is handed to ``chooseBestSplit`` unchanged.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: Options tuple forwarded to ``chooseBestSplit``.

    Returns:
        The leaf value when no split is made; otherwise a dict with keys
        ``'spInd'`` (split feature index), ``'spVal'`` (split value),
        ``'left'`` (subtree for rows above the value) and ``'right'``
        (subtree for the remaining rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # A None feature signals that a stop condition was hit; val is the leaf.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

def isTree(obj):
    """Return True when ``obj`` is an internal tree node rather than a leaf.

    Internal nodes are represented as dicts; anything else is a leaf value.
    ``isinstance`` is used so dict subclasses are recognised as well.
    """
    return isinstance(obj, dict)

def getMean(tree):
    """Collapse ``tree`` into a single value by averaging its two branches.

    Any branch that is itself a tree is first replaced, in place, by its
    own collapsed mean. The returned value is the mean of the two
    resulting branch values.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    """Post-prune ``tree`` using held-out ``testData``.

    Subtrees are pruned recursively. When both branches of a node end up
    as leaves, they are merged into their mean if that lowers the squared
    error on ``testData``.

    Returns:
        The pruned tree (a dict), or a single value if the node collapsed.
    """
    # No test data reaches this node: collapse the whole subtree.
    if shape(testData)[0] == 0:
        return getMean(tree)

    # The split depends only on the node's own feature and value, so the
    # same partition serves both the recursion and the merge check.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])

    # Prune whichever branches are still subtrees.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only considered once both branches are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    leftError = sum(power(lSet[:,-1] - tree['left'], 2))
    rightError = sum(power(rSet[:,-1] - tree['right'], 2))
    errorNoMerge = leftError + rightError
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:,-1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print ("merging")
        return treeMean
    return tree


def linearSolve(dataSet):
    """Fit an ordinary least-squares linear model to ``dataSet``.

    Args:
        dataSet: 2-D data whose last column is the target and whose other
            columns are the features. It is converted with ``asmatrix`` so
            the target is always handled as a column.

    Returns:
        A tuple ``(ws, X, Y)``: the coefficient column (intercept first),
        the design matrix with a leading column of ones, and the target
        column.

    Raises:
        NameError: If ``X.T * X`` has a zero determinant and therefore
            cannot be inverted.
    """
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    dataSet = asmatrix(dataSet)
    m,n = shape(dataSet)
    # Column 0 stays at 1 for the intercept; the features fill the rest.
    X = asmatrix(ones((m,n)))
    X[:,1:n] = dataSet[:,0:n-1]
    Y = dataSet[:,-1]
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws,X,Y

def modelLeaf(dataSet):
    """Return the linear-model coefficients used as a model-tree leaf.

    The coefficients are the first element of what ``linearSolve``
    returns for ``dataSet``.
    """
    return linearSolve(dataSet)[0]

def modelErr(dataSet):
    """Return the summed squared residual of a linear fit to ``dataSet``.

    The fit comes from ``linearSolve``; the residual is the difference
    between the target column and the fitted values.
    """
    coefficients, design, targets = linearSolve(dataSet)
    residual = targets - design * coefficients
    return sum(power(residual, 2))

def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the prediction is the leaf value.

    ``inDat`` is accepted only so the signature matches ``modelTreeEval``;
    it is not used.
    """
    prediction = float(model)
    return prediction

def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf (a linear model) on one sample.

    Args:
        model: Coefficient column with the intercept first.
        inDat: 2-D row holding the sample's feature values; its column
            count sets the number of features.

    Returns:
        The predicted value as a Python float.
    """
    n = shape(inDat)[1]
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    X = asmatrix(ones((1,n+1)))
    # Column 0 stays at 1 so it multiplies the intercept coefficient.
    X[:,1:n+1] = inDat
    # .item() extracts the single element explicitly; calling float() on a
    # 1x1 matrix is deprecated in recent NumPy releases.
    return float((X*model).item())

def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for a single sample by walking ``tree``.

    Args:
        tree: A tree dict with keys ``'spInd'``, ``'spVal'``, ``'left'``
            and ``'right'``, or a bare leaf value.
        inData: The sample's feature values, as a 1xk row matrix or a flat
            sequence. It is passed unchanged to ``modelEval``.
        modelEval: Callable ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1xk row matrix, inData[i] selects ROW i,
    # which is wrong (or out of range) as soon as there is more than one
    # feature.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Same convention as binSplitDataSet: values above the split go left.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # Recursing into a leaf hits the base case above.
    return treeForeCast(branch, inData, modelEval)

def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every row of ``testData``.

    Args:
        tree: Tree (or bare leaf) handed to ``treeForeCast``.
        testData: Indexable collection of samples, one row per sample.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An ``(m, 1)`` matrix of predictions, where ``m = len(testData)``.
    """
    m = len(testData)
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    yHat = asmatrix(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat

if __name__ == '__main__':
    # Every experiment below is commented out. The explicit ``pass`` keeps
    # the guard syntactically valid, because an ``if`` body made only of
    # comments is a syntax error.
    pass

    # testMat = mat(eye(4))
    # print(testMat)
    # mat0,mat1 = binSplitDataSet(testMat,1,0.5)
    # print('#'*40)
    # print(mat0)
    # print(mat1)
    # myDat = loadDataSet("ex00.txt")
    # myDat = genfromtxt("ex00.txt",delimiter="\t",dtype=float)
    # myDat = genfromtxt("ex0.txt",delimiter="\t",dtype=float)

    # print(myDat)
    # myMat = mat(myDat)
    # trees = createTree(myMat)
    # print(trees)
    # print(trees.sort())
    # print(sorted(trees.items(),key= lambda x:x[0],reverse=True))

    # Test from page 170
    # myDat2 = genfromtxt("ex2.txt",delimiter="\t",dtype=float)
    # # myDat2 = loadDataSet('ex2.txt')
    # myMat2 = mat(myDat2)
    # trees = createTree(myMat2)
    # # print(trees)
    # # print(createTree(myMat2,ops=(1000,4)))
    #
    # myTree = createTree(myMat2,ops=(0,1))
    # myDatTest =  genfromtxt("ex2test.txt",delimiter="\t",dtype=float)
    # myDat2 = loadDataSet('ex2test.txt')
    # myMat2Test = mat(myDatTest)
    # print(prune(myTree,myMat2Test))

    # myMat2 = mat(loadDataSet('exp2.txt'))
    # print(myMat2,modelLeaf,modelErr,(1,10))

    # Test from page 175
    # trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))
    # trainMat = mat(genfromtxt("bikeSpeedVsIq_train.txt",delimiter="\t",dtype=float))
    # myDatTest =  genfromtxt("ex2test.txt",delimiter="\t",dtype=float)
    # testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))
    # testMat = mat(genfromtxt("bikeSpeedVsIq_test.txt",delimiter="\t",dtype=float))
    # myTree = createTree(trainMat,ops=(1,20))
    # yHat = createForeCast(myTree,testMat[:,0])
    # print(corrcoef(yHat,testMat[:,-1]))
    # print(corrcoef(yHat,testMat[:,1])[0,1])  # problematic: the values do not correspond correctly
    # ws,X,Y = linearSolve(trainMat)
    # print(ws)
    #
    # for i in range(shape(testMat)[0]):
    #     yHat[i] = testMat[i,0]*ws[1,0] + ws[0,0]
    # print(corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])

    # from tkinter import *
    # root = Tk()
    # myLabel = Label(root,text='Hello World')
    # myLabel.grid()
    # root.mainloop()



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值