树回归

当输入数据与目标变量之间为非线性关系时,可以使用树回归:用树把数据切分成若干段,每段分别建模。叶结点为常数(分段常数)的是回归树,叶结点为线性模型(分段线性)的是模型树。若模型对数据过拟合,需要对树进行剪枝。

#!/usr/bin/python  
# -*- coding: utf-8 -*-  
#coding=utf-8

from numpy import *

# Load the data
def loadDataSet(fileName):
    """Read a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the file to read. Every non-blank line must hold
            tab-separated values that ``float`` can parse.

    Returns:
        A list containing one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    datMat = []
    # The context manager closes the file even when a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line carries no sample.
                continue
            # Build a real list: on Python 3 a bare ``map`` is a lazy
            # iterator, which numpy cannot turn into a numeric row.
            datMat.append([float(field) for field in stripped.split('\t')])
    return datMat

# Arguments: the data set, the feature to split on, a value of that feature
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two by thresholding one column.

    Args:
        dataSet: 2-D numpy matrix (or array) with one sample per row.
        feature: Index of the column to compare.
        value: Threshold the column is compared against.

    Returns:
        A tuple ``(mat0, mat1)``: the rows whose ``feature`` column is
        greater than ``value``, and the rows where it is less than or equal
        to ``value``. Either part may contain zero rows.
    """
    # nonzero(...)[0] holds the indices of the matching rows.  Every matching
    # row is kept: a trailing ``[0]`` on the selection would keep only the
    # first row and raise IndexError when nothing matches.
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

# Build a leaf node: the mean of the target variable
def regLeaf(dataSet):
    """Return the mean of the last column (the target) of ``dataSet``."""
    targets = dataSet[:, -1]
    return mean(targets)

# Error estimate: total variance of the target
def regErr(dataSet):
    """Return the variance of the last column times the number of rows."""
    sampleCount = shape(dataSet)[0]
    targetVariance = var(dataSet[:, -1])
    return targetVariance * sampleCount

# Find the best binary split of the data.
# When no "good" split exists, return None together with a leaf value so
# that createTree() produces a leaf node.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Pick the feature and threshold that minimise the summed split error.

    Args:
        dataSet: numpy matrix, one sample per row, target in the last column.
        leafType: Callable building a leaf value from a data set.
        errType: Callable returning the error of a data set.
        ops: Sequence whose first item is the minimum error reduction a split
            must achieve and whose second item is the minimum number of rows
            each side of a split must contain.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when the data should become a leaf.
    """
    tolS = ops[0]  # minimum acceptable reduction of the error
    tolN = ops[1]  # minimum number of samples on each side of a split
    # Every target value is identical: nothing is gained by splitting.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    n = shape(dataSet)[1]
    S = errType(dataSet)  # error before splitting
    bestS, bestIndex, bestValue = inf, 0, 0
    for featIndex in range(n - 1):  # every feature column (last one is the target)
        # Convert the column to plain floats first: iterating a matrix column
        # yields 1x1 matrices, which are unhashable and cannot go into a set.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)  # error after this split
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # The improvement is too small (or no admissible split was found, in
    # which case bestS is still inf): make a leaf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Guard against a best split that leaves too few samples on one side.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue

# CART: classification and regression tree
# Arguments: the data set, the leaf-building function, the error function,
# and a tuple of further tree-building parameters
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary tree over ``dataSet``.

    Args:
        dataSet: numpy matrix, one sample per row, target in the last column.
        leafType: Callable building a leaf value from a data set.
        errType: Callable returning the error of a data set.
        ops: Parameters forwarded unchanged to ``chooseBestSplit``.

    Returns:
        The leaf value when ``chooseBestSplit`` reports no split; otherwise a
        dict with keys ``'spInd'`` (split feature), ``'spVal'`` (split value),
        ``'left'`` (subtree for rows with feature > value) and ``'right'``
        (subtree for the remaining rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None is the stop signal, while feature index 0 is valid.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

# Regression-tree pruning functions
# Test whether the input is a tree (as opposed to a leaf)
def isTree(obj):
    """Return True when ``obj`` is a dict, i.e. an internal tree node.

    ``isinstance`` is used instead of comparing the type name, so dict
    subclasses count as trees and unrelated classes that merely happen to be
    called ``dict`` do not.
    """
    return isinstance(obj, dict)

# Collapse a tree: average of its two (recursively collapsed) children
def getMean(tree):
    """Collapse ``tree`` to a single value and return it.

    Each child that is itself a tree is first replaced, in place, by its own
    collapsed value (right child before left child); the result is the
    average of the two children.
    """
    for side in ('right', 'left'):
        child = tree[side]
        if isTree(child):
            tree[side] = getMean(child)
    total = tree['right'] + tree['left']
    return total / 2.0

# Arguments: the tree to prune, the test data used to judge merges
def prune(tree, testData):
    """Post-prune ``tree`` bottom-up using ``testData``.

    Args:
        tree: Dict node with keys ``'spInd'``, ``'spVal'``, ``'left'`` and
            ``'right'``. Its children are updated in place.
        testData: numpy matrix of test samples, target in the last column.

    Returns:
        The collapsed mean of the tree when no test data reaches it; the
        average of the two leaves when merging them lowers the squared error
        on ``testData``; otherwise ``tree`` itself.
    """
    # No test data reaches this node: collapse the whole subtree.
    if shape(testData)[0] == 0:
        return getMean(tree)
    # Split once; the halves serve both the recursion and the merge test.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    # Prune the subtrees first.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    # Merging is only possible once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    # Compare the test error with and without merging the two leaves.
    errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + sum(power(rSet[:, -1] - tree['right'], 2))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        # Call syntax prints the same text on Python 2 and Python 3.
        print("merging")
        return treeMean
    return tree

# Model trees
# Leaf-building helper for model trees:
# formats the data set into the target Y and the regressors X
def linearSolve(dataSet):
    """Fit ordinary least squares with an intercept to ``dataSet``.

    Args:
        dataSet: numpy matrix with features in all but the last column and
            the target in the last column.

    Returns:
        A tuple ``(ws, X, Y)``: the weight column vector (intercept first),
        the design matrix whose first column is all ones, and the target
        column.

    Raises:
        NameError: If the determinant of ``X.T * X`` is exactly zero, so the
            normal equations cannot be inverted.
    """
    m, n = shape(dataSet)
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    X = asmatrix(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]  # column 0 stays 1 for the intercept
    Y = dataSet[:, -1]
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse, \n try increaseing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

# Build the leaf model once the data no longer needs splitting
def modelLeaf(dataSet):
    """Return the linear-model weights that ``linearSolve`` fits to ``dataSet``."""
    return linearSolve(dataSet)[0]

# Compute the error on the given data set
def modelErr(dataSet):
    """Return the sum of squared residuals of the linear fit to ``dataSet``."""
    weights, features, targets = linearSolve(dataSet)
    residuals = targets - features * weights
    return sum(power(residuals, 2))

# Prediction with tree regression
# Evaluate a regression-tree leaf
def regTreeEval(model, inDat):
    """Return the leaf value ``model`` as a float; ``inDat`` is not used."""
    prediction = float(model)
    return prediction

# Evaluate a model-tree leaf
def modelTreeEval(model, inDat):
    """Predict with the linear model stored in a model-tree leaf.

    Args:
        model: Weight column vector, intercept first.
        inDat: 1 x n numpy matrix holding the features of one sample.

    Returns:
        The prediction ``[1, inDat] * model`` as a float.
    """
    n = shape(inDat)[1]
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    X = asmatrix(ones((1, n + 1)))
    X[:, 1:n+1] = inDat  # column 0 stays 1 for the intercept
    # Pull the single element out explicitly: calling float() directly on a
    # 1x1 matrix is deprecated in recent NumPy releases.
    return float((X * model)[0, 0])

# For a single data point or row vector, return one float prediction
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Walk ``tree`` for one sample and evaluate the leaf that is reached.

    Args:
        tree: A dict node (keys ``'spInd'``, ``'spVal'``, ``'left'``,
            ``'right'``) or a leaf value.
        inData: Features of one sample: a 1 x n matrix, a 1-D array or a
            sequence.
        modelEval: Callable ``(leaf, inData)`` returning the prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):  # leaf: evaluate it directly
        return modelEval(tree, inData)
    # Flatten before indexing: createForeCast passes a 1 x n matrix, and
    # indexing that with inData[spInd] selects a row, which raises
    # IndexError for every spInd other than 0.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Same convention as binSplitDataSet: greater-than goes left.
    branch = 'left' if featureValue > tree['spVal'] else 'right'
    # The recursive call evaluates the child itself when it is a leaf.
    return treeForeCast(tree[branch], inData, modelEval)

def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict every row of ``testData`` with ``tree``.

    Args:
        tree: Tree (or leaf) accepted by ``treeForeCast``.
        testData: Sequence or matrix of samples, one per row.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An m x 1 numpy matrix with one prediction per row of ``testData``.
    """
    m = len(testData)
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat

测试:回归树

>>> from regTree import *
>>> myDatl = loadDataSet('ex0.txt')
>>> myDatl = mat(myDatl)
>>> createTree(myDatl)
{'spInd': 1, 'spVal': matrix([[ 0.39435]]), 'right': {'spInd': 1, 'spVal': matrix([[ 0.197834]]), 'right': -0.023838155555555553, 'left': 1.0289583666666664}, 'left': {'spInd': 1, 'spVal': matrix([[ 0.582002]]), 'right': 1.9800350714285717, 'left': {'spInd': 1, 'spVal': matrix([[ 0.797583]]), 'right': 2.9836209534883724, 'left': 3.9871632000000004}}}

测试:剪枝

>>> myMat2 = loadDataSet('ex2.txt')
>>> myMat2 = mat(myMat2)
>>> myTree = createTree(myMat2, ops=(0,1))
>>> myDatTest = loadDataSet('ex2test.txt')
>>> myMat2Test = mat(myDatTest)
>>> prune(myTree, myMat2Test)
merging
merging
... ... 
850000001}}, 'left': {'spInd': 0, 'spVal': matrix([[ 0.948822]]), 'right': 69.318648999999994, 'left': 96.41885225}}}}}}}}}}}, 'left': {'spInd': 0, 'spVal': matrix([[ 0.965969]]), 'right': {'spInd': 0, 'spVal': matrix([[ 0.956951]]), 'right': 111.2013225, 'left': {'spInd': 0, 'spVal': matrix([[ 0.958512]]), 'right': 135.83701300000001, 'left': {'spInd': 0, 'spVal': matrix([[ 0.960398]]), 'right': 123.559747, 'left': 112.386764}}}, 'left': 92.523991499999994}}}}

测试:模型树

>>> from regTree import *
>>> myMat2 = mat(loadDataSet('exp2.txt'))
>>> createTree(myMat2, modelLeaf, modelErr, (1,10))
{'spInd': 0, 'spVal': matrix([[ 0.285477]]), 
'right': matrix([[ 3.46877936],[ 1.18521743]]), 
'left': matrix([[  1.69855694e-03],[1.19647739e+01]])}

测试:树回归与标准回归比较
用 corrcoef 计算预测值与真实值的相关系数 R,R 越接近 1 说明预测效果越好

#回归树
>>> trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))
>>> testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))
>>> myTree = createTree(trainMat, ops=(1,20))
>>> yHat = createForeCast(myTree, testMat[:,0])
>>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
0.96408523182221306

#模型树
>>> from regTree import *
>>> trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))
>>> testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))
>>> myTree = createTree(trainMat, modelLeaf, modelErr, ops=(1,20))
>>> yHat = createForeCast(myTree, testMat[:, 0], modelTreeEval)
>>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
0.97604121913806285

#标准回归
>>> ws, X, Y = linearSolve(trainMat)
>>> ws
matrix([[ 37.58916794], [6.18978355]])
>>> yHat = testMat[:,0] * ws[1,0] + ws[0,0]
>>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
0.94346842356747584

可以看到,在该数据集上树回归要优于标准回归

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值