CH09 - Regression Tree Theory
1. Pros and cons of tree regression
Pros: can model complex, nonlinear problems.
Cons: the resulting model is hard to interpret.
Works with: numeric and nominal values.
2. Tree regression follows the same general approach as classification trees:
Collect data: any method.
Prepare data: numeric data is required; nominal features should be mapped to binary values.
Analyze data: plot the data as a two-dimensional visualization; the tree itself is built as a dictionary.
Train the algorithm: most of the time goes into building the tree's leaf-node models.
Test the algorithm: evaluate the model with the R² value (the squared correlation coefficient) on test data; see the sketch after this list.
Use the algorithm: make predictions with the trained tree; the predictions can then drive many downstream uses.
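To make the testing step concrete, here is a minimal sketch of scoring predictions with NumPy's corrcoef, as the session in section 7 does; yHat and y are made-up prediction and target column vectors:

from numpy import corrcoef, mat

yHat = mat([[1.1], [1.9], [3.2]])      # hypothetical predictions
y = mat([[1.0], [2.0], [3.0]])         # hypothetical targets
R = corrcoef(yHat, y, rowvar=0)[0, 1]  # correlation coefficient R; rowvar=0 treats columns as variables
print(R * R)                           # R^2, the value used to judge model quality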
3. Building trees with continuous and discrete features
3.1 During tree construction, each node is stored as a dictionary containing the following 4 elements (a small example follows this list):
1. The feature to split on
2. The value to split at
3. The right subtree
4. The left subtree
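As a concrete illustration, a minimal tree in this dictionary format (the numbers are made up; compare the real dumps in section 7):

myTree = {'spInd': 0,     # index of the feature to split on
          'spVal': 10.0,  # value to split at
          'left': 120.5,  # left subtree; here already a leaf (a constant)
          'right': 60.2}  # right subtree, also a leaf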
3.2 Pseudocode for building the tree (a condensed version of the real implementation follows this list):
1. Find the best feature to split on
2. If the node cannot be split further, store it as a leaf node
3. Perform a binary split
4. Call the method recursively on the right subtree
5. Call the method recursively on the left subtree
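The full implementation is createTree in section 8; its body follows this pseudocode almost line for line:

feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # 1. find the best split
if feat is None: return val                                   # 2. cannot split: store a leaf
retTree = {'spInd': feat, 'spVal': val}
lSet, rSet = binSplitDataSet(dataSet, feat, val)              # 3. binary split
retTree['left'] = createTree(lSet, leafType, errType, ops)    # 4./5. recurse on both subtrees
retTree['right'] = createTree(rSet, leafType, errType, ops)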
4. Applying the CART algorithm to regression
The split-selection pseudocode added while building the tree (implemented as chooseBestSplit in section 8; see the condensed loop after this list):
1. For every feature:
2. For every value of that feature:
3. Split the data in two
4. Compute the error of the split
5. If this error is lower than the current minimum, take this split as the best one and update the minimum error
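This search is the core loop of chooseBestSplit in section 8, condensed here:

bestS = inf
for featIndex in range(n-1):                                        # for every feature
    for splitVal in set(dataSet[:,featIndex].T.tolist()[0]):        # for every feature value
        mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)  # split the data in two
        newS = errType(mat0) + errType(mat1)                        # total error of the split
        if newS < bestS:                                            # better than the best so far?
            bestIndex, bestValue, bestS = featIndex, splitVal, newS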
5. Tree pruning
A tree with too many nodes overfits the data. The process of avoiding overfitting by reducing the complexity of the decision tree is called pruning.
5.1 Pre-pruning methods
Define a maximum height and stop growing the tree once that height is reached.
Stop growing the tree once all instances reaching a node share the same feature vector, even if the instances do not belong to the same class; this is effective when the data contains conflicts.
Define a threshold and stop splitting a node once the number of instances reaching it falls below the threshold.
Define a threshold, compute the performance gain of each expansion, and compare the gain against the threshold to decide whether to stop growing the tree. (See the note after this list on how the section 8 code realizes the last two methods.)
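In the section 8 code, the last two methods are exactly what the ops=(tolS, tolN) parameters of chooseBestSplit implement: tolN is the minimum number of samples allowed in each subtree, and tolS is the minimum error reduction a split must achieve. With the trainMat loaded in section 7 (the second ops pair is only an illustrative extreme):

smallTree = regTrees.createTree(trainMat, ops=(1, 20))  # strict pre-pruning: fewer, larger leaves
bigTree = regTrees.createTree(trainMat, ops=(0, 1))     # essentially no pre-pruning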
5.2 Post-pruning methods
REP (reduced-error pruning):
1. Delete the subtree rooted at a node
2. Turn that node into a leaf
3. Assign it the most common class of the training data associated with that node
4. Only actually delete the node when the pruned tree performs no worse than the original tree on a validation set
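The prune function in section 8 applies the same idea to regression: instead of assigning a majority class, two sibling leaves are merged into their mean when merging does not hurt the error on held-out data. A usage sketch with the section 7 data (the ops value is only illustrative):

bigTree = regTrees.createTree(trainMat, ops=(0, 1))  # grow a deliberately over-complex tree
prunedTree = regTrees.prune(bigTree, testMat)        # merge leaves that do not help on held-out data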
6. Summary:
The CART algorithm builds binary trees and can handle splits on both discrete and continuous values; the trees it builds tend to overfit the data, which is what pruning addresses.
7. Code implementation
from numpy import *
# import the regression-tree module (regTrees.py, full listing in section 8)
import regTrees
# Regression tree
trainMat = mat(regTrees.loadDataSet('data/bikeSpeedVsIq_train.txt'))
testMat = mat(regTrees.loadDataSet('data/bikeSpeedVsIq_test.txt'))
myTree = regTrees.createTree(trainMat, ops=(1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0])
corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
#0.964085231822215
# The regression tree, as a nested dictionary
myTree
{'spInd': 0,
 'spVal': 10.0,
 'left': {'spInd': 0,
          'spVal': 17.0,
          'left': {'spInd': 0,
                   'spVal': 20.0,
                   'left': 168.34161286956524,
                   'right': 157.0484078846154},
          'right': {'spInd': 0,
                    'spVal': 14.0,
                    'left': 141.06067981481482,
                    'right': 122.90893026923078}},
 'right': {'spInd': 0,
           'spVal': 7.0,
           'left': 94.7066578125,
           'right': {'spInd': 0,
                     'spVal': 5.0,
                     'left': 69.02117757692308,
                     'right': 50.94683665}}}
# Model tree
myTree = regTrees.createTree(trainMat, regTrees.modelLeaf,
                             regTrees.modelErr, (1,20))
yHat = regTrees.createForeCast(myTree, testMat[:,0], regTrees.modelTreeEval)
corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
#0.9760412191380629
# The model tree; leaves hold linear-model coefficients
myTree
{'spInd': 0,
 'spVal': 4.0,
 'left': {'spInd': 0,
          'spVal': 12.0,
          'left': {'spInd': 0,
                   'spVal': 16.0,
                   'left': {'spInd': 0,
                            'spVal': 20.0,
                            'left': matrix([[47.58621512],
                                            [ 5.51066299]]),
                            'right': matrix([[37.54851927],
                                             [ 6.23298637]])},
                   'right': matrix([[43.41251481],
                                    [ 6.37966738]])},
          'right': {'spInd': 0,
                    'spVal': 9.0,
                    'left': matrix([[-2.87684083],
                                    [10.20804482]]),
                    'right': {'spInd': 0,
                              'spVal': 6.0,
                              'left': matrix([[-11.84548851],
                                              [ 12.12382261]]),
                              'right': matrix([[-17.21714265],
                                               [ 13.72153115]])}}},
 'right': matrix([[ 68.87014372],
                  [-11.78556471]])}
# Standard linear regression, for comparison
ws, X, Y = regTrees.linearSolve(trainMat)
for i in range(shape(testMat)[0]):
    yHat[i] = testMat[i,0]*ws[1,0] + ws[0,0]
corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]
#0.9434684235674766
# Standard regression coefficients
ws
#matrix([[37.58916794],
#        [ 6.18978355]])
8. regTrees.py
'''
Created on Feb 4, 2011
Tree-Based Regression Methods
@author: Peter Harrington
'''
from numpy import *

def loadDataSet(fileName):      # general function to parse tab-delimited floats
    dataMat = []                # assume the last column is the target value
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = list(map(float, curLine))  # map all elements to float()
            dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0], :]
    return mat0, mat1

def regLeaf(dataSet):   # returns the value used for each leaf
    return mean(dataSet[:,-1])

def regErr(dataSet):    # total squared error = variance * number of samples
    return var(dataSet[:,-1]) * shape(dataSet)[0]

def linearSolve(dataSet):   # helper function used in two places
    m, n = shape(dataSet)
    X = mat(ones((m, n))); Y = mat(ones((m, 1)))  # create a copy of the data with 1 in the 0th position
    X[:, 1:n] = dataSet[:, 0:n-1]; Y = dataSet[:, -1]  # and strip out Y
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def modelLeaf(dataSet):     # create a linear model and return the coefficients
    ws, X, Y = linearSolve(dataSet)
    return ws

def modelErr(dataSet):      # squared error of the linear model on this subset
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]; tolN = ops[1]
    # if all the target variables are the same value: quit and return value
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:  # exit cond 1
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    # the choice of the best feature is driven by the reduction in RSS error from the mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set(dataSet[:,featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # if the decrease (S - bestS) is less than a threshold, don't do the split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)  # exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # exit cond 3
        return None, leafType(dataSet)
    return bestIndex, bestValue  # returns the best feature to split on
                                 # and the value used for that split

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    # assume dataSet is a NumPy matrix so we can do array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # choose the best split
    if feat is None: return val  # if the splitting hit a stop condition, return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

def isTree(obj):
    return (type(obj).__name__ == 'dict')

def getMean(tree):  # collapse a subtree into the mean of its leaves
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree)  # if we have no test data, collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])):  # if either branch is a tree, try to prune it
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # if they are now both leaves, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'], 2)) + \
                       sum(power(rSet[:,-1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:,-1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        else: return tree
    else: return tree

def regTreeEval(model, inDat):  # a regression-tree leaf is just a constant value
    return float(model)

def modelTreeEval(model, inDat):  # a model-tree leaf is a weight vector: add the bias term and multiply
    n = shape(inDat)[1]
    X = mat(ones((1, n+1)))
    X[:, 1:n+1] = inDat
    return float(X * model)

def treeForeCast(tree, inData, modelEval=regTreeEval):  # follow the splits down to a leaf, then evaluate it
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
        else: return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
        else: return modelEval(tree['right'], inData)

def createForeCast(tree, testData, modelEval=regTreeEval):  # predict for every row of testData
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat