Given the data shown in Table 5.2, generate a binary regression tree using the squared-error loss criterion.
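For reference, the data of Table 5.2 (the same values hard-coded in createDataSet below):

x_i    1     2     3     4     5     6     7     8     9     10
y_i    4.50  4.75  4.91  5.34  5.80  7.05  7.90  8.23  8.70  9.00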
The algorithm for the binary regression tree: within the current region, scan every candidate split point s and solve equation (5.21),

$$\min_{s}\Big[\min_{c_1}\sum_{x_i\in R_1(s)}(y_i-c_1)^2 \;+\; \min_{c_2}\sum_{x_i\in R_2(s)}(y_i-c_2)^2\Big]$$

where R_1(s) = {x | x <= s} and R_2(s) = {x | x > s}. The c1 and c2 in equation (5.21) are the means of the y values over R1 and R2 respectively, since the inner minima are attained exactly at the means. The split point with the smallest total squared error is chosen, and the two resulting regions are then split recursively in the same way.
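For example, splitting between x = 5 and x = 6 gives R1 = {1,...,5} and R2 = {6,...,10}, with c1 = 5.06 and c2 = 8.18, for a total squared error of about 1.06 + 2.30 = 3.36; checking the other eight candidate splits by hand shows this is the minimum, so it becomes the root split.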
The code follows the recursive tree-building style of the decision-tree implementation in Machine Learning in Action; save it as cart.py:
#coding:utf-8
import numpy as np

# Data set from Table 5.2: the y values and their x labels (1..10)
def createDataSet():
    dataSet = [4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.9, 8.23, 8.7, 9]
    datalabel = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return dataSet, datalabel

# Squared error of a data set: the sum of squared deviations from the
# mean (despite the name, this is an SSE, not a mean squared error)
def calcMSE(dataSet):
    means = np.mean(dataSet)
    return sum((i - means) ** 2 for i in dataSet)

# Choose the best split point: the index i that minimizes the total
# squared error of dataSet[:i+1] and dataSet[i+1:]
def chooseBestFeatureToSplit(dataSet):
    nums = len(dataSet) - 1          # number of candidate split points
    if nums == 0:
        return 0
    best = 0
    bestMSE = float('inf')
    for i in range(nums):
        temp = calcMSE(dataSet[:i+1]) + calcMSE(dataSet[i+1:])
        if temp <= bestMSE:
            bestMSE = temp
            best = i
    return best

# Build the tree recursively over the index range [left, right).
# Each internal node stores the x label of its split point, so the
# left subtree covers [left, bestchoose) and the right subtree
# covers [bestchoose+1, right).
def createTree(dataSet, datalabel, left, right):
    if right - left == 1:            # a single point: make it a leaf
        return datalabel[left]
    if left >= right:                # empty region
        return -1
    # best split index within dataSet[left:right]; adding left gives
    # the index in the original data set
    bestchoose = left + chooseBestFeatureToSplit(dataSet[left:right])
    mytree = {datalabel[bestchoose]: {}}
    mytree[datalabel[bestchoose]]['left'] = createTree(dataSet, datalabel, left, bestchoose)
    mytree[datalabel[bestchoose]]['right'] = createTree(dataSet, datalabel, bestchoose + 1, right)
    return mytree
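As a quick sanity check (a minimal sketch, not part of cart.py), one can print the total squared error m for every candidate split of the whole data set and compare with the hand computation above:

import cart

dataSet, datalabel = cart.createDataSet()
# m for the split between x = i+1 and x = i+2
for i in range(len(dataSet) - 1):
    m = cart.calcMSE(dataSet[:i+1]) + cart.calcMSE(dataSet[i+1:])
    print('split after x = %d: m = %.4f' % (i + 1, m))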
Usage:
import cart
mydat, myla = cart.createDataSet()
myt = cart.createTree(mydat, myla, 0, len(mydat))
print(myt)
Result (no visualization of the tree):
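Hand-tracing the recursion (each internal node key is the x label of the chosen split point; -1 marks an empty branch), the printed dictionary should be:

{5: {'left': {3: {'left': {1: {'left': -1, 'right': 2}}, 'right': 4}}, 'right': {7: {'left': 6, 'right': {8: {'left': -1, 'right': {9: {'left': -1, 'right': 10}}}}}}}}

That is, the root splits the data between x = 5 and x = 6, in agreement with the hand computation above.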