# Code saved here; implemented in Python 3. For a detailed walkthrough see
# "Machine Learning in Action" (Peter Harrington).
# 1. regTrees.py:
#CART regression tree
# python3修改了部分代码 http://blog.csdn.net/sinat_17196995/article/details/69621687# 9-1
from numpy import *
# 获取数据集
def loadDataSet(fileName):
    """Load a tab-separated numeric data file into a list of float rows.

    Args:
        fileName: path to a text file where every line holds tab-separated
            numbers (last column is treated as the regression target by the
            rest of this module).

    Returns:
        list[list[float]]: one inner list of floats per input line.
    """
    dataMat = []
    # 'with' guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr:  # iterate lazily instead of readlines()
            curLine = line.strip().split('\t')
            # map every field to float (Python 3: map returns an iterator)
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
# 将dataSet切分成2个子集并返回
def binSplitDataSet(dataSet, feature, value):
    """Binary-split a data matrix on one feature.

    Args:
        dataSet: numpy matrix of samples (rows) by columns (features).
        feature: column index to split on.
        value: threshold for that column.

    Returns:
        (mat0, mat1): rows with feature > value, and rows with
        feature <= value, respectively.
    """
    # Row indices on each side of the threshold.
    greater_rows = nonzero(dataSet[:, feature] > value)[0]
    leq_rows = nonzero(dataSet[:, feature] <= value)[0]
    mat0 = dataSet[greater_rows, :]
    mat1 = dataSet[leq_rows, :]
    return mat0, mat1
'''
d:
cd pythonwp
cd ch09
python
import regTrees
from importlib import reload
reload(regTrees)
from numpy import *
testMat=mat(eye(4))
mat0,mat1=regTrees.binSplitDataSet(testMat,0,0.5)
mat0
mat1
'''
# 9-2
# 生成叶节点
def regLeaf(dataSet):
    """Leaf model for a regression tree: the mean of the target column.

    The last column of dataSet is the regression target; all other
    columns are features.
    """
    targets = dataSet[:, -1]
    return mean(targets)
# 误差估计值,平方误差和(均方误差*总样本数)
def regErr(dataSet):
    """Total squared error of the target (last) column.

    Computed as population variance times the sample count, i.e. the
    sum of squared deviations from the mean.
    """
    n_samples = shape(dataSet)[0]
    return var(dataSet[:, -1]) * n_samples
# 用最佳方式切分数据集,并生成相应的叶节点
def chooseBestSplit(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
#数据集,生成叶节点,误差估计函数,ops=(tolS,tolN)
tolS=ops[0]#容许的误差下降值
tolN=ops[1]#切分的最少样本数
if len(set(dataSet[:,-1].T.tolist()[0])) == 1:#【停止条件1】--所有值相等,退出(set相当于R的unique)
return None,leafType(dataSet)
m,n=shape(dataSet)
S=errType(dataSet)#当前总误差
bestS=inf;bestIndex=0;bestValue=0 #最优切分对应的误差、切分特征、特征值
for featIndex in range(n-1): #for each feature
for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]): #for each split value of the feature
mat0,mat1=binSplitDataSet(dataSet,featIndex,splitVal) #二元切分数据集
if (shape(mat0)[0]<tol