最近在看《机器学习实战》第9章,CART算法用于回归,在python2.7版本下跑,发现代码报错,所以勘误了下。下面只是回归树的部分勘误代码:
class treeNode():
    """A node of a CART tree: the split (feature, value) and two subtrees."""
    def __init__(self, feat, val, right, left):
        # Bug fix: the original assigned to plain locals, so the node
        # stored nothing; the attributes must be set on `self`.
        self.featureToSplitOn = feat  # index of the feature to split on
        self.valueOfSplit = val       # threshold value of the split
        self.rightBranch = right      # right subtree (original comment had left/right swapped)
        self.leftBranch = left        # left subtree
from numpy import *
def loadDataSet(fileName):
    """Load a tab-delimited text file into a list of rows of floats.

    Args:
        fileName: path to a file with one sample per line, fields
            separated by tabs; every field must parse as a float.

    Returns:
        list of lists of floats, ready to be wrapped in numpy.mat().
    """
    dataMat = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            # list(...) keeps this Python 3 compatible: there map() returns
            # a lazy iterator, which would break numpy.mat() downstream.
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Split `dataSet` in two on column `feature` at threshold `value`.

    Rows with dataSet[:, feature] > value go to the first matrix, all
    remaining rows (including exact ties with the threshold) go to the
    second, so a row on the boundary always lands in mat1.
    """
    above = nonzero(dataSet[:, feature] > value)[0]
    below = nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[above, :], dataSet[below, :]
#testMat = mat(eye(4))
#print testMat
#mat0,mat1 = binSplitDataSet(testMat, 1,0.5)
#print 'first: ', mat0
#print 'second',mat1
def regLeaf(dataSet):
    """Leaf model for regression: the mean of the target (last) column."""
    return dataSet[:, -1].mean()
def regErr(dataSet):
    """Total squared error of the target column.

    Population variance times the row count, i.e. the sum of squared
    deviations from the mean — a measure of how "mixed" the set is.
    """
    return dataSet[:, -1].var() * shape(dataSet)[0]
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Pick the best (feature, value) pair for splitting `dataSet`.

    Returns (None, leaf) when no split is worthwhile: the targets are
    already constant, the best split lowers the error by less than
    ops[0], or either side of the split would hold fewer than ops[1]
    rows. Otherwise returns (featureIndex, splitValue).
    """
    tolS, tolN = ops  # min error reduction / min rows per branch
    # All target values identical: nothing left to model, make a leaf.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    S = errType(dataSet)  # error of the unsplit set — the baseline
    bestS, bestIndex, bestValue = inf, 0, 0
    for featIndex in range(n - 1):  # last column is the target, skip it
        candidates = set(dataSet[:, featIndex].T.tolist()[0])
        lo = min(candidates)
        hi = max(candidates)
        for splitVal in candidates:
            # Splitting at an extreme value cannot separate the data.
            if splitVal == lo or splitVal == hi:
                continue
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if shape(mat0)[0] < tolN or shape(mat1)[0] < tolN:
                continue  # one branch would be too small
            newS = errType(mat0) + errType(mat1)  # combined error after the cut
            if newS < bestS:  # lower combined error is better
                bestIndex, bestValue, bestS = featIndex, splitVal, newS
    # Best cut barely reduces the error: not worth splitting.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if shape(mat0)[0] < tolN or shape(mat1)[0] < tolN:
        return None, leafType(dataSet)  # resulting branches too small
    return bestIndex, bestValue
#创建一颗树,递归调用自己
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):#assume dataSet is NumPy Mat so we can array filtering
feat, val = chooseBestSplit(dataSet, leafType, errType, ops)#choose the best split
if feat == None: return val #if the splitting hit a stop condition return val
retTree = {}
retTree['spInd'] = feat
retTree['spVal'] = val
lSet, rSet = binSplitDataSet(dataSet, feat, val)
retTree['left'] = createTree(lSet, leafType, errType, ops)
retTree['right'] = createTree(rSet, leafType, errType, ops)
return retTree
from numpy import *

if __name__ == '__main__':
    # Guard the demo so importing this module does not trigger file I/O.
    myDat = loadDataSet('ex00.txt')
    myMat = mat(myDat)
    tree = createTree(myMat)
    # A bare `tree` expression only echoes in the REPL; as a script it
    # was a no-op — print the result explicitly.
    print(tree)