Reading notes on Machine Learning in Action (《机器学习实战》)

I've been working through Chapter 9 of Machine Learning in Action recently, where the CART algorithm is used for regression. Running the code under Python 2.7 I hit errors, so I went through it and fixed them. Below is the corrected code for the regression-tree part only:

class treeNode():
    def __init__(self, feat, val, right, left):
        # Note: createTree below actually stores the tree as a nested dict;
        # this class just illustrates what a tree node holds.
        self.featureToSplitOn = feat  # feature to split on
        self.valueOfSplit = val       # value to split at
        self.rightBranch = right      # right subtree
        self.leftBranch = left        # left subtree

from numpy import *
def loadDataSet(fileName):  # load the data set
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # convert every field to float; list() also keeps this working on Python 3
        dataMat.append(fltLine)
    fr.close()
    return dataMat
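
If you do not have the book's ex00.txt at hand, any file in the same format works: one tab-separated row per sample, features first and the target value in the last column. The helper below is just my own throwaway generator for testing; the file name synthetic.txt, the step at x = 0.5 and the noise level are all made up.

# Generate a small synthetic data file in the same tab-separated format (made-up name and values):
# one feature column and one target column, with a step at x = 0.5 so the tree has something to find.
import random
def makeTestFile(fileName='synthetic.txt', n=200):
    fw = open(fileName, 'w')
    for _ in range(n):
        x = random.random()
        y = (1.0 if x > 0.5 else 0.0) + random.gauss(0, 0.1)
        fw.write('%f\t%f\n' % (x, y))
    fw.close()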
    
def binSplitDataSet(dataSet, feature, value):
    # What if the split value sits on the boundary? Splitting at a feature's maximum leaves
    # mat0 empty, since mat0 only takes the rows that are strictly greater than the value.
    mat0 = dataSet[nonzero(dataSet[:,feature]>value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature]<=value)[0],:]
    return mat0, mat1

#testMat = mat(eye(4))
#print(testMat)
#mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
#print('first: ', mat0)
#print('second: ', mat1)

def regLeaf(dataSet):
    return mean(dataSet[:,-1])  # leaf model: the mean of the target values (last column)
    
def regErr(dataSet):
    # Variance of the targets times the row count, i.e. the total squared error.
    # This measures how spread out the data set is (my take: or equivalently, how tightly the values cluster).
    return var(dataSet[:,-1])*shape(dataSet)[0]
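
A quick sanity check with made-up numbers (not data from the book): since var is the mean squared deviation, multiplying it by the number of rows gives back the plain sum of squared deviations from the mean, which is exactly the quantity a regression tree tries to reduce.

# Sanity check (made-up numbers): var * m equals the sum of squared deviations from the mean
checkMat = mat([[1.0], [2.0], [3.0], [4.0]])
print(regErr(checkMat))                                        # 5.0
print(sum(power(checkMat[:,-1] - mean(checkMat[:,-1]), 2)))    # 5.0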
    
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]  # minimum error reduction required before a split is accepted
    tolN = ops[1]  # minimum number of rows allowed on each side of a split
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:  # all target values identical, nothing to gain from splitting
        return None, leafType(dataSet)  # no split index, return a leaf value instead
    m, n = shape(dataSet)  # size of the data set
    S = errType(dataSet)  # error (impurity) of the whole data set
    bestS = inf  # lowest combined error found so far
    bestIndex = 0  # best feature to split on
    bestValue = 0  # best split value; the candidates are finite, they are taken from the data set itself (see below)
    for featIndex in range(n-1):  # n-1 skips the label column, leaving only the feature columns
        uniqueFeatVals = set(dataSet[:,featIndex].T.tolist()[0])
        minFeatVal = min(uniqueFeatVals)
        maxFeatVal = max(uniqueFeatVals)
        for splitVal in uniqueFeatVals:  # with the feature fixed, try every value it actually takes; what about boundary values?
            # Do not split on a boundary value: splitting at the maximum would leave one side empty.
            # (Note this also skips any feature that only takes two distinct values.)
            if (splitVal != minFeatVal) and (splitVal != maxFeatVal):
                mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
                if (shape(mat0)[0]<tolN) or (shape(mat1)[0]<tolN):  # one side would be too small, skip this split
                    continue
                newS = errType(mat0) + errType(mat1)  # combined error of the two halves
                if newS < bestS:  # the lower the combined error, the better
                    bestIndex = featIndex
                    bestValue = splitVal
                    bestS = newS
    if (S - bestS) < tolS:  # the error barely drops, so make a leaf
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0]<tolN) or (shape(mat1)[0]<tolN):
        return None, leafType(dataSet)  # resulting sets too small, make a leaf
    return bestIndex, bestValue
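
Tying back to the commented-out identity-matrix test above: on mat(eye(4)) every feature only takes the two values 0 and 1, so the boundary check skips every candidate split (and with the default tolN=4 a 4-row matrix could not be split anyway). bestS stays at inf, the error-reduction test fails, and the stop signal comes back: None plus the leaf value, which is the mean of the last column.

# Stop-condition demo on the 4x4 test matrix: no valid split exists, so a leaf value is returned
feat, val = chooseBestSplit(mat(eye(4)))
print(feat)  # None
print(val)   # 0.25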

# Build a tree; calls itself recursively
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):  # assume dataSet is a NumPy mat so we can use array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # choose the best split
    if feat is None: return val  # the split hit a stop condition, so return the leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree  

# Build and inspect a tree from the book's ex00.txt sample data
myDat = loadDataSet('ex00.txt')
myMat = mat(myDat)
tree = createTree(myMat)
print(tree)
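
createTree only builds the model; the book covers prediction later in the chapter. As a minimal sketch in the meantime (treePredict is my own helper name, not the book's code), you can walk the nested dict directly. Keep in mind that binSplitDataSet sends the rows with feature value > spVal into the left branch.

# Minimal prediction sketch (my own helper, not from the book): walk the nested dict.
# In createTree, the left branch holds the rows whose feature value is > spVal.
def treePredict(tree, inDat):
    if not isinstance(tree, dict):  # reached a leaf: the stored mean is the prediction
        return float(tree)
    if inDat[tree['spInd']] > tree['spVal']:
        return treePredict(tree['left'], inDat)
    else:
        return treePredict(tree['right'], inDat)

# ex00.txt has a single feature column, so the input is a one-element list (0.7 is just a made-up query point)
print(treePredict(tree, [0.7]))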
