task_2

Task 2: The CART Tree Algorithm

The impurity measure CART uses for choosing split variables is the Gini index; the best split is the one that minimizes GINI_Gain, i.e. the size-weighted impurity of the two child nodes.

Reference: https://blog.csdn.net/u011067360/article/details/24871801
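Written out, the Gini index of a node with class proportions p_1, ..., p_k is Gini = 1 - Σ_k p_k², and the GINI_Gain of a candidate split is the size-weighted sum of the two children's Gini indices. As a minimal sketch (my own addition, not from the referenced post), a Gini score function compatible with the get_score(y, idx) interface of the code below could look like this; note that the classification code below actually plugs in entropy rather than Gini, and the base class accepts either:

import numpy as np

def get_gini(y, idx):
    # assumes y is one-hot encoded, as get_entropy below also assumes;
    # p holds the class frequencies on the subset idx
    p = np.average(y[idx], axis=0)
    return 1.0 - p.dot(p)  # Gini = 1 - sum_k p_k^2

Passing get_score=get_gini to the base class would make the splits minimize GINI_Gain exactly as described above.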

The code below implements the CART tree algorithm: a generic base class plus classification and regression subclasses.
import numpy as np


class Node:
    # split feature index, split threshold, node prediction, child subtrees
    j = None
    theta = None
    p = None
    left = None
    right = None


class DecisionTreeBase:

    def __init__(self, max_depth, get_score, feature_sample_rate=1.0):
        self.max_depth = max_depth
        self.get_score = get_score
        self.feature_sample_rate = feature_sample_rate

    def split_data(self, j, theta, X, idx):
        # partition the sample indices in idx by comparing feature j with theta
        idx1, idx2 = list(), list()
        for i in idx:
            if X[i][j] <= theta:
                idx1.append(i)
            else:
                idx2.append(i)
        return idx1, idx2

    def get_random_features(self, n):
        # randomly subsample a fraction of the n features
        shuffled = np.random.permutation(n)
        size = int(self.feature_sample_rate * n)
        return shuffled[:size]

    def find_best_split(self, X, y, idx):
        m, n = X.shape
        best_score, best_j, best_theta = float("inf"), -1, float("inf")
        best_idx1, best_idx2 = list(), list()
        selected_j = self.get_random_features(n)
        for j in selected_j:
            # candidate thresholds: the distinct values of feature j
            thetas = set([x[j] for x in X])
            for theta in thetas:
                idx1, idx2 = self.split_data(j, theta, X, idx)
                if min(len(idx1), len(idx2)) == 0:
                    continue
                score1, score2 = self.get_score(y, idx1), self.get_score(y, idx2)
                # size-weighted average of the two children's impurity scores
                w = 1.0 * len(idx1) / len(idx)
                score = w * score1 + (1 - w) * score2
                if score < best_score:
                    best_score, best_j, best_theta = score, j, theta
                    best_idx1, best_idx2 = idx1, idx2
        return best_j, best_theta, best_idx1, best_idx2, best_score

    def generate_tree(self, X, y, idx, d):
        r = Node()
        r.p = np.average(y[idx], axis=0)  # prediction stored at this node
        if d == 0 or len(idx) == 1:
            return r
        current_score = self.get_score(y, idx)
        j, theta, idx1, idx2, score = self.find_best_split(X, y, idx)
        if score >= current_score:
            # splitting no longer reduces impurity: make this a leaf
            return r
        r.j, r.theta = j, theta
        r.left = self.generate_tree(X, y, idx1, d - 1)
        r.right = self.generate_tree(X, y, idx2, d - 1)
        return r

    def fit(self, X, y):
        self.root = self.generate_tree(X, y, range(len(X)), self.max_depth)

    def get_prediction(self, r, x):
        if r.left is None and r.right is None:
            return r.p
        if x[r.j] <= r.theta:
            return self.get_prediction(r.left, x)
        else:
            return self.get_prediction(r.right, x)

    def predict(self, X):
        y = list()
        for i in range(len(X)):
            y.append(self.get_prediction(self.root, X[i]))
        return np.array(y)


# Decision tree classifier built on the CART base class

def get_entropy(y, idx):
    # y is one-hot encoded; p holds the class frequencies on the subset idx
    _, k = y.shape
    p = np.average(y[idx], axis=0)
    # the small random perturbation keeps log() away from log(0)
    return -np.log(p + 0.001 * np.random.rand(k)).dot(p)

class DecisionTreeClassifier(DecisionTreeBase):
    def __init__(self, max_depth=0, feature_sample_rate=1.0):
        super().__init__(max_depth=max_depth,
                         feature_sample_rate=feature_sample_rate,
                         get_score=get_entropy)

    def predict_proba(self, X):
        return super().predict(X)

    def predict(self, X):
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)
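A minimal usage sketch for the classifier (the data below is made up for illustration, not from the original post). Labels must be one-hot encoded, because get_entropy averages y[idx] along axis 0 to obtain class probabilities:

# Illustrative usage with hypothetical data
X_train = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 4.0], [4.0, 3.0]])
y_train = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])  # one-hot labels, k=2 classes

clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
print(clf.predict(np.array([[1.5, 1.5], [3.5, 3.5]])))  # class indices, e.g. [0 1]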





# Decision tree regressor built on the CART base class

def get_var(y, idx):
    # variance of the targets on the subset idx, used as the regression score
    y_avg = np.average(y[idx]) * np.ones(len(idx))
    return np.linalg.norm(y_avg - y[idx], 2) ** 2 / len(idx)

class DecisionTreeRegressor(DecisionTreeBase):
    def __init__(self, max_depth=0, feature_sample_rate=1.0):
        super().__init__(
            max_depth=max_depth,
            feature_sample_rate=feature_sample_rate,
            get_score=get_var)
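And a matching usage sketch for the regressor, again on made-up data; here y is a plain 1-D array of targets, so each leaf prediction is the average target of the samples it holds:

# Illustrative usage with hypothetical data
X_train = np.array([[1.0], [2.0], [3.0], [4.0]])
y_train = np.array([1.1, 1.9, 3.2, 3.8])

reg = DecisionTreeRegressor(max_depth=2)
reg.fit(X_train, y_train)
print(reg.predict(np.array([[1.5], [3.5]])))  # leaf averages for each query point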