随机森林Python实现

最新推荐文章于 2024-06-19 00:54:28 发布

季夏p

最新推荐文章于 2024-06-19 00:54:28 发布

阅读量2.6k

点赞数 1

文章标签：数据挖掘、机器学习、大数据

本文链接：https://blog.csdn.net/weixin_44047403/article/details/121455031

版权

用 Python 手动实现随机森林，代码仅供参考。运行前需要先导入依赖：import math、import numpy as np、import pandas as pd。

class RondomForest(object):
    def __init__(self,method):
        self.method = method
        self.feature =[]
        self.tree_size=0

    #分类-计算该切分的Gini系数
    def Gini_index(self, ValueCount):
        """Return the Gini impurity of a node given its per-class counts.

        ValueCount: pandas Series of class counts, as produced by
            ``Series.value_counts()``. Any number of classes and any label
            values are accepted.

        Returns 0 for an empty or single-class node, otherwise
        ``1 - sum(p_k ** 2)`` over the class proportions ``p_k``.
        """
        # An empty or pure node has no impurity.
        if len(ValueCount) <= 1:
            return 0
        # Work on all counts at once instead of looking up the keys 0 and 1,
        # so the result is also right for non-0/1 labels and for more than
        # two classes.
        proportions = ValueCount / ValueCount.sum()
        return 1 - (proportions ** 2).sum()
 
    #分类-GINI找到变量最佳切分点
    def GINI_split(self,feature,label):
        best_score=10
    #     print(feature.value_counts())
    #     print(feature)
        for i in list(feature.value_counts().index):
            label_split1=label[feature<i]
            label_split2=label[feature>=i]
            weight=len(label_split1)/(len(label_split1)+len(label_split2))
            Gini_score=weight*self.Gini_index(label_split1.value_counts())+(1-weight)*self.Gini_index(label_split2.value_counts())
            if Gini_score<best_score:
                best_score=Gini_score
                best_split=i
        return best_split
        
    #分类-计算熵
    def entropy(self, ValueCount):
        """Return the Shannon entropy (base 2) of a node given its class counts.

        ValueCount: pandas Series of class counts, as produced by
            ``Series.value_counts()``. Any number of classes and any label
            values are accepted.

        Returns 0 for an empty or single-class node, otherwise
        ``-sum(p_k * log2(p_k))`` over the non-zero class proportions.
        """
        if len(ValueCount) <= 1:
            return 0
        total = ValueCount.sum()
        result = 0.0
        for count in ValueCount:
            # A zero count contributes nothing; skipping it also keeps
            # math.log2 away from 0, which would raise ValueError.
            if count > 0:
                ratio = count / total
                result -= ratio * math.log2(ratio)
        return result
    
    #分类-计算增益比
    def entropy_ratio(self,feature,label,split):
        #feature的熵
        entropy_feature=self.entropy(feature.apply(lambda x:0 if x<split else 1).value_counts())
        entropy_label=self.entropy(label.value_counts())
        #按照featrue的split划分后label的熵
        weight=len(label[feature<split])/(len(label[feature<split])+len(label[feature>=split]))
        entropy_split=weight*self.entropy(label[feature<split].value_counts())+(1-weight)*self.entropy(label[feature>=split].value_counts())
        if entropy_feature==0:
            info_gain=0
        else:
            info_gain=(entropy_label-entropy_split)/entropy_feature
        return info_gain
    
    
    #分类-进行一次node选择
    def node_choose(self,x_train,y_train):
        gain_best=0
        for i in range(x_train.shape[1]):
            split=self.GINI_split(x_train.iloc[:,i],y_train)
            info_gain=self.entropy_ratio(x_train.iloc[:,i],y_train,split)
            if info_gain>=gain_best:
                gain_best=info_gain
                best_feature=i
                best_split=split
        return best_feature,best_split,gain_best
    
    #-----------------------------------------------------------------------------------------------------------
    #回归-计算该切分的损失
    def err_index(self, label_split):
        """Return the sum of squared deviations of the labels from their mean.

        label_split: pandas Series of numeric targets on one side of a split.

        Returns 0 when the Series has fewer than two values.
        """
        if len(label_split) <= 1:
            return 0
        # One vectorised pass instead of a Python loop with per-element iloc
        # lookups; this method runs once per candidate threshold.
        deviations = label_split - label_split.mean()
        return (deviations ** 2).sum()
    
    #回归-找到变量最佳切分点
    def Split_point(self,feature,label):
        best_score=99999999999999999
    #     print(feature.value_counts())
    #     print(feature)
        for i in list(feature.value_counts().index):
            label_split1=label[feature<i]
            label_split2=label[feature>=i]
            weight=len(label_split1)/(len(label_split1)+len(label_split2))
            Err_score=weight*self.err_index(label_split1)+(1-weight)*self.err_index(label_split2)
            if Err_score<best_score:
                best_score=Err_score
                best_split=i
        return best_split,best_score
    
    #回归-进行一次node选择
    def node_choose_regression(self,x_train,y_train):
        gain_best=99999999999999999999
        for i in range(x_train.shape[1]):
            split,split_score=self.Split_point(x_train.iloc[:,i],y_train)
            if split_score<=gain_best:
                gain_best=split_score
                best_feature=i
                best_split=split
        return best_feature,best_split    

    
    #--------------------------------------------------------------------------------------------------------------------
    
    #递归完成分类树的构建
    def decision_tree_class(self,x_train_deal,y_train,final_tree,feature_sample_size,tree_deep_size,seed):    
        if x_train_deal.shape[0]==0:
            final_tree.append("end")
        elif x_train_deal.shape[1]==0:
            if len(y_train)>=2:
                final_tree.append(y_train.value_counts().index[0])
            else:
                final_tree.append(y_train.iloc[0])
        elif len(y_train.value_counts())==1:
            final_tree.append(y_train.iloc[0])
        elif self.tree_size>tree_deep_size:
            final_tree.append(y_train.value_counts().index[0])
            self.tree_size=0
        else:
            #进行一次node选择
            self.tree_size=self.tree_size+1
            print(x_train_deal.shape[1])
            feature_num=(x_train_deal.shape[1])//int(1/feature_sample_size)+2
            best_feature,best_split,gain_best=self.node_choose(x_train_deal.iloc[:,0:feature_num],y_train)
            print("best_feature:------------------------",[x_train_deal.iloc[:,0:feature_num].columns[best_feature],best_split,gain_best])
            self.feature.append([x_train_deal.iloc[:,0:feature_num].columns[best_feature],gain_best])
            #根据选择结果切分数据集
            x_train_deal_left=x_train_deal[x_train_deal.iloc[:,best_feature]<best_split]
            y_train_left=y_train[x_train_deal.iloc[:,best_feature]<best_split]
            x_train_deal_right=x_train_deal[x_train_deal.iloc[:,best_feature]>=best_split]
            y_train_right=y_train[x_train_deal.iloc[:,best_feature]>=best_split]
            #剔除已经使用过的变量
            feature_list=list(x_train_deal.columns)
            feature_list.remove(feature_list[best_feature])
            x_train_deal_left=x_train_deal_left[feature_list]
            x_train_deal_right=x_train_deal_right[feature_list]
            print("-----------------------------------------下一个节点----------------------------------------")
            print("还剩",x_train_deal_left.shape[0],x_train_deal_right.shape[0],"行")
            if x_train_deal_left.shape[0]==0:
                final_tree.append(y_train_right.value_counts().index[0])
            elif x_train_deal_right.shape[0]==0:
                final_tree.append(y_train_left.value_counts().index[0])
            else:
                final_tree.append([x_train_deal.columns[best_feature],best_split])
                final_tree.append([])
                final_tree.append([])
                return self.decision_tree_class(x_train_deal_left,y_train_left,final_tree[1],feature_sample_size,tree_deep_size,seed),self.decision_tree_class(x_train_deal_right,y_train_right,final_tree[2],feature_sample_size,tree_deep_size,seed)
    
    
    #递归完成回归树的构建
    def decision_tree_regression(self,x_train_deal,y_train,final_tree):    
        if x_train_deal.shape[0]==0:
            final_tree.append("end")
        elif x_train_deal.shape[1]==0:
            final_tree.append(y_train.mean())  
        elif len(y_train)<=10:
            final_tree.append(y_train.mean()) 
        else:
            #进行一次node选择
            best_feature,best_split=self.node_choose_regression(x_train_deal,y_train)
            print("best_feature:------------------------",[x_train_deal.columns[best_feature],best_split])
            #根据选择结果切分数据集
            x_train_deal_left=x_train_deal[x_train_deal.iloc[:,best_feature]<best_split]
            y_train_left=y_train[x_train_deal.iloc[:,best_feature]<best_split]
            x_train_deal_right=x_train_deal[x_train_deal.iloc[:,best_feature]>=best_split]
            y_train_right=y_train[x_train_deal.iloc[:,best_feature]>=best_split]
            #剔除已经使用过的变量
            feature_list=list(x_train_deal.columns)
            feature_list.remove(feature_list[best_feature])
            x_train_deal_left=x_train_deal_left[feature_list]
            x_train_deal_right=x_train_deal_right[feature_list]
            print("-----------------------------------------下一个节点----------------------------------------")
            print("还剩",x_train_deal_left.shape[0],x_train_deal_right.shape[0],"行")
            if x_train_deal_left.shape[0]==0:
                final_tree.append(y_train_right.mean()) 
            elif x_train_deal_right.shape[0]==0:
                final_tree.append(y_train_left.mean()) 
            else:
                final_tree.append([x_train_deal.columns[best_feature],best_split])
                final_tree.append([])
                final_tree.append([])
                return self.decision_tree_regression(x_train_deal_left,y_train_left,final_tree[1]),self.decision_tree_regression(x_train_deal_right,y_train_right,final_tree[2])  
    
    #Bootstrap抽样
    def Bootstrap(self, x_train, y_train, tree_num, seed):
        """Draw ``tree_num`` bootstrap samples of the training data.

        Each sample has ``len(y_train)`` rows drawn with replacement. Sample
        ``i`` uses random state ``seed + i`` for both the features and the
        labels, so the two stay row-aligned.

        x_train: DataFrame of features.
        y_train: pandas Series of labels for the same rows.
        tree_num: number of samples to draw.
        seed: base seed; sample ``i`` uses ``seed + i``.

        Returns ``(list of feature samples, list of label samples)``.
        """
        x_train_bootstrap = []
        y_train_bootstrap = []
        n = len(y_train)
        for i in range(tree_num):
            # Passing random_state draws the same rows as seeding NumPy's
            # global generator would, without overwriting that global state.
            state = seed + i
            x_train_bootstrap.append(x_train.sample(n=n, replace=True, random_state=state))
            y_train_bootstrap.append(y_train.sample(n=n, replace=True, random_state=state))
        return x_train_bootstrap, y_train_bootstrap
    
    
    #fit       
    def tree_construction(self,x_train,y_train,tree_num,feature_sample_size,tree_deep_size,seed):
        final_tree_list=[]
        x_train_bootstrap,y_train_bootstrap=self.Bootstrap(x_train,y_train,tree_num,seed)
        for i in range(tree_num):
            final_tree=[]
            x_train_deal=x_train_bootstrap[i]
            y_train_deal=y_train_bootstrap[i]
            if self.method=="classification":
                self.decision_tree_class(x_train_deal,y_train_deal,final_tree,feature_sample_size,tree_deep_size,seed)
            if self.method=="regression":
                self.decision_tree_regression(x_train_deal,y_train_deal,final_tree)  
            final_tree_list.append(final_tree)
        return final_tree_list
    
    def oob_estimate(self,x_train,y_train,tree_num,feature_sample_size,tree_deep_size,seed):
        final_tree_list=[]
        x_train_bootstrap,y_train_bootstrap=self.Bootstrap(x_train,y_train,tree_num,seed)
        for i in range(tree_num):
            final_tree=[]
            x_train_deal=x_train_bootstrap[i]
            y_train_deal=y_train_bootstrap[i]
            if self.method=="classification":
                self.decision_tree_class(x_train_deal,y_train_deal,final_tree,feature_sample_size,tree_deep_size,seed)
            if self.method=="regression":
                self.decision_tree_regression(x_train_deal,y_train_deal,final_tree)  
            final_tree_list.append(final_tree)
        #oob
#         acc=0
        TP=0
        FP=0
        FN=0
        TN=0
        total=len(x_train.index)
        for k in x_train.index:
            #找到包外tree
            oob_tree_list=[]
            for j in range(tree_num):
                x_train_deal=x_train_bootstrap[j]
                y_train_deal=y_train_bootstrap[j]
                if k not in y_train_deal.index:
#                     final_tree=[]
#                     self.decision_tree_class(x_train_deal,y_train_deal,final_tree,seed)
                    oob_tree_list.append(final_tree_list[j])
            #用包外tree进行预测
            pred_list=[]
            x_test=pd.DataFrame(x_train.iloc[k,:]).T
            for i in range(len(oob_tree_list)):
                result=[]
                final_tree=oob_tree_list[i]
                self.prediction(final_tree,x_test,result)
                pred=pd.DataFrame()
                for p in result:
                    pred=pred.append(p)
                pred=pred.sort_index()
                pred_list.append(pred)
#             print(pred_list)
            #如果一棵树也没有就跳过
            if len(pred_list)==0:
                total=total-1
                continue
                
            pred_final=pred_list[0]
            for j in range(1,len(pred_list)):
                pred_final=pred_final+pred_list[j]
            pred_final=pred_final["predict"].apply(lambda x:1 if (x/2)>0.5 else 0)
            if (pred_final.iloc[0]==1) & (y_train.iloc[k]==1) :
                TP=TP+1
            if (pred_final.iloc[0]==1) & (y_train.iloc[k]==0) :
                FP=FP+1
            if (pred_final.iloc[0]==0) & (y_train.iloc[k]==1) :
                FN=FN+1
            if (pred_final.iloc[0]==0) & (y_train.iloc[k]==0):
                TN=TN+1
        accuracy=(TP+TN)/(TP+FP+FN+TN)
        print(TP+FP+FN+TN)
        print("accuracy",accuracy)
        if (TP+FP!=0) and (TP+FN !=0):
            precision=TP/(TP+FP)
            recall=TP/(TP+FN)
            F1=2*precision*recall/(precision+recall)
            print("precision",precision)
            print("recall",recall) 
            print("F1",F1)
        return accuracy
    
    def feature_importance(self):
        feature_score=pd.DataFrame(self.feature,columns=["feature","score"])
        feature_rank=feature_score.groupby("feature")["score"].mean().sort_values(ascending=False)
        return feature_rank
    
    #预测方法
    def prediction(self,final_tree,x_test,result):
        if len(final_tree)==1:
            x_test["predict"]=final_tree[0]
            result=result.append(pd.DataFrame(x_test["predict"]))
        elif x_test.shape[0]==0:
            i=0
        else:
            x_left=x_test[x_test[final_tree[0][0]]<final_tree[0][1]]
            x_right=x_test[x_test[final_tree[0][0]]>=final_tree[0][1]]
            return self.prediction(final_tree[1],x_left,result),self.prediction(final_tree[2],x_right,result)
    
    #输出预测结果
    def prediction_result(self,final_tree_list,x_test):
        pred_list=[]
        for i in range(len(final_tree_list)):
            result=[]
            final_tree=final_tree_list[i]
            self.prediction(final_tree,x_test,result)
            pred=pd.DataFrame()
            for p in result:
                pred=pred.append(p)
            pred=pred.sort_index()
            pred_list.append(pred)
        pred_final=pred_list[0]
        for j in range(1,len(pred_list)):
            pred_final=pred_final+pred_list[j]
        pred_final=pred_final["predict"].apply(lambda x:1 if (x/2)>0.5 else 0)
        return pd.DataFrame(pred_final)

    #输出正确率
    def accuracy(self, pred, y_test):
        """Return the fraction of rows where the prediction equals the label.

        pred: DataFrame whose first column holds the predictions.
        y_test: pandas Series of true labels, compared by position.

        Raises ZeroDivisionError when ``pred`` has no rows.
        """
        acc = 0
        for i in range(pred.shape[0]):
            # iloc[i, 0] is purely positional. iloc[i][0] looked the 0 up as a
            # label on the row, which only worked through a deprecated
            # fallback and failed for integer column names other than 0.
            if pred.iloc[i, 0] == y_test.iloc[i]:
                acc = acc + 1
        return acc / pred.shape[0]
    
    
    #输出混淆矩阵
    def confusion_matrix(self, pred, y_test):
        """Count the binary confusion-matrix cells and print summary metrics.

        pred: DataFrame whose first column holds predictions (1 or 0).
        y_test: pandas Series of true labels (1 or 0), compared by position.

        Prints accuracy when at least one row was counted, and precision,
        recall and F1 when their denominators are non-zero.
        Returns ``[[TP, FP], [FN, TN]]``.
        """
        TP = 0
        FP = 0
        FN = 0
        TN = 0
        for i in range(pred.shape[0]):
            # Positional access; iloc[i][0] relied on a deprecated label fallback.
            predicted = pred.iloc[i, 0]
            actual = y_test.iloc[i]
            if predicted == 1 and actual == 1:
                TP = TP + 1
            if predicted == 1 and actual == 0:
                FP = FP + 1
            if predicted == 0 and actual == 1:
                FN = FN + 1
            if predicted == 0 and actual == 0:
                TN = TN + 1

        total = TP + FP + FN + TN
        if total != 0:
            print("accuracy", (TP + TN) / total)
        # Precision and recall are undefined without predicted or actual
        # positives; skip them instead of dividing by zero.
        if (TP + FP != 0) and (TP + FN != 0):
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            F1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0.0
            print("precision", precision)
            print("recall", recall)
            print("F1", F1)
        return [[TP, FP], [FN, TN]]
    
    #输出误差 MSE MAE
    def Err(self, pred, y_test):
        """Print and return the mean squared error and mean absolute error.

        pred: DataFrame whose first column holds the predictions.
        y_test: pandas Series of true values, compared by position.

        Returns ``(MSE, MAE)``.
        Raises ZeroDivisionError when ``pred`` has no rows.
        """
        n = pred.shape[0]
        # Positional access; iloc[i][0] relied on a deprecated label fallback.
        errors = [pred.iloc[i, 0] - y_test.iloc[i] for i in range(n)]
        MSE = sum(e ** 2 for e in errors) / n
        MAE = sum(abs(e) for e in errors) / n
        print("MSE:", MSE)
        print("MAE", MAE)
        return MSE, MAE

# Build the random forest.
# NOTE: x_train, y_train, x_test and y_test are used below but are not defined
# in this listing; they must be prepared before this part is run.

# Parameter settings
tree_num=2
feature_sample_size=1/5
tree_deep_size=10
seed=1
# Train the model
model=RondomForest(method = "classification")
final_tree_list=model.tree_construction(x_train,y_train,tree_num,feature_sample_size,tree_deep_size,seed)

# Performance on the test set
pred=model.prediction_result(final_tree_list,x_test)
acc=model.accuracy(pred,y_test)
con_m=model.confusion_matrix(pred,y_test)

# Out-of-bag estimate (uses a fresh model instance)
model=RondomForest(method = "classification")
oob_estimate=model.oob_estimate(x_train,y_train,tree_num,feature_sample_size,tree_deep_size,seed)

季夏p

关注

1
点赞
踩
17

收藏

觉得还不错? 一键收藏
2
评论
随机森林Python实现

用python手动实现随机森林，代码仅供参考class RondomForest(object): def __init__(self,method): self.method = method self.feature =[] self.tree_size=0 #分类-计算该切分的Gini系数 def Gini_index(self,ValueCount): if len(ValueCount)==0: .
复制链接

扫一扫