"""
用python手动实现随机森林,代码仅供参考

A hand-written random forest implemented in Python; the code is for reference only.
"""
import math

import numpy as np
import pandas as pd
class RondomForest(object):
    """Minimal random forest for binary classification or regression.

    Trees are stored as nested lists:

    * a leaf is ``[value]``;
    * an internal node is ``[[feature_name, threshold], left, right]`` where
      rows with ``feature < threshold`` go to ``left`` and rows with
      ``feature >= threshold`` go to ``right``.

    Training data are a ``pandas.DataFrame`` of features and a
    ``pandas.Series`` of targets sharing the same index.  Every feature is
    used at most once along a branch because the chosen column is dropped
    before the children are grown.

    NOTE(review): classification splits only look at the *first*
    ``feature_num`` columns of the remaining features, not at a random
    subset, and the ``seed`` argument of ``decision_tree_class`` is unused.
    """

    def __init__(self, method):
        """Create a forest.

        Args:
            method: ``"classification"`` or ``"regression"``.
        """
        self.method = method
        # [feature_name, gain_ratio] pairs, one per classification split.
        self.feature = []
        # Number of classification split nodes grown so far (informational;
        # the depth limit is tracked per branch, not with this counter).
        self.tree_size = 0

    # ------------------------------------------------------------------
    # Classification helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _positive_counts(ValueCount):
        """Return the strictly positive class counts as a float array."""
        counts = np.asarray(ValueCount, dtype=float)
        return counts[counts > 0]

    def Gini_index(self, ValueCount):
        """Return the Gini impurity of a set described by its class counts.

        Args:
            ValueCount: class counts, e.g. the result of
                ``Series.value_counts()``.  Any number of classes and any
                label values are accepted.

        Returns:
            ``0`` when at most one class is present, otherwise
            ``1 - sum(p_i ** 2)``.
        """
        counts = self._positive_counts(ValueCount)
        if counts.size <= 1:
            return 0
        proportions = counts / counts.sum()
        return 1 - float(np.sum(proportions ** 2))

    def GINI_split(self, feature, label):
        """Find the threshold of ``feature`` with the lowest weighted Gini.

        Every distinct value of ``feature`` is tried as a threshold; rows
        with ``feature < threshold`` form one side and rows with
        ``feature >= threshold`` the other.

        Raises:
            ValueError: if ``feature`` has no non-missing value.
        """
        best_score = float("inf")
        best_split = None
        for candidate in feature.value_counts().index:
            label_below = label[feature < candidate]
            label_above = label[feature >= candidate]
            n_below = len(label_below)
            n_above = len(label_above)
            weight = n_below / (n_below + n_above)
            score = (
                weight * self.Gini_index(label_below.value_counts())
                + (1 - weight) * self.Gini_index(label_above.value_counts())
            )
            # Strict comparison: the first candidate reaching the minimum wins.
            if score < best_score:
                best_score = score
                best_split = candidate
        if best_split is None:
            raise ValueError("feature has no non-missing values to split on")
        return best_split

    def entropy(self, ValueCount):
        """Return the Shannon entropy (base 2) of a set given its class counts.

        Returns ``0`` when at most one class is present.
        """
        counts = self._positive_counts(ValueCount)
        if counts.size <= 1:
            return 0
        proportions = counts / counts.sum()
        return -float(np.sum(proportions * np.log2(proportions)))

    def entropy_ratio(self, feature, label, split):
        """Return the gain ratio of splitting ``label`` by ``feature < split``.

        The information gain of the split is divided by the entropy of the
        split itself; ``0`` is returned when the split puts every row on one
        side (split entropy of zero) or when no row can be compared.
        """
        below = feature < split
        label_below = label[below]
        label_above = label[feature >= split]
        n_below = len(label_below)
        n_above = len(label_above)
        if n_below + n_above == 0:
            return 0

        # Entropy of the binary partition induced by the threshold.
        entropy_feature = self.entropy(below.value_counts())
        if entropy_feature == 0:
            return 0

        entropy_label = self.entropy(label.value_counts())
        weight = n_below / (n_below + n_above)
        entropy_split = (
            weight * self.entropy(label_below.value_counts())
            + (1 - weight) * self.entropy(label_above.value_counts())
        )
        return (entropy_label - entropy_split) / entropy_feature

    def node_choose(self, x_train, y_train):
        """Pick the classification split with the highest gain ratio.

        The threshold of each column is chosen with ``GINI_split`` and the
        columns are then ranked by ``entropy_ratio``.  On ties the later
        column wins.

        Returns:
            ``(best_feature, best_split, gain_best)`` where ``best_feature``
            is the column position inside ``x_train``.

        Raises:
            ValueError: if ``x_train`` has no column that can be evaluated.
        """
        gain_best = float("-inf")
        best_feature = None
        best_split = None
        for position in range(x_train.shape[1]):
            column = x_train.iloc[:, position]
            split = self.GINI_split(column, y_train)
            info_gain = self.entropy_ratio(column, y_train, split)
            if info_gain >= gain_best:
                gain_best = info_gain
                best_feature = position
                best_split = split
        if best_feature is None:
            raise ValueError("no feature could be evaluated for a split")
        return best_feature, best_split, gain_best

    # ------------------------------------------------------------------
    # Regression helpers
    # ------------------------------------------------------------------
    def err_index(self, label_split):
        """Return the sum of squared deviations from the mean of ``label_split``.

        Returns ``0`` for fewer than two values.
        """
        if len(label_split) <= 1:
            return 0
        deviations = label_split - label_split.mean()
        return float((deviations ** 2).sum())

    def Split_point(self, feature, label):
        """Find the threshold of ``feature`` with the lowest weighted squared error.

        Returns:
            ``(best_split, best_score)``.

        Raises:
            ValueError: if ``feature`` has no non-missing value.
        """
        best_score = float("inf")
        best_split = None
        for candidate in feature.value_counts().index:
            label_below = label[feature < candidate]
            label_above = label[feature >= candidate]
            n_below = len(label_below)
            n_above = len(label_above)
            weight = n_below / (n_below + n_above)
            score = (
                weight * self.err_index(label_below)
                + (1 - weight) * self.err_index(label_above)
            )
            if score < best_score:
                best_score = score
                best_split = candidate
        if best_split is None:
            raise ValueError("feature has no non-missing values to split on")
        return best_split, best_score

    def node_choose_regression(self, x_train, y_train):
        """Pick the regression split with the lowest weighted squared error.

        On ties the later column wins.

        Returns:
            ``(best_feature, best_split)`` where ``best_feature`` is the
            column position inside ``x_train``.

        Raises:
            ValueError: if ``x_train`` has no column that can be evaluated.
        """
        score_best = float("inf")
        best_feature = None
        best_split = None
        for position in range(x_train.shape[1]):
            split, split_score = self.Split_point(x_train.iloc[:, position], y_train)
            if split_score <= score_best:
                score_best = split_score
                best_feature = position
                best_split = split
        if best_feature is None:
            raise ValueError("no feature could be evaluated for a split")
        return best_feature, best_split

    # ------------------------------------------------------------------
    # Tree construction
    # ------------------------------------------------------------------
    @staticmethod
    def _partition(x_train_deal, y_train, best_feature, best_split):
        """Split the rows on one column and drop that column from both halves."""
        column = x_train_deal.iloc[:, best_feature]
        goes_left = column < best_split
        goes_right = column >= best_split
        remaining = [
            position
            for position in range(x_train_deal.shape[1])
            if position != best_feature
        ]
        x_left = x_train_deal[goes_left].iloc[:, remaining]
        x_right = x_train_deal[goes_right].iloc[:, remaining]
        return x_left, y_train[goes_left], x_right, y_train[goes_right]

    def decision_tree_class(self, x_train_deal, y_train, final_tree,
                            feature_sample_size, tree_deep_size, seed, depth=0):
        """Recursively grow a classification tree into ``final_tree``.

        Args:
            x_train_deal: feature frame for the current node.
            y_train: labels aligned with ``x_train_deal``.
            final_tree: empty list that receives this node (filled in place).
            feature_sample_size: fraction in ``(0, 1]``; the number of
                leading columns considered at a node is
                ``n_columns // int(1 / feature_sample_size) + 2``.
            tree_deep_size: a node deeper than this becomes a majority leaf.
            seed: unused; kept for interface compatibility.
            depth: depth of the current node (0 for the root).  Callers
                normally leave the default.
        """
        if x_train_deal.shape[0] == 0:
            final_tree.append("end")
            return None
        if x_train_deal.shape[1] == 0:
            # No feature left: fall back to the majority label.
            if len(y_train) >= 2:
                final_tree.append(y_train.value_counts().index[0])
            else:
                final_tree.append(y_train.iloc[0])
            return None
        if len(y_train.value_counts()) == 1:
            final_tree.append(y_train.iloc[0])
            return None
        if depth > tree_deep_size:
            final_tree.append(y_train.value_counts().index[0])
            return None

        self.tree_size = self.tree_size + 1
        print(x_train_deal.shape[1])
        # Guard against int(1 / size) == 0 when size > 1.
        divisor = max(1, int(1 / feature_sample_size))
        feature_num = x_train_deal.shape[1] // divisor + 2
        candidates = x_train_deal.iloc[:, 0:feature_num]
        best_feature, best_split, gain_best = self.node_choose(candidates, y_train)
        print("best_feature:------------------------",[candidates.columns[best_feature],best_split,gain_best])
        self.feature.append([candidates.columns[best_feature], gain_best])

        # `candidates` is a prefix of the columns, so the position is also
        # valid in the full frame.
        x_left, y_left, x_right, y_right = self._partition(
            x_train_deal, y_train, best_feature, best_split
        )
        print("-----------------------------------------下一个节点----------------------------------------")
        print("还剩",x_left.shape[0],x_right.shape[0],"行")

        if x_left.shape[0] == 0:
            final_tree.append(y_right.value_counts().index[0])
            return None
        if x_right.shape[0] == 0:
            final_tree.append(y_left.value_counts().index[0])
            return None

        final_tree.append([x_train_deal.columns[best_feature], best_split])
        final_tree.append([])
        final_tree.append([])
        return (
            self.decision_tree_class(
                x_left, y_left, final_tree[1],
                feature_sample_size, tree_deep_size, seed, depth + 1,
            ),
            self.decision_tree_class(
                x_right, y_right, final_tree[2],
                feature_sample_size, tree_deep_size, seed, depth + 1,
            ),
        )

    def decision_tree_regression(self, x_train_deal, y_train, final_tree,
                                 min_leaf_size=10):
        """Recursively grow a regression tree into ``final_tree``.

        Args:
            x_train_deal: feature frame for the current node.
            y_train: targets aligned with ``x_train_deal``.
            final_tree: empty list that receives this node (filled in place).
            min_leaf_size: a node holding at most this many rows becomes a
                leaf predicting the mean target.
        """
        if x_train_deal.shape[0] == 0:
            final_tree.append("end")
            return None
        if x_train_deal.shape[1] == 0 or len(y_train) <= min_leaf_size:
            final_tree.append(y_train.mean())
            return None

        best_feature, best_split = self.node_choose_regression(x_train_deal, y_train)
        print("best_feature:------------------------",[x_train_deal.columns[best_feature],best_split])

        x_left, y_left, x_right, y_right = self._partition(
            x_train_deal, y_train, best_feature, best_split
        )
        print("-----------------------------------------下一个节点----------------------------------------")
        print("还剩",x_left.shape[0],x_right.shape[0],"行")

        if x_left.shape[0] == 0:
            final_tree.append(y_right.mean())
            return None
        if x_right.shape[0] == 0:
            final_tree.append(y_left.mean())
            return None

        final_tree.append([x_train_deal.columns[best_feature], best_split])
        final_tree.append([])
        final_tree.append([])
        return (
            self.decision_tree_regression(x_left, y_left, final_tree[1], min_leaf_size),
            self.decision_tree_regression(x_right, y_right, final_tree[2], min_leaf_size),
        )

    # ------------------------------------------------------------------
    # Forest construction
    # ------------------------------------------------------------------
    def Bootstrap(self, x_train, y_train, tree_num, seed):
        """Draw ``tree_num`` bootstrap samples of the training data.

        Row positions are drawn once per tree from a private
        ``RandomState(seed + i)`` and applied to both ``x_train`` and
        ``y_train``, so features and targets always stay paired and the
        global NumPy random state is left untouched.

        Returns:
            ``(x_train_bootstrap, y_train_bootstrap)``: two lists of length
            ``tree_num``.

        Raises:
            ValueError: if ``x_train`` and ``y_train`` differ in length.
        """
        n = len(y_train)
        if len(x_train) != n:
            raise ValueError("x_train and y_train must have the same number of rows")
        x_train_bootstrap = []
        y_train_bootstrap = []
        for i in range(tree_num):
            rng = np.random.RandomState(seed + i)
            positions = rng.choice(n, size=n, replace=True)
            x_train_bootstrap.append(x_train.iloc[positions])
            y_train_bootstrap.append(y_train.iloc[positions])
        return x_train_bootstrap, y_train_bootstrap

    def _grow_tree(self, x_sample, y_sample, feature_sample_size, tree_deep_size, seed):
        """Grow one tree of the kind selected by ``self.method``."""
        final_tree = []
        if self.method == "classification":
            self.decision_tree_class(
                x_sample, y_sample, final_tree,
                feature_sample_size, tree_deep_size, seed,
            )
        elif self.method == "regression":
            self.decision_tree_regression(x_sample, y_sample, final_tree)
        else:
            raise ValueError(
                "method must be 'classification' or 'regression', got %r" % (self.method,)
            )
        return final_tree

    def tree_construction(self, x_train, y_train, tree_num, feature_sample_size,
                          tree_deep_size, seed):
        """Fit the forest: one tree per bootstrap sample.

        Returns:
            list of ``tree_num`` trees in the nested-list format described
            on the class.

        Raises:
            ValueError: if ``self.method`` is not a supported mode.
        """
        x_train_bootstrap, y_train_bootstrap = self.Bootstrap(
            x_train, y_train, tree_num, seed
        )
        return [
            self._grow_tree(x_sample, y_sample, feature_sample_size, tree_deep_size, seed)
            for x_sample, y_sample in zip(x_train_bootstrap, y_train_bootstrap)
        ]

    def oob_estimate(self, x_train, y_train, tree_num, feature_sample_size,
                     tree_deep_size, seed):
        """Fit a forest and print/return its out-of-bag accuracy.

        Each training row is predicted only by the trees whose bootstrap
        sample does not contain that row's index label; rows that appear in
        every sample are skipped.  The metrics assume binary 0/1 labels.

        Returns:
            the out-of-bag accuracy.

        Raises:
            ValueError: if no row could be scored.
        """
        x_train_bootstrap, y_train_bootstrap = self.Bootstrap(
            x_train, y_train, tree_num, seed
        )
        final_tree_list = [
            self._grow_tree(x_sample, y_sample, feature_sample_size, tree_deep_size, seed)
            for x_sample, y_sample in zip(x_train_bootstrap, y_train_bootstrap)
        ]
        # Index labels seen by each tree, built once for O(1) membership tests.
        in_bag_labels = [set(y_sample.index) for y_sample in y_train_bootstrap]

        TP = FP = FN = TN = 0
        # Labels decide bag membership; positions select the row, so any
        # index (not only 0..n-1) works.
        for position, row_label in enumerate(x_train.index):
            oob_tree_list = [
                tree
                for tree, bag in zip(final_tree_list, in_bag_labels)
                if row_label not in bag
            ]
            if not oob_tree_list:
                continue
            predicted = self._forest_predict(
                oob_tree_list, x_train.iloc[[position]]
            ).iloc[0]
            actual = y_train.iloc[position]
            if predicted == 1 and actual == 1:
                TP = TP + 1
            elif predicted == 1 and actual == 0:
                FP = FP + 1
            elif predicted == 0 and actual == 1:
                FN = FN + 1
            elif predicted == 0 and actual == 0:
                TN = TN + 1

        scored = TP + FP + FN + TN
        if scored == 0:
            raise ValueError("no out-of-bag prediction could be scored")
        print(scored)
        return self._report_binary_metrics(TP, FP, FN, TN)

    def feature_importance(self):
        """Rank features by their mean gain ratio over all recorded splits.

        Returns:
            ``pandas.Series`` indexed by feature name, sorted descending.
        """
        feature_score = pd.DataFrame(self.feature, columns=["feature", "score"])
        mean_score = feature_score.groupby("feature")["score"].mean()
        return mean_score.sort_values(ascending=False)

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    def prediction(self, final_tree, x_test, result):
        """Route ``x_test`` down one tree and collect the leaf predictions.

        For every leaf reached by at least one row, a one-column frame
        (``"predict"``) indexed like those rows is appended to ``result``.
        ``x_test`` itself is never modified.  Rows whose split feature is
        missing (NaN) match neither branch and get no prediction.
        """
        if x_test.shape[0] == 0:
            return None
        if len(final_tree) == 1:
            result.append(
                pd.DataFrame({"predict": final_tree[0]}, index=x_test.index)
            )
            return None
        feature_name, threshold = final_tree[0]
        x_left = x_test[x_test[feature_name] < threshold]
        x_right = x_test[x_test[feature_name] >= threshold]
        return (
            self.prediction(final_tree[1], x_left, result),
            self.prediction(final_tree[2], x_right, result),
        )

    def _forest_predict(self, final_tree_list, x_test):
        """Combine the trees' predictions for every row of ``x_test``.

        Classification returns 1 when more than half of the voting trees
        predict 1, else 0; regression returns the mean prediction.  The
        result is a Series named ``"predict"`` in the row order of
        ``x_test``.
        """
        if len(final_tree_list) == 0:
            raise ValueError("final_tree_list must contain at least one tree")
        # Work on positions so duplicated or unsorted index labels cannot
        # mix up rows between trees.
        positional = x_test.reset_index(drop=True)
        per_tree = {}
        for number, final_tree in enumerate(final_tree_list):
            result = []
            self.prediction(final_tree, positional, result)
            if result:
                votes = pd.concat(result)["predict"].reindex(positional.index)
            else:
                votes = pd.Series(np.nan, index=positional.index)
            per_tree[number] = votes
        mean_vote = pd.DataFrame(per_tree, index=positional.index).mean(axis=1)
        if self.method == "regression":
            combined = mean_vote
        else:
            combined = mean_vote.apply(lambda share: 1 if share > 0.5 else 0)
        return pd.Series(combined.to_numpy(), index=x_test.index, name="predict")

    def prediction_result(self, final_tree_list, x_test):
        """Predict ``x_test`` with the whole forest.

        Returns:
            ``pandas.DataFrame`` with a single ``"predict"`` column, indexed
            and ordered like ``x_test``.

        Raises:
            ValueError: if ``final_tree_list`` is empty.
        """
        return pd.DataFrame(self._forest_predict(final_tree_list, x_test))

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    @staticmethod
    def _paired_values(pred, y_test):
        """Return first-column predictions and targets as position-aligned arrays."""
        predicted = pred.iloc[:, 0].to_numpy()
        actual = np.ravel(np.asarray(y_test))
        if len(predicted) == 0:
            raise ValueError("pred contains no rows")
        if len(actual) < len(predicted):
            raise ValueError("y_test has fewer entries than pred")
        return predicted, actual[:len(predicted)]

    @staticmethod
    def _report_binary_metrics(TP, FP, FN, TN):
        """Print accuracy and, when defined, precision/recall/F1; return accuracy."""
        accuracy = (TP + TN) / (TP + FP + FN + TN)
        print("accuracy", accuracy)
        if TP + FP != 0 and TP + FN != 0:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            # With no true positive both terms are zero; report F1 as 0.
            if precision + recall:
                F1 = 2 * precision * recall / (precision + recall)
            else:
                F1 = 0.0
            print("precision", precision)
            print("recall", recall)
            print("F1", F1)
        return accuracy

    def accuracy(self, pred, y_test):
        """Return the fraction of rows where ``pred`` matches ``y_test``.

        Rows are compared by position.

        Raises:
            ValueError: if ``pred`` is empty or longer than ``y_test``.
        """
        predicted, actual = self._paired_values(pred, y_test)
        matches = int(np.count_nonzero(predicted == actual))
        return matches / len(predicted)

    def confusion_matrix(self, pred, y_test):
        """Print binary metrics and return ``[[TP, FP], [FN, TN]]``.

        Labels are expected to be 0/1; rows with other values are ignored.
        Precision, recall and F1 are printed only when they are defined.

        Raises:
            ValueError: if no row has 0/1 values in both inputs.
        """
        predicted, actual = self._paired_values(pred, y_test)
        TP = int(np.sum((predicted == 1) & (actual == 1)))
        FP = int(np.sum((predicted == 1) & (actual == 0)))
        FN = int(np.sum((predicted == 0) & (actual == 1)))
        TN = int(np.sum((predicted == 0) & (actual == 0)))
        if TP + FP + FN + TN == 0:
            raise ValueError("no row with 0/1 prediction and label to score")
        self._report_binary_metrics(TP, FP, FN, TN)
        return [[TP, FP], [FN, TN]]

    def Err(self, pred, y_test):
        """Print and return ``(MSE, MAE)`` between ``pred`` and ``y_test``.

        Rows are compared by position.

        Raises:
            ValueError: if ``pred`` is empty or longer than ``y_test``.
        """
        predicted, actual = self._paired_values(pred, y_test)
        errors = predicted.astype(float) - actual.astype(float)
        MSE = float(np.mean(errors ** 2))
        MAE = float(np.mean(np.abs(errors)))
        print("MSE:",MSE)
        print("MAE",MAE)
        return MSE, MAE
# Build and evaluate the random forest.
# NOTE: x_train, y_train, x_test and y_test are not defined in this file and
# must already exist when this script section runs.

# Hyper-parameters
tree_num, feature_sample_size, tree_deep_size, seed = 2, 1 / 5, 10, 1

# Train the model
model = RondomForest(method="classification")
final_tree_list = model.tree_construction(
    x_train,
    y_train,
    tree_num=tree_num,
    feature_sample_size=feature_sample_size,
    tree_deep_size=tree_deep_size,
    seed=seed,
)

# Performance on the test set
pred = model.prediction_result(final_tree_list, x_test)
acc = model.accuracy(pred, y_test)
con_m = model.confusion_matrix(pred, y_test)

# Out-of-bag estimate, computed with a fresh model instance
model = RondomForest(method="classification")
oob_estimate = model.oob_estimate(
    x_train,
    y_train,
    tree_num=tree_num,
    feature_sample_size=feature_sample_size,
    tree_deep_size=tree_deep_size,
    seed=seed,
)