sklearn机器学习—决策树之画图
--------仅用于个人学习知识整理和R语言/python代码整理
--------本文所使用数据全为脱敏模拟数据
1.前言
项目中需要对决策树的树图进行修改,使其看起来更方便(即对比每个节点中类别1的概率相对于根节点(基础)类别1的概率的lift提升)。此处特别感谢某团提供初始代码思路,之后的代码均基于某团思路修改
2.二分类tree 增加lift及1 rate
0. 多分类的优雅?代码
感谢业务需求,遇到了一个7分类的树图,飞速改出灵活的代码
def dot_data_with_lift(model, dot_data):
    """Append per-class lift and hit rate to the node labels of a DOT string.

    For every distinct node in ``model.tree_.value`` the integer-truncated
    value list (for example ``[8, 2]``) is searched for in ``dot_data`` and
    followed by one ``<br/>lift j = ... hitrate j = ...`` fragment per class:

    * hit rate of class ``j`` = node value of ``j`` / sum of the node's values
    * lift of class ``j``     = hit rate of ``j`` / root-node hit rate of ``j``

    Both figures are rounded to 4 decimals with ``np.around``, so ``np``
    (numpy) must be importable in the enclosing module.

    Args:
        model: Object exposing ``tree_.value`` indexed as
            ``value[node][0][class]``.
        dot_data: DOT source text, e.g. the string returned by
            ``tree.export_graphviz(model, out_file=None)``.

    Returns:
        ``dot_data`` with the annotations inserted. A value list that does
        not occur in ``dot_data`` leaves the text unchanged.
    """
    # NOTE(review): the search key assumes the DOT text prints integer class
    # counts such as "[8, 2]"; confirm this for the scikit-learn version in use.
    node_values = model.tree_.value
    root_counts = list(node_values[0][0])
    n_classes = len(root_counts)
    root_total = sum(root_counts)
    # Root hit rates are loop-invariant, so compute them once.
    root_rates = [count / root_total for count in root_counts]

    seen_keys = set()
    for node in node_values:
        raw_counts = node[0]
        # Truncate to int so the key can match the text printed in the DOT data.
        key = str([int(c) for c in raw_counts])
        if key in seen_keys:
            # str.replace already annotated every occurrence of this key.
            continue
        seen_keys.add(key)

        # Use the raw (untruncated) values for the denominator so numerator
        # and denominator are consistent and the hit rates sum to 1.
        node_total = sum(raw_counts)
        fragments = []
        for j in range(n_classes):
            hit_rate = raw_counts[j] / node_total
            lift = hit_rate / root_rates[j]
            fragments.append(
                "<br/>lift {0} = {1} hitrate {0} = {2}".format(
                    j,
                    str(np.around(lift, decimals=4)),
                    str(np.around(hit_rate, decimals=4)),
                )
            )
        # All fragments of one node are joined and inserted in a single replace.
        dot_data = dot_data.replace(key, key + "".join(fragments))
    return dot_data
1. 画图code
核心思路是替换export_graphviz输出的dot文本中各节点的value文本,在其后加入新计算的lift和类别1的比例,以下改编自某团code
def dot_data_with_lift(model, dot_data):
    """Append lift and class-1 rate to the node labels of a binary-tree DOT string.

    For every distinct node in ``model.tree_.value`` the text
    ``[int(value0), int(value1)]`` is searched for in ``dot_data`` and
    followed by ``<br/>lift = ...<br/>1_rate = ...`` where

    * 1_rate = value1 / (value1 + value0) for the node
    * lift   = node 1_rate / root-node 1_rate

    Args:
        model: Object exposing ``tree_.value`` indexed as
            ``value[node][0][class]`` with classes 0 and 1.
        dot_data: DOT source text, e.g. the string returned by
            ``tree.export_graphviz(model, out_file=None)``.

    Returns:
        ``dot_data`` with the annotations inserted. A value list that does
        not occur in ``dot_data`` leaves the text unchanged.
    """
    node_values = model.tree_.value
    root = node_values[0][0]
    # Class-1 rate of the root node: the baseline every node is compared to.
    base_rate = root[1] / (root[1] + root[0])

    seen_keys = set()
    for node in node_values:
        neg, pos = node[0][0], node[0][1]
        # Truncate to int so the key can match the text printed in the DOT data.
        key = str([int(neg), int(pos)])
        if key in seen_keys:
            # str.replace already annotated every occurrence of this key.
            continue
        seen_keys.add(key)

        # hit_rate: class-1 rate of this node; lift: that rate over the root's.
        hit_rate = pos / (pos + neg)
        lift = hit_rate / base_rate
        dot_data = dot_data.replace(
            key,
            key + "<br/>lift = " + str(lift) + "<br/>1_rate = " + str(hit_rate),
        )
    return dot_data
### Usage: load the saved tree, export it to DOT, annotate it, render a PNG
png_name = 'DecisionTree_r1.png'
model = joblib.load(out_put)
# Plain graphviz export of the tree; out_file=None returns the DOT text.
raw_dot = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=variable_list,
    filled=True,
    rounded=True,
    special_characters=True,
)
# Pass the DOT text through dot_data_with_lift before building the graph.
dot_data = dot_data_with_lift(model, raw_dot)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png(png_name)
2. 效果
本文所使用数据全为脱敏模拟数据
可以看到每个节点的lift和1的比例
3.多分类tree 增加lift及1 rate
def dot_data_with_lift(model, dot_data):
    """Append lift and rate of every non-zero class to multi-class DOT labels.

    For every distinct node in ``model.tree_.value`` the integer-truncated
    value list (for example ``[4, 4, 2]``) is searched for in ``dot_data``
    and followed, for each class ``k >= 1``, by
    ``<br/>class{k}_lift = ...<br/>class{k}_1概率 = ...`` where

    * rate of class ``k`` = node value of ``k`` / sum of the node's values
    * lift of class ``k`` = node rate of ``k`` / root-node rate of ``k``

    Class 0 is used only in the denominators. The number of classes is taken
    from the root node, so a 3-class tree gets ``class1`` and ``class2``
    annotations and trees with other class counts are handled the same way.
    Figures are rounded to 4 decimals with ``np.around``, so ``np`` (numpy)
    must be importable in the enclosing module.

    Args:
        model: Object exposing ``tree_.value`` indexed as
            ``value[node][0][class]``.
        dot_data: DOT source text, e.g. the string returned by
            ``tree.export_graphviz(model, out_file=None)``.

    Returns:
        ``dot_data`` with the annotations inserted. A value list that does
        not occur in ``dot_data`` leaves the text unchanged.
    """
    node_values = model.tree_.value
    root_counts = list(node_values[0][0])
    n_classes = len(root_counts)
    root_total = sum(root_counts)
    # Root rates are the baseline of every lift and are loop-invariant.
    base_rates = [count / root_total for count in root_counts]

    seen_keys = set()
    for node in node_values:
        counts = node[0]
        # Truncate to int so the key can match the text printed in the DOT data.
        key = str([int(c) for c in counts])
        if key in seen_keys:
            # str.replace already annotated every occurrence of this key.
            continue
        seen_keys.add(key)

        node_total = sum(counts)
        fragments = []
        # Class 0 is the reference class and gets no annotation of its own.
        for cls in range(1, n_classes):
            hit_rate = counts[cls] / node_total
            lift = hit_rate / base_rates[cls]
            fragments.append(
                "<br/>class" + str(cls) + "_lift = "
                + str(np.around(lift, decimals=4))
                + "<br/>class" + str(cls) + "_1概率 = "
                + str(np.around(hit_rate, decimals=4))
            )
        dot_data = dot_data.replace(key, key + "".join(fragments))
    return dot_data