背景
这篇文章就是一个简单的,把python回归树转成sql语言的脚本。
本文的Cart回归树脚本之前有发过。当时是想用于辅助策略分析的。在智能风控和用户增长的场景下都可以使用。后来遇到了一些场景,需要将决策树的逻辑转为SQL部署在hadoop集群上,于是尝试用递归生成了Hive SQL的脚本。
正文
⭐️首先,这里演示,生成一组数据,然后通过一个函数,生成决策树。
为了生成脚本的美观,这里就只使用一个2层的回归树,但决策树非常复杂的时候,自动脚本的优势就体现出来啦。
import pandas as pd
import sys
sys.path.append('/函数/automl/')
from sklearn.datasets import make_classification
from dt_main import auto_DecisionTreeRegressor
from Decision_utils import DTR_TO_SQL
from IPython.display import Image
# Demo: build a synthetic binary dataset, fit a shallow regression tree
# on it, and render the fitted tree as an image.
X, y = make_classification(n_samples=1000,n_features=30,n_classes=2,random_state=328)
data = pd.DataFrame(X)
data['bad_ind'] = y
# Synthetic row id used as the primary key column.
data['imei'] = [i for i in range(len(data))]
# 30 cosmetic feature names (breast-cancer style) + label + id — must
# match the 32 columns created above.
data.columns = ['f0_radius','f0_texture','f0_perimeter','f0_area','f0_smoothness',
'f0_compactness','f0_concavity','f0_concave_points','f0_symmetry',
'f0_fractal_dimension','f1_radius_error','f1_texture_error','f1_perimeter_error',
'f2_area_error','f2_smoothness_error','f2_compactness_error','f2_concavity_error',
'f2_concave_points_error','f2_symmetry_error','f2_fractal_dimension_error',
'f3_radius','f3_texture','f3_perimeter','f3_area','f3_smoothness',
'f3_compactness','f3_concavity','f3_concave_points','f3_symmetry',
'f3_fractal_dimension','bad_ind','imei']
uid, dep = "imei", "bad_ind"
var_names = list(data.columns)
var_names.remove(dep)
var_names.remove(uid)
# Variables excluded from modelling, including the label bad_ind.
ex_lis = [uid, dep]
# Fit the decision tree (chi-square binning disabled, depth capped at 4).
dct = auto_DecisionTreeRegressor(datasets = data,ex_lis = ex_lis,is_bin=False,
dep = dep,min_samples=0.01,max_depth=4,
min_samples_leaf=50,min_samples_split=50).fit_plot()
# Display the rendered PNG inline (Jupyter).
Image(dct['graph'])
⭐️下面生成Hive SQL脚本。
# Emit the Hive SQL: fitted tree model / feature names; rows matching no
# WHEN branch get the ELSE value (defaults to -1, set to 0 here).
DTR_TO_SQL(dct['dtree'],dct['feature_names'],when_else=0)
上述内容用到了两个函数,分别存在了两个py文件当中。下面来看一下这两个文件。
⭐️第一个文件dt_main.py,是回归树的主函数。内容如下。
#卡方分箱决策树
class auto_DecisionTreeRegressor(object):
    """Fit a CART regression tree, optionally on chi-square-binned
    features, and render the fitted tree with graphviz."""

    def __init__(self, datasets, ex_lis, dep='bad_ind', min_samples=0.05,
                 min_samples_leaf=200, min_samples_split=20, max_depth=4,
                 is_bin=True):
        """
        datasets: modelling data (pandas DataFrame)
        ex_lis: columns excluded from modelling, e.g. ids, time slices (list)
        dep: name of the target column (str)
        min_samples: minimum bin share of the population when binning (numeric)
        max_depth: maximum tree depth (numeric)
        min_samples_leaf: minimum samples per leaf node (numeric)
        min_samples_split: minimum samples in a node before splitting (numeric)
        is_bin: whether to apply chi-square binning first (bool)
        """
        self.datasets = datasets
        self.ex_lis = ex_lis
        self.dep = dep
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.is_bin = is_bin
        # Filled with the exported bin edges after fit_plot() when is_bin.
        self.bins = 0

    def fit_plot(self):
        """Fit the tree, draw it, and return a dict holding the binned
        frame, bin edges, combiner, PNG bytes, fitted estimator and the
        feature names used for fitting."""
        # Make the graphviz binaries reachable (author's Windows install).
        os.environ["PATH"] += os.pathsep + 'D:/Program Files/Graphviz2.38/bin'
        model = tree.DecisionTreeRegressor(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            min_samples_split=self.min_samples_split)
        features = self.datasets.drop(self.ex_lis, axis=1)
        target = self.datasets[self.dep]
        combiner = 0
        if self.is_bin:
            # Chi-square binning via toad before fitting.
            combiner = toad.transform.Combiner()
            combiner.fit(features, target, method='chi',
                         min_samples=self.min_samples)
            binned = combiner.transform(features)
            self.bins = combiner.export()
        else:
            binned = features.copy()
        model = model.fit(binned, target)
        frame = binned.copy()
        frame[self.dep] = target
        # Export the fitted tree to DOT, then render to PNG bytes.
        dot_buffer = StringIO()
        tree.export_graphviz(model, out_file=dot_buffer,
                             feature_names=binned.columns,
                             class_names=[self.dep],
                             filled=True, rounded=True,
                             special_characters=True)
        graph_obj = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
        png_bytes = graph_obj.create_png()
        return {
            'df_bin': frame,
            'bins': self.bins,
            'combiner': combiner,
            'graph': png_bytes,
            'dtree': model,
            'feature_names': binned.columns,
        }
⭐️另一个文件的脚本Decision_utils.py如下。
找到决策树的叶子节点并抽取决策路径到SQL中。因为是一个重新写的demo,如果用于公司实际生产环境,还需要在脚本中使用cast(features as float)之类的类型转换函数,请按需自行添加,这里不再展开。
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 12 21:47:02 2020
@author: meizihang
"""
import os
# BUGFIX: sklearn.externals.six was deprecated and removed in
# scikit-learn 0.23; io.StringIO is the direct py3 replacement.
from io import StringIO

import numpy as np
import pandas as pd
import pydotplus
import toad
from IPython.display import Image
from sklearn import tree
def DTR_TO_SQL(tree, feature_names, when_else=-1):
    """Translate a fitted sklearn DecisionTreeRegressor into a Hive SQL
    CASE-WHEN expression.

    Each leaf becomes one WHEN clause whose conditions are the decision
    path from the root, and whose THEN value is the leaf's prediction.

    Parameters
    ----------
    tree : fitted estimator exposing the sklearn ``tree_`` structure
    feature_names : sequence of str aligned with the fitted feature indices
    when_else : value emitted in the trailing ELSE branch (default -1)

    Returns
    -------
    str
        The full CASE-WHEN expression. It is also printed, preserving the
        original copy-paste behaviour of this helper.
    """
    import numpy as np
    tree_ = tree.tree_
    left = tree_.children_left
    right = tree_.children_right
    threshold = tree_.threshold
    # Map each node's split-feature index to its column name. Leaves carry
    # the sentinel index -2; their mapped name is never used.
    features = [feature_names[i] for i in tree_.feature]
    le = '<='
    g = '>'
    # Leaf nodes are exactly the nodes with no left child (TREE_LEAF == -1).
    leaf_idx = np.argwhere(left == -1)[:, 0]

    def _leaf_values(node=0):
        # Collect leaf predictions in preorder (left before right), which
        # matches ascending node ids and therefore the order of leaf_idx.
        # Leaf test via children_left avoids importing sklearn's private
        # _tree module (feature == TREE_UNDEFINED is equivalent).
        if left[node] == -1:
            return [tree_.value[node][0][0]]
        return _leaf_values(left[node]) + _leaf_values(right[node])

    value_lis = _leaf_values()

    def _path(child, lineage=None):
        # Walk from `child` up to the root, recording one
        # (parent, 'l'/'r', threshold, feature) tuple per edge. The
        # returned list is root-to-leaf ordered and ends with the raw
        # integer leaf id.
        if lineage is None:
            lineage = [child]
        if child in left:
            parent = np.where(left == child)[0].item()
            split = 'l'
        else:
            parent = np.where(right == child)[0].item()
            split = 'r'
        lineage.append((parent, split, threshold[parent], features[parent]))
        if parent == 0:
            lineage.reverse()
            return lineage
        return _path(parent, lineage)

    lines = ['case ']
    for j, child in enumerate(leaf_idx):
        clause = ' when '
        for node in _path(child):
            # Skip the trailing integer leaf id; only tuples are conditions.
            # BUGFIX: the original used `len(str(node)) < 3`, which fails to
            # skip node ids >= 100 and then crashes on tuple indexing.
            if not isinstance(node, tuple):
                continue
            sign = le if node[1] == 'l' else g
            clause = clause + node[3] + sign + str(node[2]) + ' and '
        # Strip the dangling 'and ' and append the leaf prediction.
        clause = clause[:-4] + ' then ' + str(value_lis[j])
        lines.append(clause)
    lines.append('else %s end as clusters' % (when_else))
    sql = '\n'.join(lines)
    print(sql)
    return sql
总结
这篇文章就是一个简单的,把python回归树转成sql语言的脚本,主要的用处就是,考虑到不是每家公司都有部署决策树的环境,方便的时候也可以使用SQL部署。同样的逻辑回归评分卡其实也可以实现类似的逻辑。后续TOAD库中也会引进的(信心满满.jpg)。
最后,不知道这里面的脚本还有没有坑,希望对你有帮助。
1011

被折叠的 条评论
为什么被折叠?



