决策树建模流程
一:数据处理
- 先划分数据集
- 缺失值填充
- 0/1转码
- 虚拟变量处理
- 相关性分析:对比特征之间的相关性,删除相关性过高的部分特征
- 等等
二:建模
# Baseline model: a decision tree with a fixed seed and balanced class
# weights, scored by cross-validation on the training split.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(class_weight='balanced', random_state=420)
# No `cv` / `scoring` is passed, so scikit-learn's defaults apply.
cvs = cross_val_score(clf, X=Xtrain, y=Ytrain)
三:网格搜索找最优参数
# Exhaustive grid search over the tree's split strategy, impurity criterion
# and maximum depth, using 5-fold cross-validated ROC AUC as the score.
from sklearn.model_selection import GridSearchCV

param_test = {
    'splitter': ('best', 'random'),
    'criterion': ('gini', 'entropy'),
    'max_depth': range(3, 15),  # candidate depths 3..14
}
# The former `iid=False` argument is intentionally omitted: `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, where passing it raises
# TypeError. Current releases always average the per-fold scores, which is
# what `iid=False` requested.
gsearch = GridSearchCV(
    estimator=clf,
    param_grid=param_test,
    scoring='roc_auc',
    n_jobs=-1,  # use all available CPU cores
    cv=5,
    verbose=2,
)
gsearch.fit(Xtrain, Ytrain)
# Mean cross-validated ROC AUC of the best parameter combination.
gsearch.best_score_
输出最优参数
# Parameter combination selected by the grid search (shown as notebook output).
gsearch.best_params_
四:模型评估
# Evaluate the refitted best estimator on the held-out test set.
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve

# NOTE(review): the search was fitted on `Xtrain` but prediction uses
# `Xtest_05` -- confirm both went through the same preprocessing.
y_pre = gsearch.predict(Xtest_05)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but passing the predictions first swaps precision and recall.
accuracy_score(Ytest, y_pre)
precision_score(Ytest, y_pre)
recall_score(Ytest, y_pre)
画ROC曲线
# roc_curve expects (y_true, y_score). Use the predicted probability of the
# positive class as the score: hard 0/1 predictions yield only a single
# operating point instead of a real curve.
y_score = gsearch.predict_proba(Xtest_05)[:, 1]
fpr, tpr, thresholds = roc_curve(Ytest, y_score)
import matplotlib.pyplot as plt
plt.plot(fpr, tpr, c='b', label='roc曲线')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], c='r', ls='--')
# Without a legend the `label` given above is never displayed.
plt.legend()
五:输出规则
# Fit a tree with fixed hyper-parameters and render it with Graphviz so the
# decision rules can be read off the diagram.
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): these values are hard-coded -- presumably copied from
# `gsearch.best_params_`; confirm they still match the search result.
# NOTE(review): unlike the searched estimator, no `class_weight` or
# `random_state` is set here -- confirm that is intended.
clf = DecisionTreeClassifier(
    splitter='best',
    criterion='entropy',
    max_depth=6,
)
clf = clf.fit(Xtrain_05, Ytrain)

features = Xtrain_05.columns
dot_data = tree.export_graphviz(
    clf,
    class_names=['Not Buy', 'Buy'],
    feature_names=features,
    leaves_parallel=False,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)
# Bare expression: displays the rendered tree when run in a notebook cell.
graph
六:把图像的 SVG 标签复制到 html 文件中,再使用浏览器打开,最后通过打印可以得到 pdf 文件