继‘决策树总结’之后的sklearn_决策树

最新推荐文章于 2021-02-22 01:47:32 发布

AppleYRY

最新推荐文章于 2021-02-22 01:47:32 发布

阅读量179

点赞数

分类专栏： # 周志华西瓜书

本文链接：https://blog.csdn.net/weixin_42676175/article/details/105887810

版权

周志华西瓜书专栏收录该内容

34 篇文章 19 订阅

订阅专栏

分类算法实战：

参考 scikit-learn 官网的 API 文档：https://scikit-learn.org/stable/modules/classes.html

备注：

该笔记不完整，代码不能直接运行，仅供复习使用；以后有时间再补全代码。
在这里插入图片描述 限制树的参数：max_depth; min_samples_split;

并不是叶子节点越多越好，要合适。另外，max_leaf_nodes（最大叶子节点数）也可以设置。
自学图形化工具：graphviz

# Visualization helper
def out_image():
    """Fit a depth-3 decision tree and render it to a PNG via Graphviz.

    Reads the module-level ``x_train`` / ``y_train`` split and the ``file``
    CSV path. The image is written to the CSV path with its ``.csv`` suffix
    replaced by ``.png``.

    Requires the Graphviz ``dot`` executable on PATH (the same binary that
    pydotplus invokes when it renders an image).

    Raises:
        FileNotFoundError: if the ``dot`` executable cannot be found.
        subprocess.CalledProcessError: if ``dot`` exits with an error.
    """
    # Function-scope imports keep this snippet usable on its own.
    import subprocess

    from sklearn.tree import export_graphviz

    # Initialise the model
    clf = DecisionTreeClassifier(max_depth=3)
    # Train the model
    clf.fit(x_train, y_train)
    # With out_file=None, export_graphviz returns the DOT source as a string
    # instead of writing a .dot file.
    dot_source = export_graphviz(clf, out_file=None)
    # Pipe the DOT source to Graphviz and let it write the PNG.
    subprocess.run(
        ['dot', '-Tpng', '-o', file.replace('.csv', '.png')],
        input=dot_source.encode('utf-8'),
        check=True,
    )

根据网上资料，自己学。
注意：生成图片需要先安装 Graphviz 本体（并保证 dot 命令在 PATH 中）；若用 pydotplus 渲染，还需要安装 pydotplus，才能生成相应的文件。

如何选择算法：

在这里插入图片描述

等于0的混淆矩阵。计算命中率和误判率

伪代码：

import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_curve, auc    # confusion matrix, ROC curve, AUC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# Path of the input CSV; the functions below read this module-level name.
file = 'data.csv'
# Load the whole table as a plain array (the CSV is decoded as GBK).
dataset = pd.read_csv(file, encoding='gbk').values

# Every column except the last is passed as features, the last column as the
# target; 30% of the rows are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    dataset[:, :-1],
    dataset[:, -1],
    test_size=0.3,
)

# Parameter tuning (the simplest case): the split criterion
def getCriterion():
    """Print the train and test scores of a tree for each split criterion.

    Uses the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    """
    for name in ('gini', 'entropy'):
        tree_model = DecisionTreeClassifier(criterion=name)
        tree_model.fit(x_train, y_train)
        train_result = tree_model.score(x_train, y_train)
        print(name, 'training score', train_result)
        test_result = tree_model.score(x_test, y_test)
        print(name, 'testing score', test_result)

# Tree-depth tuning (to observe over-fitting)
def getDepth():
    """Sweep ``max_depth`` over 1..69, then print and plot train/test scores.

    Uses the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    """
    depths = range(1, 70)
    train_curve, test_curve = [], []
    for depth in depths:
        fitted = DecisionTreeClassifier(max_depth=depth).fit(x_train, y_train)
        train_curve.append(fitted.score(x_train, y_train))
        test_curve.append(fitted.score(x_test, y_test))

    print(train_curve)
    print(test_curve)

    # One line per data split, drawn in the same order as before.
    for curve, tag, mark in ((train_curve, 'train', '*'), (test_curve, 'test', 'o')):
        plt.plot(depths, curve, label=tag, marker=mark)
    plt.xlabel('max_depth')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.show()

# Minimum number of samples required to split a node
def getMinSampleSplit():
    """Sweep ``min_samples_split`` over 100..900, then print and plot scores.

    Uses the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    """
    train_score = []
    test_score = []
    min_samples_split = range(100, 1000, 100)
    for min_samples in min_samples_split:
        # Vary the parameter actually being swept; the earlier code passed an
        # undefined ``max_depth`` name here.
        model = DecisionTreeClassifier(min_samples_split=min_samples)
        model.fit(x_train, y_train)
        train_score.append(model.score(x_train, y_train))
        test_score.append(model.score(x_test, y_test))
    print(train_score)
    print(test_score)

    plt.plot(min_samples_split, train_score, label='train', marker='*')
    plt.plot(min_samples_split, test_score, label='test', marker='o')
    # Label the axis with the parameter that is plotted on it.
    plt.xlabel('min_samples_split')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.show()

# Minimum number of samples per leaf
def getMinLeaf():
    """Sweep ``min_samples_leaf`` over 50..290, then print and plot scores.

    Uses the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    """
    train_score = []
    test_score = []
    min_samples_leaf = range(50, 300, 20)
    for min_samples in min_samples_leaf:
        model = DecisionTreeClassifier(min_samples_leaf=min_samples)
        model.fit(x_train, y_train)
        train_score.append(model.score(x_train, y_train))
        test_score.append(model.score(x_test, y_test))
    print(train_score)
    print(test_score)
    plt.plot(min_samples_leaf, train_score, label='train', marker='*')
    plt.plot(min_samples_leaf, test_score, label='test', marker='o')
    # Label the axis with the parameter that is plotted on it
    # (it previously read 'max_depth').
    plt.xlabel('min_samples_leaf')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.show()

def graph_roc():
    """Fit a tree and save the test rows with labels and predicted probability.

    Uses the module-level ``x_train``, ``x_test``, ``y_train``, ``y_test``
    and the ``file`` CSV path. The resulting table is written to ``test.csv``
    only when that file does not exist yet, and is printed in either case.
    """
    model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=300)
    model.fit(x_train, y_train)
    # predict_proba returns one row per sample and one column per class, so
    # the class probability must be taken as a column ([:, 1]), not a row ([1]).
    # NOTE(review): assumes binary labels whose second class is "1" — confirm
    # against model.classes_.
    proba1 = model.predict_proba(x_test)[:, 1]
    # Only the header is needed to recover the feature column names.
    df1 = pd.read_csv(file, encoding='gbk', nrows=0)
    cols = list(df1.columns)[0:-1]
    df_test = pd.DataFrame(x_test, columns=cols)
    # Append the true label and the predicted probability as extra columns.
    df_test.loc[:, '营销是否成功'] = y_test
    df_test.loc[:, '预测为1的概率'] = proba1
    # Do not overwrite an existing export.
    if not os.path.exists('test.csv'):
        df_test.to_csv('test.csv', encoding='utf-8', index=False)
    print(df_test)

def plot_roc():
    """Fit a depth-3 tree and print its confusion matrix on the test split.

    Uses the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    The four-way unpacking below expects a 2x2 (binary) confusion matrix.
    """
    # Build and train the model
    classifier = DecisionTreeClassifier(max_depth=3)
    classifier.fit(x_train, y_train)

    # Confusion matrix of the test-set predictions
    predicted = classifier.predict(x_test)
    c_matrix = confusion_matrix(y_test, predicted)

    # Flatten the matrix into its four cells for a more readable report
    tn, fp, fn, tp = c_matrix.ravel()
    print(c_matrix)
    report = 'tn={0},fp={1},fn={2},tp={3}'
    print(report.format(tn, fp, fn, tp))
    # The notes stop here: printing the predicted probabilities for the test
    # set is not implemented.

if __name__ == '__main__':
    # Enable exactly one experiment at a time:
    # getCriterion()
    # getDepth()
    getMinLeaf()