决策树（分类树）——红酒数据（分类树的八个参数，一个属性，四个接口，以及绘图所用的代码）

最新推荐文章于 2024-06-23 15:36:10 发布

BSLDTH

最新推荐文章于 2024-06-23 15:36:10 发布

阅读量2.6k

点赞数 2

文章标签： python 决策树 大数据

本文链接：https://blog.csdn.net/BSLDTH/article/details/106358787

版权

#1、导入需要的算法库和模块
import pandas as pd 
import numpy as np

from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

import graphviz

# 2. Explore the data
wine = load_wine()  # load the wine dataset

# Show features and target side by side. The columns are labelled so that the
# target does not become a second column named 0 next to feature column 0.
pd.concat(
    [
        pd.DataFrame(wine.data, columns=wine.feature_names),
        pd.DataFrame(wine.target, columns=["target"]),
    ],
    axis=1,
)

# 3. Split into training and test sets (30% held out for testing)
# A fixed random_state makes the split, and therefore every score and feature
# importance computed from it, reproducible from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=420
)

# 4. Build the model
# Hyperparameters of the classification tree, gathered in one place.
tree_params = {
    "criterion": "entropy",
    "random_state": 30,
    "splitter": "random",
    "max_depth": 3,
    "min_samples_leaf": 10,
    "min_samples_split": 10,
}
clf = tree.DecisionTreeClassifier(**tree_params)  # instantiate
clf.fit(xtrain, ytrain)  # train the model; fit() returns the estimator itself
score = clf.score(xtest, ytest)  # score the fitted tree on the test split

score
>>0.8703703703703703

# 5. Draw the tree
# Display names for the 13 input features, in column order.
feature_name = [
    '酒精',
    '苹果酸',
    '灰',
    '灰的碱性',
    '镁',
    '总酚',
    '类黄酮',
    '非黄烷类酚类',
    '花青素',
    '颜色强度',
    '色调',
    'od280/od315稀释葡萄酒',
    '脯氨酸',
]

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_name,
    class_names=["琴酒", "雪莉", "贝尔摩德"],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph

# 6. Explore the decision tree

# Feature importances of the fitted tree
clf.feature_importances_
>>array([0.23823264, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.60075019, 0.00337075, 0.        , 0.        ,
       0.        , 0.09889766, 0.05874876])

list(zip(feature_name, clf.feature_importances_))  # pair each feature name with its importance
>>[('酒精', 0.3054576317612548),
 ('苹果酸', 0.0),
 ('灰', 0.0),
 ('灰的碱性', 0.0),
 ('镁', 0.0),
 ('总酚', 0.0),
 ('类黄酮', 0.5105678404764971),
 ('非黄烷类酚类', 0.00040120124488055394),
 ('花青素', 0.0),
 ('颜色强度', 0.0),
 ('色调', 0.0),
 ('od280/od315稀释葡萄酒', 0.15597190486406962),
 ('脯氨酸', 0.027601421653297924)]

# 7. Find the best pruning parameter (max_depth)
import matplotlib.pyplot as plt

# Fit one tree per candidate depth and record its score on the test split.
# The loop uses its own names so the model and score from step 4 (`clf`,
# `score`) are not overwritten by the last tree fitted here.
depths = range(1, 11)  # single source of truth for the fits and the x-axis
test = []
for depth in depths:
    depth_clf = tree.DecisionTreeClassifier(
        max_depth=depth,
        criterion='entropy',
        random_state=30,
        splitter='random',
    )
    depth_clf.fit(xtrain, ytrain)
    test.append(depth_clf.score(xtest, ytest))

plt.plot(depths, test, color='red', label='max_depth')
plt.legend()
plt.show()
>>图

# apply returns the index of the leaf node that each test sample lands in
clf.apply(xtest)

# predict returns the classification/regression result for each test sample
clf.predict(xtest)