sklearn-01: DT record

This post walks through building a decision tree with Python's scikit-learn library: loading and exploring the data, splitting it into training and test sets, fitting a model, and evaluating and tuning it. Adjusting parameters such as using entropy as the splitting criterion, random splitting, and limiting the maximum depth improves predictive accuracy. It also shows how to visualize the tree and inspect feature importances, and finishes by applying decision trees to a regression problem, demonstrating cross-validation on a regression model.
# Build a tree
# 1. Import the required libraries and modules
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
# 2. Explore the data
wine=load_wine()
wine.data
wine.data.shape
wine.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])
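
Before splitting anything, it helps to check how balanced the three classes are; a minimal sketch (not in the original notebook):

import numpy as np
# Count how many samples fall into each of the three classes
np.unique(wine.target, return_counts=True) # expected: (array([0, 1, 2]), array([59, 71, 48]))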
# If wine were a single table, it would look like this:
import pandas as pd
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
        0     1     2     3      4     5     6     7     8      9    10    11      12   0
0   14.23  1.71  2.43  15.6  127.0  2.80  3.06  0.28  2.29   5.64  1.04  3.92  1065.0   0
1   13.20  1.78  2.14  11.2  100.0  2.65  2.76  0.26  1.28   4.38  1.05  3.40  1050.0   0
2   13.16  2.36  2.67  18.6  101.0  2.80  3.24  0.30  2.81   5.68  1.03  3.17  1185.0   0
3   14.37  1.95  2.50  16.8  113.0  3.85  3.49  0.24  2.18   7.80  0.86  3.45  1480.0   0
4   13.24  2.59  2.87  21.0  118.0  2.80  2.69  0.39  1.82   4.32  1.04  2.93   735.0   0
..    ...   ...   ...   ...    ...   ...   ...   ...   ...    ...   ...   ...     ...  ..
173 13.71  5.65  2.45  20.5   95.0  1.68  0.61  0.52  1.06   7.70  0.64  1.74   740.0   2
174 13.40  3.91  2.48  23.0  102.0  1.80  0.75  0.43  1.41   7.30  0.70  1.56   750.0   2
175 13.27  4.28  2.26  20.0  120.0  1.59  0.69  0.43  1.35  10.20  0.59  1.56   835.0   2
176 13.17  2.59  2.37  20.0  120.0  1.65  0.68  0.53  1.46   9.30  0.60  1.62   840.0   2
177 14.13  4.10  2.74  24.5   96.0  2.05  0.76  0.56  1.35   9.20  0.61  1.60   560.0   2

178 rows × 14 columns

wine.feature_names
['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']
wine.target_names
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
# 3. Split into training and test sets
Xtrain,Xtest,Ytrain,Ytest=train_test_split(wine.data,wine.target,test_size=0.3) # test_size=0.3: 30% becomes the test set, the remaining 70% the training set
Xtrain.shape
(124, 13)
Xtest.shape
(54, 13)
Ytrain
array([2, 1, 0, 0, 2, 1, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 0, 1, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 1, 2, 0, 0, 2, 1, 0, 2, 2,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 2, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0,
       1, 2, 2, 1, 2, 1, 0, 0, 2, 1, 1, 2, 2, 0, 1, 0, 2, 0, 0, 1, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1, 0, 2, 1, 2, 2, 0, 1, 1, 0,
       2, 0, 2, 0, 2, 1, 0, 1, 1, 1, 0, 0, 2, 1])
# 4. Build the model
clf=tree.DecisionTreeClassifier(criterion="entropy")
clf=clf.fit(Xtrain,Ytrain)
score=clf.score(Xtest,Ytest) # returns the prediction accuracy on the test set

score
0.8703703703703703
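
This score drifts from run to run because train_test_split reshuffles the data on every call; pinning random_state makes the experiment repeatable. A minimal sketch (random_state=420 is an arbitrary choice, not from the original):

Xtrain,Xtest,Ytrain,Ytest=train_test_split(wine.data,wine.target
                                           ,test_size=0.3
                                           ,random_state=420) # fix the shuffle so the split, and hence the score, is reproducible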
# 5. Draw the tree
feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸'] # Chinese display names for the 13 features, in the same order as wine.feature_names
import graphviz
dot_data=tree.export_graphviz(clf
                              ,feature_names=feature_name
                              ,class_names=["琴酒","雪莉","贝尔摩德"] # arbitrary display names for the three classes ("Gin", "Sherry", "Vermouth")
                              ,filled=True
                              ,rounded=True
                             )
graph=graphviz.Source(dot_data)
graph

(figure: the fitted decision tree rendered by graphviz)
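
If graphviz isn't installed, scikit-learn's own tree.plot_tree draws the same tree with matplotlib alone; a minimal sketch:

import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
tree.plot_tree(clf
               ,feature_names=feature_name
               ,class_names=["琴酒","雪莉","贝尔摩德"]
               ,filled=True
               ,rounded=True
              )
plt.show()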

# 6. Explore the tree
# Feature importances
clf.feature_importances_
array([0.02111645, 0.        , 0.        , 0.01419846, 0.        ,
       0.        , 0.44269737, 0.        , 0.        , 0.10895451,
       0.03548785, 0.        , 0.37754535])
[*zip(feature_name,clf.feature_importances_)]
[('酒精', 0.021116452863815985),
 ('苹果酸', 0.0),
 ('灰', 0.0),
 ('灰的碱性', 0.014198463898749614),
 ('镁', 0.0),
 ('总酚', 0.0),
 ('类黄酮', 0.44269736563613815),
 ('非黄烷类酚类', 0.0),
 ('花青素', 0.0),
 ('颜色强度', 0.10895451445562636),
 ('色调', 0.03548785169541032),
 ('od280/od315稀释葡萄酒', 0.0),
 ('脯氨酸', 0.37754535145025947)]
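
Sorting the pairs makes the ranking easier to read: 类黄酮 (flavanoids) and 脯氨酸 (proline) dominate this tree. A small sketch:

# Rank features by importance, most informative first
sorted(zip(feature_name,clf.feature_importances_),key=lambda t:t[1],reverse=True)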
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=30)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) # returns the prediction accuracy
score
0.9444444444444444
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                    ,random_state=30
#                                     ,splitter="random"
                                    )
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) # returns the prediction accuracy
score
0.9444444444444444
import graphviz
dot_data = tree.export_graphviz(clf
                                ,feature_names= feature_name
                                ,class_names=["琴酒","雪莉","贝尔摩德"]
                                ,filled=True
                                ,rounded=True
                               )
graph = graphviz.Source(dot_data)
graph

(figure: the tree refitted with random_state=30)

# How well does our tree fit the training set?
score_train = clf.score(Xtrain, Ytrain)
score_train
1.0

A training accuracy of 1.0 against a test accuracy of 0.944 is the signature of overfitting: the fully grown tree has memorized the training set. The pruning parameters below (max_depth, and optionally min_samples_leaf / min_samples_split) rein that in.
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                    ,random_state=30
                                    ,splitter="random"
                                    ,max_depth=3
#                                      ,min_samples_leaf=10
#                                     ,min_samples_split=26
                                    )
clf = clf.fit(Xtrain, Ytrain)
dot_data = tree.export_graphviz(clf
                                ,feature_names= feature_name
                                ,class_names=["琴酒","雪莉","贝尔摩德"]
                                ,filled=True
                                ,rounded=True
                                )
graph = graphviz.Source(dot_data)
graph

(figure: the pruned tree with splitter="random" and max_depth=3)

score = clf.score(Xtest, Ytest) 
score
0.9444444444444444
score = clf.score(Xtrain, Ytrain) # returns the prediction accuracy on the training set
score
0.967741935483871
import matplotlib.pyplot as plt
test = []
for i in range(10):
    clf = tree.DecisionTreeClassifier(max_depth=i+1
                                    ,criterion="entropy"
                                    ,random_state=30
                                    ,splitter="random"
                                    )
    clf = clf.fit(Xtrain, Ytrain)
    score = clf.score(Xtest, Ytest)
    test.append(score)
plt.plot(range(1,11),test,color="red",label="max_depth")
plt.legend()
plt.show()

(figure: test accuracy as max_depth grows from 1 to 10)
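The manual loop can also be phrased as a grid search, which cross-validates every candidate depth instead of scoring a single fixed test split; a hedged sketch (cv=5 is an arbitrary choice, not from the original):

from sklearn.model_selection import GridSearchCV
# Cross-validate max_depth 1..10 on the training set
param_grid = {"max_depth": range(1,11)}
gs = GridSearchCV(tree.DecisionTreeClassifier(criterion="entropy",random_state=30)
                  ,param_grid
                  ,cv=5)
gs = gs.fit(Xtrain, Ytrain)
gs.best_params_, gs.best_score_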

# apply returns the index of the leaf node each test sample falls into
clf.apply(Xtest)
array([13, 13, 18, 13,  7,  7, 15, 15, 23, 24, 23, 13, 15,  7,  7, 24,  7,
       23,  7, 24, 13, 24,  7,  7,  7,  7, 15, 24, 15,  7, 13, 24, 13,  7,
        7, 24,  7, 13, 24, 19, 13, 24,  7, 24, 13,  7,  7, 24, 13,  7, 24,
       13, 15, 13], dtype=int64)
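
Paired with np.unique, apply gives a quick view of how the test samples spread across the leaves; a small sketch:

import numpy as np
# Count how many test samples land in each leaf node
leaves,counts = np.unique(clf.apply(Xtest),return_counts=True)
dict(zip(leaves,counts))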
# predict returns the predicted class (or, for a regressor, the predicted value) for each test sample
clf.predict(Xtest)
array([1, 1, 0, 1, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 2, 0, 2, 0, 2, 0, 1, 0,
       2, 2, 2, 2, 1, 0, 1, 2, 1, 0, 1, 2, 2, 0, 2, 1, 0, 1, 1, 0, 2, 0,
       1, 2, 2, 0, 1, 2, 0, 1, 1, 1])
from sklearn.datasets import load_boston # switching datasets: regression needs a continuous target, while the wine labels above are categorical
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
boston = load_boston()
boston.data
array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])
boston.target
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
       17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
       25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
       23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
       32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
       34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
       20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
       26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
       31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
       22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
       42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
       36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
       32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
       20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
       20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
       22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
       21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
       19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
       32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
       18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
       16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
       13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
        7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
       12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
       27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
        8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
        9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
       10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
       15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
       19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
       29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
       20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
       23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
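
A caveat for newer environments: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 over ethical concerns about the dataset. On recent versions the same cross-validation demo runs on any regression dataset; a sketch using fetch_california_housing (a substitution, not what this notebook ran):

from sklearn.datasets import fetch_california_housing
# Drop-in regression data for scikit-learn >= 1.2, where load_boston no longer exists
housing = fetch_california_housing()
X, y = housing.data, housing.target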
regressor = DecisionTreeRegressor(random_state=0) # instantiate
# cross-validation with cross_val_score:
cross_val_score(regressor, boston.data, boston.target, cv=10
                ,scoring = "neg_mean_squared_error"
               )
# arg 1: the estimator, i.e. any instantiated model (classifier, regressor, etc.)
# arg 2: the full feature matrix; no train/test split is needed
# arg 3: the full label vector
# arg 4: cv=10 splits the data into 10 folds, each fold taking one turn as the test set while the rest train (cv=5 is the usual choice)
# arg 5: the scoring metric, here "neg_mean_squared_error"
array([-16.41568627, -10.61843137, -18.30176471, -55.36803922,
       -16.01470588, -44.70117647, -12.2148    , -91.3888    ,
       -57.764     , -36.8134    ])
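
The scores come back negative because scikit-learn scorers follow a higher-is-better convention, so MSE is reported as its negative. Negating and taking the square root recovers a per-fold RMSE; a minimal sketch:

import numpy as np
scores = cross_val_score(regressor, boston.data, boston.target, cv=10
                         ,scoring = "neg_mean_squared_error")
rmse = np.sqrt(-scores) # flip the sign, then square-root: RMSE per fold
rmse.mean(), rmse.std()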
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
# np.random.rand(shape) generates a random array with the given shape
rng = np.random.RandomState(1) # seeded random-number generator
X=np.sort(5 * rng.rand(80,1),axis=0) # random X values in [0, 5), sorted
y = np.sin(X).ravel() # sine curve as the target
y[::5]+=3*(0.5-rng.rand(16)) # add noise to every 5th point of the sine curve
plt.figure()
plt.scatter(X, y, s=20, edgecolor="black",c="darkorange", label="data")
<matplotlib.collections.PathCollection at 0x1832feee8c8>

(figure: scatter plot of the noisy sine training data)

# Understanding ravel(), which flattens an array to 1-D
np.random.random((2,1))
np.random.random((2,1)).ravel()
np.random.random((2,1)).ravel().shape
(2,)
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(X, y)
regr_2.fit(X, y)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')
# np.arange(start, stop, step)
X_test = np.arange(0.0, 5.0, 0.01)[:,np.newaxis]
X_test.shape
(500, 1)
# Understanding np.newaxis, the slicing trick that inserts a new axis (turning shape (4,) into (4, 1))
l = np.array([1,2,3,4])
l
l.shape
l[:,np.newaxis]
l[:,np.newaxis].shape
# l[np.newaxis,:].shape
(4, 1)
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
plt.figure()
plt.scatter(X, y, s=20, edgecolor="black",c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue",label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

(figure: Decision Tree Regression; the max_depth=2 curve follows the overall sine trend while the max_depth=5 curve also chases the noise)

