A Summary of Decision Trees and Random Forests

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import time

Decision trees (using the wine dataset as the example)

1.1 criterion

  • Turning a data table into a tree requires finding the best node and the best split at each step. For classification trees, the metric that measures "best" is called impurity.
  • The criterion parameter decides how impurity is computed:
    • entropy (information entropy)
    • gini (Gini impurity)
  • Difference: compared with the Gini index, information entropy is more sensitive to impurity and penalizes it more heavily. In practice, though, the two give essentially the same results. Entropy is somewhat slower to compute because, unlike the Gini index, it involves a logarithm. A minimal sketch comparing the two measures follows this list.
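To make the difference concrete, here is a minimal sketch (my own addition; the helper names entropy and gini are hypothetical) computing both measures on a toy class distribution:

def entropy(p):
    p = p[p > 0]                   # drop zero-probability classes to avoid log(0)
    return -np.sum(p * np.log2(p))

def gini(p):
    return 1 - np.sum(p ** 2)

p = np.array([0.5, 0.5])           # a maximally impure binary node
entropy(p), gini(p)                # (1.0, 0.5): entropy scores the impurity higher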
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine=load_wine()
# the dataset keeps data and target separate by default; join them with pd.concat to inspect the structure
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
(output: the joined DataFrame, 178 rows × 14 columns; the 13 feature columns followed by the target column)

xtrain,xtest,ytrain,ytest=train_test_split(wine.data,wine.target,test_size=0.3)
xtrain.shape
(124, 13)

1.1.1 criterion="entropy"

clf=tree.DecisionTreeClassifier(criterion="entropy")
clf=clf.fit(xtrain,ytrain)
score=clf.score(xtest,ytest)
score
0.9259259259259259

1.1.2 criterion="gini"

# gini is the default when criterion is not specified
clf=tree.DecisionTreeClassifier(criterion="gini")
clf=clf.fit(xtrain,ytrain)
score=clf.score(xtest,ytest)
score
0.9259259259259259

1.2 Drawing the tree

  • Use tree.export_graphviz
# list the feature names so the plotted tree is easier to read
import graphviz
feature_name = ['alcohol','malic acid','ash','alcalinity of ash','magnesium','total phenols','flavanoids','nonflavanoid phenols','proanthocyanins','color intensity','hue','od280/od315 of diluted wines','proline']
dot_data=tree.export_graphviz(clf
                               ,out_file=None
                               ,feature_names=feature_name # without this, nodes are labeled X[0], X[1], ...
                               ,class_names=["gin","sherry","vermouth"]
                               ,filled=True #fill nodes with class colors
                               ,rounded=True #rounded corners
 )
graph=graphviz.Source(dot_data)
graph

[figure: the rendered decision tree]

1.3 Exploring the tree

# feature_importances_ measures how much each feature contributed to building the tree; larger means more influential
clf.feature_importances_
array([0.41133413, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.38205108, 0.02401924, 0.        , 0.        ,
       0.        , 0.05485876, 0.12773679])
[*zip(feature_name,clf.feature_importances_)]
[('alcohol', 0.4113341349496296),
 ('malic acid', 0.0),
 ('ash', 0.0),
 ('alcalinity of ash', 0.0),
 ('magnesium', 0.0),
 ('total phenols', 0.0),
 ('flavanoids', 0.3820510756901232),
 ('nonflavanoid phenols', 0.024019241220117216),
 ('proanthocyanins', 0.0),
 ('color intensity', 0.0),
 ('hue', 0.0),
 ('od280/od315 of diluted wines', 0.05485876081137878),
 ('proline', 0.12773678732875127)]

1.4 random_state & splitter

  • random_state sets the seed for the randomness used when branching; it defaults to None. The randomness shows up more clearly in high dimensions, while on low-dimensional data (e.g. the iris dataset) it is barely visible. Passing any fixed integer grows the same tree every time, stabilizing the model.
  • splitter also controls the randomness inside the tree and takes two values. With "best", the tree is still random when branching but preferentially splits on the more important features (importance is available through the feature_importances_ attribute). With "random", branching becomes more random: the tree grows deeper and larger because it takes in more unnecessary information, and that extra noise lowers its fit to the training set. This makes it one way to prevent overfitting: if you expect your model to overfit, these two parameters can reduce that risk while the tree is being built. Once the tree is built, pruning parameters remain the main defense against overfitting. A sketch with splitter="random" follows the tree plot below.
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=30) 
clf = clf.fit(xtrain, ytrain) 
score = clf.score(xtest, ytest) # prediction accuracy on the test set

score

0.9444444444444444
dot_data=tree.export_graphviz(clf
                               ,out_file=None
                               ,feature_names=feature_name
                               ,class_names=["gin","sherry","vermouth"]
                               ,filled=True #fill nodes with class colors
                               ,rounded=True #rounded corners
 )
graph=graphviz.Source(dot_data)
graph

[figure: the tree grown with random_state=30]
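As promised above, a small sketch (my own addition, reusing the same split) of splitter="random"; the score will differ from the "best" splitter, and the tree typically grows deeper:

clf_rand = tree.DecisionTreeClassifier(criterion="entropy"
                                       ,random_state=30
                                       ,splitter="random")
clf_rand = clf_rand.fit(xtrain, ytrain)
clf_rand.score(xtest, ytest)  # compare with the "best"-splitter score above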

1.5 Pruning parameters

  • max_depth
  • Limits the maximum depth of the tree; any branch beyond the set depth is pruned away.
  • min_samples_leaf
  • After a split, every child node must contain at least min_samples_leaf training samples; otherwise the split does not happen, or the split is steered so that each child ends up with at least min_samples_leaf samples.
  • min_samples_split
  • A node must contain at least min_samples_split training samples before it is allowed to split; otherwise no split takes place.
clf=tree.DecisionTreeClassifier(criterion="gini"
                                ,random_state=32
                                ,max_depth=3
                                ,min_samples_leaf=10
                                ,min_samples_split=10
)
clf = clf.fit(xtrain, ytrain) 
score = clf.score(xtest, ytest) # prediction accuracy on the test set

score

0.8333333333333334
dot_data=tree.export_graphviz(clf
                               ,out_file=None
                               ,feature_names=feature_name
                               ,class_names=["gin","sherry","vermouth"]
                               ,filled=True #fill nodes with class colors
                               ,rounded=True #rounded corners
 )
graph=graphviz.Source(dot_data)
graph

[figure: the pruned tree]

clf.score(xtrain,ytrain)
0.9435483870967742
clf.score(xtest,ytest)
0.8333333333333334
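The gap between the training and test scores hints at under- or over-fitting with these hand-picked values. A common next step, sketched here as my own addition before the full grid search below, is to sweep a single pruning parameter and plot the resulting learning curve:

test_scores = []
for depth in range(1, 11):
    clf_d = tree.DecisionTreeClassifier(criterion="gini"
                                        ,random_state=32
                                        ,max_depth=depth)
    clf_d = clf_d.fit(xtrain, ytrain)
    test_scores.append(clf_d.score(xtest, ytest))  # test accuracy at each depth
plt.plot(range(1, 11), test_scores)
plt.xlabel("max_depth")
plt.ylabel("test accuracy")
plt.show()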

1.6 Tuning parameters with grid search

parameters={
            "splitter":("best","random")
           ,"criterion":("gini","entropy")
           ,"max_depth":[*range(2,6)]
           ,'min_samples_leaf':[*range(1,50,5)] 
           , 'min_impurity_decrease':[*np.linspace(0,0.5,20)]
}
clf=DecisionTreeClassifier(random_state=25)
gs=GridSearchCV(clf,parameters,cv=10)
gs.fit(xtrain,ytrain)
(output: the fitted GridSearchCV object; its repr echoes cv=10, the DecisionTreeClassifier(random_state=25) estimator, and the param_grid defined above)
gs.best_params_
{'criterion': 'gini',
 'max_depth': 4,
 'min_impurity_decrease': 0.02631578947368421,
 'min_samples_leaf': 1,
 'splitter': 'best'}
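Since GridSearchCV refits the best model by default (refit=True), it also exposes the best cross-validated score and an already-fitted estimator, so retyping the parameters as done below is optional:

gs.best_score_      # mean CV accuracy of the best parameter combination
gs.best_estimator_  # a DecisionTreeClassifier already refit on xtrain with best_params_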

clf=tree.DecisionTreeClassifier(
    criterion="gini"
    ,max_depth=4
    ,min_impurity_decrease=0.02631578947368421
    ,min_samples_leaf=1
    ,splitter='best'
    ,random_state=25
)
clf = clf.fit(xtrain, ytrain) 
score = clf.score(xtest, ytest) # prediction accuracy on the test set

score
0.9444444444444444
dot_data=tree.export_graphviz(clf
                               ,out_file=None
                               ,feature_names=feature_name
                               ,class_names=["gin","sherry","vermouth"]
                               ,filled=True #fill nodes with class colors
                               ,rounded=True #rounded corners
 )
graph=graphviz.Source(dot_data)
graph

[figure: the tree built with the grid-search parameters]

Random forests (RandomForestClassifier)

  • Random forest is the archetypal Bagging ensemble: all of its base estimators are decision trees. A forest made of classification trees is a random forest classifier; a forest made of regression trees is a random forest regressor. A quick single-tree-versus-forest comparison sketch follows.
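As a quick illustration (my own sketch, with random_state fixed so the comparison is repeatable), cross-validating a single tree against a forest on the wine data usually shows the ensemble ahead:

tree_mean = cross_val_score(DecisionTreeClassifier(random_state=0)
                            ,wine.data, wine.target, cv=10).mean()
forest_mean = cross_val_score(RandomForestClassifier(n_estimators=25, random_state=0)
                              ,wine.data, wine.target, cv=10).mean()
print("single tree:", tree_mean, "random forest:", forest_mean)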

2.1 n_estimators (default 100)

  • This is the number of trees in the forest, i.e. the number of base estimators. Its effect on the forest's accuracy is monotone: the larger n_estimators, the better the model tends to perform. But every model has a decision boundary; once n_estimators is large enough, the forest's accuracy stops rising or starts to fluctuate. See the sweep after the example below.
wine=load_wine()
rfc=RandomForestClassifier(n_estimators=25)
score=cross_val_score(rfc,wine.data,wine.target,cv=10) # one accuracy score per fold
plt.plot(range(1,11),score)
[<matplotlib.lines.Line2D at 0x11c2f2c3a48>]

[figure: per-fold cross-validation accuracy]


score.mean()
0.9833333333333334
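To see the plateau described above, here is a sketch (my own addition; sweeping 1 to 50 trees is an arbitrary choice) that varies n_estimators and plots mean cross-validated accuracy:

means = []
for n in range(1, 51):
    rf = RandomForestClassifier(n_estimators=n, random_state=0)
    means.append(cross_val_score(rf, wine.data, wine.target, cv=10).mean())
plt.plot(range(1, 51), means)  # accuracy climbs quickly, then levels off
plt.xlabel("n_estimators")
plt.ylabel("mean CV accuracy")
plt.show()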

  • Notes based on 菜菜's sklearn course: https://live.bilibili.com/12582510