Import libraries
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
Explore the data
wine=load_wine()
wine.data.shape
(178, 13)
wine.target.shape
(178,)
type(wine.data[1])
numpy.ndarray
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)[:5]
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0 (target) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065.0 | 0 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050.0 | 0 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185.0 | 0 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480.0 | 0 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735.0 | 0 |
wine.feature_names
['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']
wine.target_names
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
Split into training and test sets
x_train,x_test,y_train,y_test=train_test_split(wine.data,wine.target,test_size=0.2)
type(x_train.shape)
tuple
Build the model
clf=tree.DecisionTreeClassifier(criterion="entropy")
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
score
0.9722222222222222
Plot the tree
import graphviz
clf
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
feature_name = ['alcohol','malic acid','ash','alkalinity of ash','magnesium','total phenols','flavanoids','nonflavanoid phenols','proanthocyanins','color intensity','hue','od280/od315 of diluted wines','proline']  # readable display names for the plots
dot_data = tree.export_graphviz(clf
,out_file=None
,feature_names= feature_name
,class_names=["Gin","Sherry","Vermouth"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
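As an alternative that avoids the Graphviz dependency, sklearn's built-in tree.plot_tree (available since sklearn 0.21) draws the same tree with matplotlib; a minimal sketch using the names defined above:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 8))
tree.plot_tree(clf
               ,feature_names=feature_name
               ,class_names=["Gin","Sherry","Vermouth"]
               ,filled=True
               ,rounded=True
               ,ax=ax)  # renders the fitted tree without needing Graphviz
plt.show()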
# Feature importances
clf.feature_importances_
array([0. , 0. , 0. , 0.01872006, 0. ,
0. , 0.4246147 , 0.01238658, 0. , 0.25103605,
0. , 0. , 0.29324261])
[*zip(feature_name,clf.feature_importances_)]
[('alcohol', 0.0),
 ('malic acid', 0.0),
 ('ash', 0.0),
 ('alkalinity of ash', 0.01872005695828305),
 ('magnesium', 0.0),
 ('total phenols', 0.0),
 ('flavanoids', 0.4246147034036727),
 ('nonflavanoid phenols', 0.012386578479639187),
 ('proanthocyanins', 0.0),
 ('color intensity', 0.25103604926413603),
 ('hue', 0.0),
 ('od280/od315 of diluted wines', 0.0),
 ('proline', 0.2932426118942691)]
However decision-tree models evolve, splitting still comes down to optimizing some impurity-related metric, and as noted, impurity is computed per node: when growing a tree, the model pursues an optimal tree by optimizing one node at a time. But can the best individual node guarantee the best tree? Ensemble algorithms were introduced to address this: since one tree cannot be guaranteed optimal, sklearn builds many different trees and takes the best among them. How can different trees be grown from the same dataset? At each split, instead of considering all the features, randomly draw a subset of them and pick the one with the best impurity-related metric as the split node. The trees grown this way then differ from one another, as the sketch below illustrates.
Author: 是菜菜和菊安酱呀
Link: https://www.jianshu.com/p/dcc7a28060b1
Source: Jianshu (简书)
Copyright belongs to the author. For commercial reproduction, contact the author for authorization; for non-commercial reproduction, credit the source.
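A minimal sketch of that idea (my addition, not from the original notebook): grow several different trees by restricting each split to a random feature subset via max_features, then keep the best scorer. Selecting by test score here is for illustration only; proper model selection would use a validation set.

best_score, best_clf = 0, None
for seed in range(10):
    clf_i = tree.DecisionTreeClassifier(criterion="entropy"
                                        ,max_features="sqrt"  # consider a random subset of features at each split
                                        ,random_state=seed)
    clf_i = clf_i.fit(x_train, y_train)
    s = clf_i.score(x_test, y_test)
    if s > best_score:
        best_score, best_clf = s, clf_i
best_score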
# random_state: random seed
# random_state sets the random pattern used when splitting; default is None.
# Its effect shows up more in high dimensions; on low-dimensional data
# (e.g. the iris dataset) the randomness is barely visible.
# Passing any fixed integer grows the same tree every time, stabilizing the model.
clf=tree.DecisionTreeClassifier(criterion="entropy",random_state=4)
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
score
0.9166666666666666
# splitter also controls randomness in the tree and takes two values.
# With "best", the tree is still somewhat random when splitting but
# prefers the more important features
# (importance can be inspected via the feature_importances_ attribute).
# With "random", splitting is more random: the tree grows deeper and larger
# because it absorbs more unnecessary information, which can
# lower the fit to the training set.
clf=tree.DecisionTreeClassifier(criterion="entropy"
,random_state=10
,splitter="random")
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
score
0.9444444444444444
clf=tree.DecisionTreeClassifier(criterion="entropy"
,random_state=10
,splitter="best")
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
score
0.9722222222222222
feature_name
['alcohol',
 'malic acid',
 'ash',
 'alkalinity of ash',
 'magnesium',
 'total phenols',
 'flavanoids',
 'nonflavanoid phenols',
 'proanthocyanins',
 'color intensity',
 'hue',
 'od280/od315 of diluted wines',
 'proline']
dot_data=tree.export_graphviz(clf
,feature_names=feature_name
,class_names=["Gin","Sherry","Vermouth"]
,filled=True
,rounded=True
)
graph=graphviz.Source(dot_data)
graph
[figure output_28_0.svg: the fitted decision tree rendered by Graphviz]
Pruning parameters

To prevent overfitting:

- max_depth
limits the maximum depth of the tree; any branch beyond the set depth is pruned away
- min_samples_leaf & min_samples_split
min_samples_leaf requires that every child node of a split contain at least min_samples_leaf training samples; otherwise the split does not happen, or it happens in whatever direction gives each child at least min_samples_leaf samples. min_samples_split requires that a node contain at least min_samples_split samples before it is allowed to split at all.
score_train = clf.score(x_train, y_train)  # training accuracy; 1.0 shows the unpruned tree fits the training set perfectly
score_train
1.0
clf=tree.DecisionTreeClassifier(criterion="entropy"
,random_state=20
,splitter="random"
,max_depth=3
,min_samples_leaf=20
,min_samples_split=10)
clf=clf.fit(x_train,y_train)
dot_data = tree.export_graphviz(clf
,feature_names= feature_name
,class_names=["Gin","Sherry","Vermouth"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
[figure output_31_0.svg: the pruned tree (max_depth=3, min_samples_leaf=20, min_samples_split=10)]
clf.score(x_train,y_train)  # evaluated but not shown: a notebook cell displays only its last expression
clf.score(x_test,y_test)
0.8888888888888888
clf=tree.DecisionTreeClassifier(criterion="entropy"
,random_state=10
,splitter="random")
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
score
0.9444444444444444
Finding the optimal pruning parameters

- A hyperparameter learning curve is a curve with the hyperparameter's value on the x-axis and a model evaluation metric on the y-axis; it is used to gauge how the model performs under different values of the hyperparameter.
import matplotlib.pyplot as plt
test=[]
for i in range(10):
    clf=tree.DecisionTreeClassifier(max_depth=i+1
                                    ,criterion="entropy"
                                    ,random_state=30
                                    ,splitter="random")
    clf=clf.fit(x_train,y_train)
    score=clf.score(x_test,y_test)
    test.append(score)
plt.plot(range(1,11),test,label="max_depth")  # a label gives the legend a handle to show
plt.legend()
<matplotlib.legend.Legend at 0x1dd1b596e88>
[figure output_35_2.png: learning curve of test accuracy vs. max_depth]
Target weight parameters

- class_weight & min_weight_fraction_leaf
Once samples carry weights, a node's size is no longer a plain record count but is affected by the input weights, so pruning must then use min_weight_fraction_leaf, the weight-based counterpart of the sample-count parameters.
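A minimal sketch of how the two pair up (illustrative values of my own, not from the original notebook): class_weight="balanced" reweights classes inversely to their frequencies, and min_weight_fraction_leaf then prunes by weighted sample fraction instead of raw counts.

clf_w = tree.DecisionTreeClassifier(criterion="entropy"
                                    ,class_weight="balanced"        # weight classes by inverse frequency
                                    ,min_weight_fraction_leaf=0.05  # each leaf must carry >= 5% of total sample weight
                                    ,random_state=10)
clf_w = clf_w.fit(x_train, y_train)
clf_w.score(x_test, y_test)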
Important attributes and interfaces

- feature_importances_ shows how important each feature is to the model
- apply returns the index of the leaf node each test sample falls into: clf.apply(x_test)
- predict returns the predicted class (or regression value) for each test sample: clf.predict(x_test)
- score returns the mean accuracy on the given test data, as demonstrated below
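A quick demonstration of the three interfaces on the split from above (a minimal sketch; exact outputs depend on the split):

clf.apply(x_test)[:5]     # leaf index each of the first five test samples lands in
clf.predict(x_test)[:5]   # predicted class label for the same samples
clf.score(x_test,y_test)  # mean accuracy on the test set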
Example: how a classification tree performs on synthetic datasets
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
# Generate random binary-classification data with make_classification
x,y=make_classification(n_samples=1000 # number of samples
                        ,n_features=2 # total number of features
                        ,n_redundant=0 # redundant features
                        ,n_informative=2 # informative features
                        ,random_state=1 # random seed
                        ,n_clusters_per_class=1 # clusters per class
                        )
plt.scatter(x[:,0],x[:,1])
<matplotlib.collections.PathCollection at 0x1dd1b5d5c08>
[figure output_40_1.png: scatter plot of the generated data]
for a,b in zip(x,y):
    if b==1:
        plt.scatter(a[0],a[1],color='red')
    else:
        plt.scatter(a[0],a[1],color='blue')
[figure output_41_0.png: samples colored red (class 1) and blue (class 0)]
x[1:5]  # peek at a few samples; assigning here would truncate x, which must stay (1000, 2) for the later cells
array([[ 1.36917601, -0.63734411],
[ 0.50231787, -0.45910529],
[ 1.83319262, -1.29808229],
[ 1.04235568, 1.12152929]])
y[1:5]
array([1, 1, 1, 0])
# color the four inspected samples by class
for a,b in zip(x[1:5],y[1:5]):
    if b==1:
        plt.scatter(a[0],a[1],color='red')
    else:
        plt.scatter(a[0],a[1],color='blue')
[figure output_44_0.png: the four inspected samples colored by class]
x.shape
(1000, 2)
import numpy as np
rng=np.random.RandomState(2)
x+=2*rng.uniform(size=x.shape)  # jitter every coordinate by a uniform value in [0, 2)
linearly_separable = (x, y)
plt.scatter(x[:,0],x[:,1])  # plt.plot would connect the unordered points with lines
[figure output_46_1.png: the data after adding noise, roughly linearly separable]
Plot the classification results of a decision tree on each of the three datasets
# Build the three datasets
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable]
import operator
from functools import reduce
from matplotlib.colors import ListedColormap
figure = plt.figure(figsize=(6, 9))
i=1
for ds_index,ds in enumerate(datasets):
    x,y=ds
    # StandardScaler standardizes the data to zero mean and unit variance
    x=StandardScaler().fit_transform(x)
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.4)
    x1_min, x1_max = x[:, 0].min() - .5, x[:, 0].max() + .5
    x2_min, x2_max = x[:, 1].min() - .5, x[:, 1].max() + .5
    arr1,arr2=np.meshgrid(np.arange(x1_min,x1_max,0.2),
                          np.arange(x2_min,x2_max,0.2))
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), 2, i)
    if ds_index ==0:
        ax.set_title("Input data")
    # plot the training set first
    ax.scatter(x_train[:,0],x_train[:,1],c=y_train,cmap=cm_bright)
    # then the test set
    # ax.scatter(x_test[:,0], x_test[:,1])
    ax.set_xlim(arr1.min(), arr1.max())
    # ax.set_ylim(arr2.min(), arr2.max())
    # ax.set_xticks(())
    ax.set_yticks(())
    i+=1
    ax=plt.subplot(len(datasets),2,i)
    clf=tree.DecisionTreeClassifier(max_depth=5)
    clf=clf.fit(x_train,y_train)
    score=clf.score(x_test,y_test)
    i+=1  # advance past the right-hand subplot before the next dataset
[figure output_49_0.png: input scatter plots for the three datasets]
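The loop above stops right after fitting each tree; the saved figure presumably went on to shade the decision regions in the right-hand subplots. A self-contained sketch of that missing step on one dataset (my reconstruction, not the author's exact code):

# fit a depth-5 tree on the moons data and shade its decision regions
x, y = make_moons(noise=0.3, random_state=0)
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.4)
clf = tree.DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)
arr1, arr2 = np.meshgrid(np.arange(x[:,0].min()-.5, x[:,0].max()+.5, 0.2),
                         np.arange(x[:,1].min()-.5, x[:,1].max()+.5, 0.2))
# probability of class 1 over the grid gives a smooth surface for contourf
z = clf.predict_proba(np.c_[arr1.ravel(), arr2.ravel()])[:, 1].reshape(arr1.shape)
plt.contourf(arr1, arr2, z, cmap=plt.cm.RdBu, alpha=.8)
plt.scatter(x_train[:,0], x_train[:,1], c=y_train,
            cmap=ListedColormap(['#FF0000','#0000FF']))
plt.show()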
- Based on 菜菜's sklearn course