决策树实例-红酒数据集
无参数模型
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the wine dataset and inspect it before splitting.
wine = load_wine()

# Combine features and target into one DataFrame with readable column
# names (the original concat left anonymous 0..12 integer columns,
# which made the printed preview hard to interpret).
df1 = pd.concat(
    [
        pd.DataFrame(wine.data, columns=wine.feature_names),
        pd.DataFrame(wine.target, columns=['target']),
    ],
    axis=1,
)
print(df1)
# Feature names of the dataset
print(wine.feature_names)
# Class label names
print(wine.target_names)
# Split into 70% train / 30% test. Fix random_state so the split — and
# therefore every accuracy score printed later in this script — is
# reproducible between runs (the model cells below already pin
# random_state=30 for the same reason).
x_train, x_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=30)
print(x_train.shape)
print(wine.data.shape)
# Instantiate an unpruned decision tree using the entropy
# (information-gain) criterion, fit it, and report test accuracy.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)  # fit() returns self, no need to rebind
score = clf.score(x_test, y_test)
print(score)
# Human-readable (Chinese) names for the 13 wine features, used to
# label the nodes of the exported tree diagram.
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调',
                'od280/od315稀释葡萄酒', '脯氨酸']
import graphviz

# Export the fitted tree to DOT source and render it to a file named
# 'tree' (plus the rendered output graphviz produces alongside it).
dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=['琴酒', '雪莉', '贝尔摩德'],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.render('tree')

# Importance of each feature in the fitted tree, raw and paired with
# its readable name.
print(clf.feature_importances_)
print([*zip(feature_name, clf.feature_importances_)])
参数选择
random_state
# Same entropy tree, but with random_state fixed so the (otherwise
# randomized) tie-breaking is deterministic and the score reproducible.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=30)
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)
splitter
# splitter accepts two values:
#   'best'   - prefer the more important features when choosing a split
#   'random' - choose split candidates at random; the tree tends to be
#              deeper, which can help against overfitting
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=30,
    splitter='random',
)
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)
# Re-export the tree trained with splitter='random', labelled with the
# same Chinese feature/class names. NOTE: this renders to the same
# 'tree' filename as the earlier export and overwrites it.
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调',
                'od280/od315稀释葡萄酒', '脯氨酸']
import graphviz

dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=['琴酒', '雪莉', '贝尔摩德'],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.render('tree')
剪枝策略
max_depth:限制深度
min_samples_leaf:每个叶子节点包含的最少样本数
min_samples_split:节点至少包含多少个训练样本才允许分支
# Pruned tree: cap the depth at 3, require at least 10 samples in every
# leaf, and at least 10 samples in a node before it may be split.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=30,
    splitter='random',
    max_depth=3,
    min_samples_leaf=10,
    min_samples_split=10,
)
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)
超参数学习曲线
通过图像更加直观反映max_depth对结果的影响,确定调参区间
import matplotlib.pyplot as plt

# Train one tree per max_depth in 1..10 and record each test accuracy,
# then plot the curve to see how depth affects the score and pick a
# tuning range. Keep binding `clf`/`score` — the cells below reuse the
# final (depth-10) classifier.
test = []
for depth in range(1, 11):
    clf = tree.DecisionTreeClassifier(
        criterion='entropy',
        max_depth=depth,
        random_state=30,
        splitter='random',
    )
    clf = clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    test.append(score)
plt.plot(range(1, 11), test, color='r', label='max_depth')
plt.legend()
plt.show()
# apply(): index of the leaf node each test sample ends up in.
leaf_indices = clf.apply(x_test)
print(leaf_indices)
# predict(): predicted class for each test sample.
predictions = clf.predict(x_test)
print(predictions)
参考
https://www.bilibili.com/video/BV1sb411c7S6