#1、导入需要的算法库和模块
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import graphviz
# 2. Explore the data
wine = load_wine()  # load the wine dataset
# Lay the feature matrix and the target vector side by side in one table.
# The result is not assigned, so this expression only has a visible effect
# when an interactive session echoes it.
pd.concat(
    (pd.DataFrame(wine.data), pd.DataFrame(wine.target)),
    axis="columns",
)
# 3. Split into training and test sets (30% of the samples are held out)
# random_state pins the shuffle so the split, and the scores computed from
# it, are the same on every run; without it each run draws a new split.
xtrain, xtest, ytrain, ytest = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=30,
)
# 4. Build the model
# Instantiate the classifier. The entropy criterion and the random splitter
# are selected explicitly; random_state is fixed so the fitted tree is
# repeatable for a given training set. The last three arguments restrict
# how far the tree may grow.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=30,
    splitter='random',
    max_depth=3,
    min_samples_leaf=10,
    min_samples_split=10,
)
clf = clf.fit(xtrain, ytrain)  # train the model
score = clf.score(xtest, ytest)  # evaluate on the held-out test set
score
# Output recorded from one run (the value depends on the train/test split):
# >>0.8703703703703703
# 5. Draw the tree
# Display names for the features; passed to export_graphviz as feature_names.
feature_name = [
    '酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
    '非黄烷类酚类', '花青素', '颜色强度', '色调',
    'od280/od315稀释葡萄酒', '脯氨酸',
]
# out_file=None makes export_graphviz return the DOT source instead of
# writing it to a file.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    class_names=["琴酒", "雪莉", "贝尔摩德"],
    feature_names=feature_name,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)
graph
# 6. Explore the decision tree
# Feature importances of the fitted tree
clf.feature_importances_
# Output recorded from one run:
# >>array([0.23823264, 0. , 0. , 0. , 0. ,
# 0. , 0.60075019, 0.00337075, 0. , 0. ,
# 0. , 0.09889766, 0.05874876])

# Pair every feature name with its importance.
list(zip(feature_name, clf.feature_importances_))
# Output recorded from a different run (the values do not match the array
# above, e.g. 0.305... vs 0.238... for the first feature):
# >>[('酒精', 0.3054576317612548),
# ('苹果酸', 0.0),
# ('灰', 0.0),
# ('灰的碱性', 0.0),
# ('镁', 0.0),
# ('总酚', 0.0),
# ('类黄酮', 0.5105678404764971),
# ('非黄烷类酚类', 0.00040120124488055394),
# ('花青素', 0.0),
# ('颜色强度', 0.0),
# ('色调', 0.0),
# ('od280/od315稀释葡萄酒', 0.15597190486406962),
# ('脯氨酸', 0.027601421653297924)]
# 7. Find the best pruning parameter
import matplotlib.pyplot as plt

# Test-set score for max_depth = 1..10, stored in that order.
test = []
for depth in range(1, 11):
    # NOTE: clf and score are rebound on every pass, so once the loop ends
    # clf is the max_depth=10 tree; the apply/predict calls below use it.
    clf = tree.DecisionTreeClassifier(
        max_depth=depth,
        criterion='entropy',
        random_state=30,
        splitter='random',
    )
    clf = clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    test.append(score)

# Plot the score against max_depth (x axis 1..10).
plt.plot(range(1, 11), test, color='red', label='max_depth')
plt.legend()
plt.show()
# Output: a figure.

# apply returns, for each test sample, the index of the leaf node it falls in
clf.apply(xtest)
# predict returns the classification/regression result for each test sample
clf.predict(xtest)