决策树python实践

最新推荐文章于 2024-07-11 09:58:56 发布

ALWAYS_FANG

最新推荐文章于 2024-07-11 09:58:56 发布

阅读量507

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/qq_38120760/article/details/82902961

版权

机器学习专栏收录该内容

24 篇文章 0 订阅

订阅专栏

#coding=utf-8

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Configure matplotlib so the Chinese labels and the minus sign render.
mpl.rcParams.update({
    'font.sans-serif': ['simhei'],
    'axes.unicode_minus': False,
})

# Classify the iris data set with a decision tree.
# Only petal length and petal width (columns 2 and 3) are used as features.
data = load_iris()
petal_features = data.data[:, 2:4]
x_train, x_test, y_train, y_test = train_test_split(
    petal_features, data.target, test_size=0.3
)
tree_step = ("tree", DecisionTreeClassifier(criterion="entropy"))
pipe = Pipeline([tree_step])


# First, pick the best tree depth.
# Approach 1: score each depth once on the held-out test split.
candidate_depths = range(1, 15)
holdout_scores = [
    # fit() returns the pipeline itself, so score() can be chained onto it.
    pipe.set_params(tree__max_depth=depth)
        .fit(x_train, y_train)
        .score(x_test, y_test)
    for depth in candidate_depths
]

plt.xlabel(u"深度")
plt.ylabel(u"准确率")
plt.title(u"一次验证")
plt.plot(candidate_depths, holdout_scores, marker="*", ms=10)
plt.show()
# Approach 2: 10-fold cross-validation on the full petal data set.
# The tree is looked up through the public ``named_steps`` accessor rather
# than the private ``_final_estimator`` attribute; ``set_params`` below
# updates this same estimator object in place.
tree = pipe.named_steps["tree"]
petal_features = data.data[:, 2:4]

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("十则交叉验证")

cv_depths = range(1, 15)
cv_scores = []
for depth in cv_depths:
    pipe.set_params(tree__max_depth=depth)
    fold_scores = cross_val_score(tree, petal_features, data.target, cv=10)
    # Summarise each depth by the mean accuracy over the 10 folds.
    cv_scores.append(np.mean(fold_scores))

plt.xlabel(u"深度")
plt.ylabel(u"准确率")
plt.title(u"十则交叉验证")
plt.plot(cv_depths, cv_scores, marker="*", ms=10)
plt.show()

# The validation above gives a best depth of 4.
# Train the final model on all samples, using the two petal features only.
petal = data.data[:, 2:4]
pipe.set_params(tree__max_depth=4).fit(petal, data.target)

# Compute the plotted range once; it is reused for the grid and axis limits.
length_min, length_max = np.min(petal[:, 0]), np.max(petal[:, 0])
width_min, width_max = np.min(petal[:, 1]), np.max(petal[:, 1])

# Colour each predicted region: build a 100 x 100 grid (10000 points)
# spanning the data range and let the model classify every grid point.
x, y = np.meshgrid(
    np.linspace(length_min, length_max, 100),
    np.linspace(width_min, width_max, 100),
)
pre_x = np.stack([x.ravel(), y.ravel()], axis=1)
pre_y = pipe.predict(pre_x)

plt.title(u"分类效果")
plt.xlabel(u"petal_length")
plt.ylabel(u"petal_width")

# Background: grid points coloured by their predicted class (0, 1, 2).
for label, region_color in enumerate(("lightgray", "wheat", "lightskyblue")):
    region = pre_x[pre_y == label]
    plt.scatter(region[:, 0], region[:, 1], color=region_color)

# Foreground: the real samples, one marker shape per true class (0, 1, 2).
for label, sample_marker in enumerate(("*", "^", "v")):
    samples = petal[data.target == label]
    plt.scatter(samples[:, 0], samples[:, 1], marker=sample_marker)

plt.xlim(length_min, length_max)
plt.ylim(width_min, width_max)
plt.show()

# Save the fitted decision tree as a Graphviz .dot file; the generated file
# can be opened with the gvedit tool.
# The tree is fetched through the public ``named_steps`` accessor instead of
# the private ``_final_estimator`` attribute. The return value is not kept:
# export_graphviz only returns the dot source when ``out_file`` is None.
export_graphviz(pipe.named_steps["tree"], out_file="decision_tree.dot")


# Random forest: measure how the number of trees affects test accuracy.
pipe = Pipeline([("rf", RandomForestClassifier(max_features=2))])
tree_counts = range(1, 21)
forest_scores = [
    # fit() returns the pipeline itself, so score() can be chained onto it.
    pipe.set_params(rf__n_estimators=count)
        .fit(x_train, y_train)
        .score(x_test, y_test)
    for count in tree_counts
]

plt.plot(tree_counts, forest_scores)
plt.title(u"子树个数对准确率的影响")
plt.xlabel(u"子树个数")
plt.ylabel(u"准确率")
plt.show()

ALWAYS_FANG

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
决策树python实践

#coding=utf-8 import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.pip...
复制链接

扫一扫

专栏目录