python机器学习之决策树（decision tree）

英雄各有见

已于 2022-03-20 17:17:40 修改

阅读量5.3k

点赞数 8

分类专栏： python机器学习 | 文章标签： python、决策树、机器学习

于 2022-01-25 21:08:14 首次发布

本文链接：https://blog.csdn.net/qq_51539256/article/details/122674362

版权

python机器学习专栏收录该内容

5 篇文章 0 订阅

订阅专栏

文章目录

前言
一、环境
二、运用决策树进行分类
三、决策树进行回归（待更新）

前言

本文以鸢尾花（Iris）数据集为例，介绍决策树分类及其 Python 实现。

一、环境

操作系统： windows 10
IDE: pycharm(python 3.9)
浏览器：Microsoft Edge

二、运用决策树进行分类

1.数据预处理，划分数据集

代码如下：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
import pydotplus

# Feature names: sepal length, sepal width, petal length, petal width.
iris_feature_E = ('sepal length', 'sepal width', 'petal length', 'petal width')
iris_feature_E_2 = ('sepal length', 'sepal width')
iris_feature = (u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度')
iris_class = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')

# Load the raw data file. It is read without a header row, so the columns
# receive integer labels.
path = 'iris.data'
data = pd.read_csv(path, header=None)

# Columns 0-3 are used as the features.
x = data[[0, 1, 2, 3]]
print(x)

# Column 4 is the class label; Categorical(...).codes maps every distinct
# label to an integer code.
y = pd.Categorical(data[4]).codes

# Keep only the first two features so the result can be drawn in 2-D.
x = x.iloc[:, :2]

# 70% of the rows go to training. random_state=1 fixes the shuffle seed so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=1,
)
print(y_test.shape)

2.调用决策树分类器进行分类

代码如下：

# Train an entropy-criterion decision tree on the training split (fit()
# returns the estimator itself), then predict labels for the test rows.
model = DecisionTreeClassifier(criterion='entropy').fit(x_train, y_train)
y_test_hat = model.predict(x_test)

3.graphviz工具画决策树图像

代码如下：

# Export the fitted tree through Graphviz.
# A .dot file can also be rendered by hand with:
#   dot -Tpng my.dot -o my.png

# 1. Write the plain DOT description (option A: pass an open file handle).
with open('iris.dot', 'w') as f:
    tree.export_graphviz(model, out_file=f)
# Option B: let export_graphviz open the file itself.
# tree.export_graphviz(model, out_file='iris1.dot')

# 2. Build a styled graph in memory and save it as PDF.
# Alternative source for the graph:
# graph = pydotplus.graph_from_dot_file('iris.dot')
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=iris_feature_E_2,
    class_names=iris_class,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('iris.pdf')

# 3. Save the same graph as PNG. Render before opening the output file so a
# rendering failure does not leave an empty iris.png behind, and use a
# context manager so the handle is closed even if the write fails.
png_bytes = graph.create_png()
with open('iris.png', 'wb') as f:
    f.write(png_bytes)

决策树如下：

两变量决策树

4.画出分类后样本点的分布图像

# Plot the decision regions together with the sample points.

# The axis labels and title below are Chinese. Matplotlib's default font has
# no CJK glyphs, so put CJK-capable fonts first in the sans-serif list
# (existing entries are kept as fallbacks). Fonts like SimHei lack the
# Unicode minus sign, hence axes.unicode_minus is switched off.
cjk_fonts = ['SimHei', 'Microsoft YaHei']
mpl.rcParams['font.sans-serif'] = cjk_fonts + [
    name for name in mpl.rcParams['font.sans-serif'] if name not in cjk_fonts
]
mpl.rcParams['axes.unicode_minus'] = False

N, M = 50, 50  # number of grid samples along the horizontal / vertical axis
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # grid of sampling points
x_show = np.stack((x1.flat, x2.flat), axis=1)  # one row per grid point
print(x_show.shape)

cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = model.predict(x_show)  # predicted class for every grid point
y_show_hat = y_show_hat.reshape(x1.shape)  # back to the grid's shape
print(y_show_hat)

plt.figure(facecolor='w')
# Background: predicted class over the grid.
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)
# Test samples: large stars drawn on top.
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150,
            zorder=10, cmap=cm_dark, marker='*')
# All samples: small dots.
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)
plt.xlabel(iris_feature[0], fontsize=15)
plt.ylabel(iris_feature[1], fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
plt.show()

样本点的分布：

样本点分布图像

5.模型评估

# Accuracy of the fitted model on the test split.
y_test = y_test.reshape(-1)
print(y_test_hat)
print(y_test)
result = (y_test_hat == y_test)  # True where the prediction is correct
acc = np.mean(result)
print('准确度: %.2f%%' % (100 * acc))

# Test error as a function of the maximum tree depth.
depth = np.arange(1, 15)
err_list = []
for d in depth:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
    clf.fit(x_train, y_train)
    y_test_hat = clf.predict(x_test)  # predictions on the test split
    result = (y_test_hat == y_test)  # True where the prediction is correct
    if d == 1:
        print(result)
    err = 1 - np.mean(result)
    err_list.append(err)
    print(d, ' 错误率: %.2f%%' % (100 * err))

# The labels and title below are Chinese. Matplotlib's default font has no
# CJK glyphs, so put CJK-capable fonts first in the sans-serif list (existing
# entries are kept as fallbacks). Fonts like SimHei lack the Unicode minus
# sign, hence axes.unicode_minus is switched off.
cjk_fonts = ['SimHei', 'Microsoft YaHei']
mpl.rcParams['font.sans-serif'] = cjk_fonts + [
    name for name in mpl.rcParams['font.sans-serif'] if name not in cjk_fonts
]
mpl.rcParams['axes.unicode_minus'] = False

plt.figure(facecolor='w')
plt.plot(depth, err_list, 'ro-', lw=2)
plt.xlabel(u'决策树深度', fontsize=15)
plt.ylabel(u'错误率', fontsize=15)
plt.title(u'决策树深度与过拟合', fontsize=17)
plt.grid(True)
plt.show()