机器学习实现决策树Decision Tree

还不秃顶的计科生

已于 2024-10-08 15:34:53 修改

阅读量92

点赞数 1

分类专栏：机器学习　文章标签：机器学习、决策树、人工智能

于 2024-10-08 12:38:10 首次发布

本文链接：https://blog.csdn.net/weixin_74009895/article/details/142751211

版权

机器学习专栏收录该内容

51 篇文章 0 订阅

订阅专栏

第一部分：决策树核心思想和原理

分而治之，逐个击破，民主决策思想

决策树是一种常用的分类和回归模型，主要通过构建一棵树状结构来实现数据的分类或回归。

决策树是一种监督学习算法

可以用于实现分类也可以回归

决策树的深度：也就是决策树的层数

决策树需要研究的三个基本问题：

①特征选择(谁作为根节点呢)

②节点分裂（应该作为二分类还是多分类呢）

③阈值确定（划分条件的取值：特征值大于或小于多少时判为 yes 或 no）

决策树属于一个“白盒”模型，可解释性很强，不像神经网络一样，神经网络是一个“黑盒模型”

第二部分：信息熵

详细讲解内容见我博客：

机器学习之损失函数大汇总~MSE+MAE+对数损失函数+交叉熵损失函数-CSDN博客

第三部分：决策树分类问题代码实现（熵曲线为二分类情形，数据集为鸢尾花三分类）

（1）第一部分：导包

#第一部分：导包
import numpy as np
import matplotlib.pyplot as plt

（2）第二部分：创建二分类问题熵变化图像

#第二部分：创建二分类问题熵变化图像
def entropy(p):
    """Return the binary (two-class) entropy, in bits, of probability ``p``.

    Computes ``-(p*log2(p) + (1-p)*log2(1-p))`` element-wise.

    Args:
        p: A scalar or array-like of probabilities.

    Returns:
        The entropy for each entry of ``p``. Entries equal to 0 or 1 yield 0
        (the limit of ``t*log2(t)`` as ``t -> 0``) instead of NaN. Values
        outside [0, 1] still produce NaN.
    """
    p = np.asarray(p)
    q = 1.0 - p
    # 0 * log2(0) evaluates to 0 * -inf = NaN; silence the warning and
    # substitute the limiting value 0 for exactly those terms.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    return -(p_term + q_term)
# 200 evenly spaced probabilities in [0.001, 0.999], i.e. excluding the endpoints 0 and 1.
plot_x = np.linspace(0.001,0.999,200)
plt.plot(plot_x, entropy(plot_x))
plt.show()  # display how the entropy changes with p

（3）第三部分：创建数据集并绘制所有样本点

# Part 3: build the dataset and plot every sample
from sklearn.datasets import load_iris
iris = load_iris()
x=iris.data[:,1:3]  # keep only the columns at index 1 and 2 (slice 1:3), i.e. the 2nd and 3rd feature columns
y=iris.target
plt.scatter(x[:,0],x[:,1],c=y)  # horizontal axis: x[:,0], vertical axis: x[:,1], coloured by label
plt.show()

（4）第四部分：sklearn中的决策树

# Part 4: the decision tree from sklearn
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')  # limit the tree depth to 2 and split by entropy
clf.fit(x, y)

（5）第五部分：使用决策边界来绘制样本点分类散布情况

#第五部分：使用决策边界来绘制样本点分类散布情况
def decision_boundary_plot(X, y, clf):
    """Draw the classifier's predicted regions and overlay the samples.

    A grid with step 0.01 is laid over the range of the first two columns of
    ``X`` (padded by 1 on every side), ``clf.predict`` is evaluated on each
    grid point, the predictions are drawn as filled contours, the samples are
    scattered on top and the figure is shown.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot axes.
        y: Values passed to ``plt.scatter`` as the point colours.
        clf: Object exposing ``predict`` for two-column inputs.
    """
    from matplotlib.colors import ListedColormap

    pad, step = 1, 0.01
    first, second = X[:, 0], X[:, 1]
    grid_x1, grid_x2 = np.meshgrid(
        np.arange(first.min() - pad, first.max() + pad, step),
        np.arange(second.min() - pad, second.max() + pad, step),
    )
    # Flatten the grid into (n_points, 2) for prediction, then restore its shape.
    grid_points = np.c_[grid_x1.ravel(), grid_x2.ravel()]
    predicted = clf.predict(grid_points).reshape(grid_x1.shape)
    region_colors = ListedColormap(['#F5B9EF', '#BBFFBB', '#F9F9CB'])
    plt.contourf(grid_x1, grid_x2, predicted, cmap=region_colors)
    plt.scatter(first, second, c=y)
    plt.show()
# Draw the decision regions of the fitted classifier over the samples.
decision_boundary_plot(x,y,clf)

（6）第六部分：可视化分类结构图

# Part 6: visualise the structure of the fitted tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Draw the decision tree
plt.figure(figsize=(10, 6))  # set the figure size
plot_tree(clf, filled=True)  # `filled=True` fills the nodes with colours for the different classes
plt.show()

（7）完整pycharm代码实现

#第一部分：导包
import numpy as np
import matplotlib.pyplot as plt
#第二部分：创建二分类问题熵变化图像
def entropy(p):
    """Return the binary (two-class) entropy, in bits, of probability ``p``.

    Computes ``-(p*log2(p) + (1-p)*log2(1-p))`` element-wise.

    Args:
        p: A scalar or array-like of probabilities.

    Returns:
        The entropy for each entry of ``p``. Entries equal to 0 or 1 yield 0
        (the limit of ``t*log2(t)`` as ``t -> 0``) instead of NaN. Values
        outside [0, 1] still produce NaN.
    """
    p = np.asarray(p)
    q = 1.0 - p
    # 0 * log2(0) evaluates to 0 * -inf = NaN; silence the warning and
    # substitute the limiting value 0 for exactly those terms.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    return -(p_term + q_term)
# 200 evenly spaced probabilities in [0.001, 0.999], i.e. excluding the endpoints 0 and 1.
plot_x = np.linspace(0.001,0.999,200)
plt.plot(plot_x, entropy(plot_x))
plt.show()  # display how the entropy changes with p
# Part 3: build the dataset and plot every sample
from sklearn.datasets import load_iris
iris = load_iris()
x=iris.data[:,1:3]  # keep only the columns at index 1 and 2 (slice 1:3), i.e. the 2nd and 3rd feature columns
y=iris.target
plt.scatter(x[:,0],x[:,1],c=y)  # horizontal axis: x[:,0], vertical axis: x[:,1], coloured by label
plt.show()
# Part 4: the decision tree from sklearn
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')  # limit the tree depth to 2 and split by entropy
clf.fit(x, y)
#第五部分：使用决策边界来绘制样本点分类散布情况
def decision_boundary_plot(X, y, clf):
    """Draw the classifier's predicted regions and overlay the samples.

    A grid with step 0.01 is laid over the range of the first two columns of
    ``X`` (padded by 1 on every side), ``clf.predict`` is evaluated on each
    grid point, the predictions are drawn as filled contours, the samples are
    scattered on top and the figure is shown.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot axes.
        y: Values passed to ``plt.scatter`` as the point colours.
        clf: Object exposing ``predict`` for two-column inputs.
    """
    from matplotlib.colors import ListedColormap

    pad, step = 1, 0.01
    first, second = X[:, 0], X[:, 1]
    grid_x1, grid_x2 = np.meshgrid(
        np.arange(first.min() - pad, first.max() + pad, step),
        np.arange(second.min() - pad, second.max() + pad, step),
    )
    # Flatten the grid into (n_points, 2) for prediction, then restore its shape.
    grid_points = np.c_[grid_x1.ravel(), grid_x2.ravel()]
    predicted = clf.predict(grid_points).reshape(grid_x1.shape)
    region_colors = ListedColormap(['#F5B9EF', '#BBFFBB', '#F9F9CB'])
    plt.contourf(grid_x1, grid_x2, predicted, cmap=region_colors)
    plt.scatter(first, second, c=y)
    plt.show()
# Draw the decision regions of the fitted classifier over the samples.
decision_boundary_plot(x,y,clf)
# Part 6: visualise the structure of the fitted tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Draw the decision tree
plt.figure(figsize=(10, 6))  # set the figure size
plot_tree(clf, filled=True)  # `filled=True` fills the nodes with colours for the different classes
plt.show()

（8）对第六部分详细阐述其内部原理

①计算熵的函数

from collections import Counter
import numpy as np
# 计算熵的函数
def calc_entropy(y):
    """Return the entropy (base 2) of the labels in ``y``.

    Args:
        y: Sized iterable of hashable class labels.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct labels, where ``p`` is
        each label's relative frequency; ``0`` when ``y`` is empty.
    """
    total = len(y)
    class_counts = Counter(y).values()  # occurrences of each distinct label
    probabilities = (count / total for count in class_counts)
    return sum(-(prob * np.log2(prob)) for prob in probabilities)

②划分数据集的函数

# 划分数据集的函数，dim 代表维度，value 代表划分阈值
def split_dataset(x, y, dim, value):
    """Split samples and labels by thresholding one feature column.

    Args:
        x: 2-D array of samples.
        y: Array of labels indexable by a boolean mask over the rows of ``x``.
        dim: Index of the column of ``x`` to compare.
        value: Threshold; rows with ``x[:, dim] <= value`` go left, rows with
            ``x[:, dim] > value`` go right.

    Returns:
        Tuple ``(x_left, y_left, x_right, y_right)``.
    """
    column = x[:, dim]
    left_mask, right_mask = column <= value, column > value
    left_part = (x[left_mask], y[left_mask])
    right_part = (x[right_mask], y[right_mask])
    return (*left_part, *right_part)

③寻找最优划分条件的函数

# 寻找最优划分条件的函数
def find_best_split(x, y):
    """Search every feature for the threshold with the lowest weighted entropy.

    For each column, samples are sorted and the midpoint between each pair of
    adjacent, differing values is tried as a threshold. The split whose
    size-weighted average of left/right entropies is strictly smallest wins;
    the first one found is kept on ties.

    Args:
        x: 2-D array of samples.
        y: Labels aligned with the rows of ``x``.

    Returns:
        Tuple ``(best_dim, best_value, best_entropy, best_entropy_left,
        best_entropy_right)``. When no candidate threshold exists this is
        ``(-1, -1, np.inf, -1, -1)``.
    """
    n_samples = x.shape[0]
    # (dim, threshold, weighted entropy, left entropy, right entropy)
    best = (-1, -1, np.inf, -1, -1)
    for dim in range(x.shape[1]):
        order = np.argsort(x[:, dim])
        # Walk over neighbouring samples in ascending order of this feature.
        for lo_idx, hi_idx in zip(order[:-1], order[1:]):
            lo, hi = x[lo_idx, dim], x[hi_idx, dim]
            if lo == hi:
                continue  # equal neighbours give no threshold between them
            threshold = (lo + hi) / 2  # candidate threshold
            x_l, y_l, x_r, y_r = split_dataset(x, y, dim, threshold)
            ent_l = calc_entropy(y_l)
            ent_r = calc_entropy(y_r)
            weighted = (len(x_l) * ent_l + len(x_r) * ent_r) / n_samples
            if weighted < best[2]:
                best = (dim, threshold, weighted, ent_l, ent_r)
    return best
# Print (best_dim, best_value, best_entropy, best_entropy_left, best_entropy_right) for the dataset.
print(find_best_split(x,y))

④完整pycharm汇总

#第一部分：导包
import numpy as np
import matplotlib.pyplot as plt
#第二部分：创建二分类问题熵变化图像
def entropy(p):
    """Return the binary (two-class) entropy, in bits, of probability ``p``.

    Computes ``-(p*log2(p) + (1-p)*log2(1-p))`` element-wise.

    Args:
        p: A scalar or array-like of probabilities.

    Returns:
        The entropy for each entry of ``p``. Entries equal to 0 or 1 yield 0
        (the limit of ``t*log2(t)`` as ``t -> 0``) instead of NaN. Values
        outside [0, 1] still produce NaN.
    """
    p = np.asarray(p)
    q = 1.0 - p
    # 0 * log2(0) evaluates to 0 * -inf = NaN; silence the warning and
    # substitute the limiting value 0 for exactly those terms.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    return -(p_term + q_term)
# 200 evenly spaced probabilities in [0.001, 0.999], i.e. excluding the endpoints 0 and 1.
plot_x = np.linspace(0.001,0.999,200)
plt.plot(plot_x, entropy(plot_x))
plt.show()  # display how the entropy changes with p
# Part 3: build the dataset and plot every sample
from sklearn.datasets import load_iris
iris = load_iris()
x=iris.data[:,1:3]  # keep only the columns at index 1 and 2 (slice 1:3), i.e. the 2nd and 3rd feature columns
y=iris.target
plt.scatter(x[:,0],x[:,1],c=y)  # horizontal axis: x[:,0], vertical axis: x[:,1], coloured by label
plt.show()
# Part 4: the decision tree from sklearn
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')  # limit the tree depth to 2 and split by entropy
clf.fit(x, y)
#第五部分：使用决策边界来绘制样本点分类散布情况
def decision_boundary_plot(X, y, clf):
    """Draw the classifier's predicted regions and overlay the samples.

    A grid with step 0.01 is laid over the range of the first two columns of
    ``X`` (padded by 1 on every side), ``clf.predict`` is evaluated on each
    grid point, the predictions are drawn as filled contours, the samples are
    scattered on top and the figure is shown.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot axes.
        y: Values passed to ``plt.scatter`` as the point colours.
        clf: Object exposing ``predict`` for two-column inputs.
    """
    from matplotlib.colors import ListedColormap

    pad, step = 1, 0.01
    first, second = X[:, 0], X[:, 1]
    grid_x1, grid_x2 = np.meshgrid(
        np.arange(first.min() - pad, first.max() + pad, step),
        np.arange(second.min() - pad, second.max() + pad, step),
    )
    # Flatten the grid into (n_points, 2) for prediction, then restore its shape.
    grid_points = np.c_[grid_x1.ravel(), grid_x2.ravel()]
    predicted = clf.predict(grid_points).reshape(grid_x1.shape)
    region_colors = ListedColormap(['#F5B9EF', '#BBFFBB', '#F9F9CB'])
    plt.contourf(grid_x1, grid_x2, predicted, cmap=region_colors)
    plt.scatter(first, second, c=y)
    plt.show()
# Draw the decision regions of the fitted classifier over the samples.
decision_boundary_plot(x,y,clf)
# Part 6: visualise the structure of the fitted tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Draw the decision tree
plt.figure(figsize=(10, 6))  # set the figure size
plot_tree(clf, filled=True)  # `filled=True` fills the nodes with colours for the different classes
plt.show()

from collections import Counter
import numpy as np
# 计算熵的函数
def calc_entropy(y):
    """Return the entropy (base 2) of the labels in ``y``.

    Args:
        y: Sized iterable of hashable class labels.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct labels, where ``p`` is
        each label's relative frequency; ``0`` when ``y`` is empty.
    """
    total = len(y)
    class_counts = Counter(y).values()  # occurrences of each distinct label
    probabilities = (count / total for count in class_counts)
    return sum(-(prob * np.log2(prob)) for prob in probabilities)

# 划分数据集的函数，dim 代表维度，value 代表划分阈值
def split_dataset(x, y, dim, value):
    """Split samples and labels by thresholding one feature column.

    Args:
        x: 2-D array of samples.
        y: Array of labels indexable by a boolean mask over the rows of ``x``.
        dim: Index of the column of ``x`` to compare.
        value: Threshold; rows with ``x[:, dim] <= value`` go left, rows with
            ``x[:, dim] > value`` go right.

    Returns:
        Tuple ``(x_left, y_left, x_right, y_right)``.
    """
    column = x[:, dim]
    left_mask, right_mask = column <= value, column > value
    left_part = (x[left_mask], y[left_mask])
    right_part = (x[right_mask], y[right_mask])
    return (*left_part, *right_part)

# 寻找最优划分条件的函数
def find_best_split(x, y):
    """Search every feature for the threshold with the lowest weighted entropy.

    For each column, samples are sorted and the midpoint between each pair of
    adjacent, differing values is tried as a threshold. The split whose
    size-weighted average of left/right entropies is strictly smallest wins;
    the first one found is kept on ties.

    Args:
        x: 2-D array of samples.
        y: Labels aligned with the rows of ``x``.

    Returns:
        Tuple ``(best_dim, best_value, best_entropy, best_entropy_left,
        best_entropy_right)``. When no candidate threshold exists this is
        ``(-1, -1, np.inf, -1, -1)``.
    """
    n_samples = x.shape[0]
    # (dim, threshold, weighted entropy, left entropy, right entropy)
    best = (-1, -1, np.inf, -1, -1)
    for dim in range(x.shape[1]):
        order = np.argsort(x[:, dim])
        # Walk over neighbouring samples in ascending order of this feature.
        for lo_idx, hi_idx in zip(order[:-1], order[1:]):
            lo, hi = x[lo_idx, dim], x[hi_idx, dim]
            if lo == hi:
                continue  # equal neighbours give no threshold between them
            threshold = (lo + hi) / 2  # candidate threshold
            x_l, y_l, x_r, y_r = split_dataset(x, y, dim, threshold)
            ent_l = calc_entropy(y_l)
            ent_r = calc_entropy(y_r)
            weighted = (len(x_l) * ent_l + len(x_r) * ent_r) / n_samples
            if weighted < best[2]:
                best = (dim, threshold, weighted, ent_l, ent_r)
    return best
# Print (best_dim, best_value, best_entropy, best_entropy_left, best_entropy_right) for the dataset.
print(find_best_split(x,y))