Decision Trees
1. Decision tree - classification tree (DecisionTreeClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np
"""
加载数据
"""
X, y = load_iris(return_X_y=True)
"""
切分数据集
"""
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.15,
random_state=1)
"""
构建模型,将树最大深度设置为3
"""
dtc = DecisionTreeClassifier(max_depth=2)
"""
训练模型
"""
dtc.fit(X=X_train,y=y_train)
"""
模型预测
"""
y_pred = dtc.predict(X=X_test)
"""
画出构建好的树
"""
plot_tree(dtc)
"""
测试误差
"""
(y_pred == y_test).mean()
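Besides plot_tree, the fitted tree can also be inspected as plain text with sklearn.tree.export_text. The snippet below is a small sketch reusing the dtc model and the train/test split from above; the feature_names argument is optional and only makes the printed rules more readable.

from sklearn.tree import export_text

# Text rendering of the fitted tree: one line per node, showing the split rule
rules = export_text(dtc, feature_names=load_iris().feature_names)
print(rules)

# Built-in accuracy on the held-out data (same value as the comparison above)
print(dtc.score(X_test, y_test))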
2. Decision tree - regression tree (DecisionTreeRegressor)
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions, substitute another regression dataset such as fetch_california_housing.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np
"""
加载数据集
"""
X, y = load_boston(return_X_y=True)
"""
切分数据集
"""
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.15,
random_state=1)
"""
构建模型
"""
dtr = DecisionTreeRegressor(max_depth=2)
"""
训练模型
"""
dtr.fit(X=X_train,y=y_train)
"""
模型预测
"""
y_pred = dtr.predict(X=X_test)
"""
画出构建好的树
"""
plot_tree(dtr)
"""
测试误差
"""
((y_test - y_pred) **2).mean()
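The choice of max_depth controls how much the tree can fit the training data. One way to see the trade-off is to compare the test MSE across a few depths; the loop below is a sketch along those lines, reusing the split above (max_depth=None lets the tree grow until every leaf is pure).

# Compare test MSE for a few tree depths (deeper trees fit more, may overfit)
for depth in [1, 2, 4, 8, None]:
    model = DecisionTreeRegressor(max_depth=depth, random_state=1)
    model.fit(X_train, y_train)
    mse = ((y_test - model.predict(X_test)) ** 2).mean()
    print(depth, mse)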
3. Implementing the decision tree algorithm by hand (the snippets below assume the iris X_train and y_train from section 1: 4 features, 3 classes)
def get_entropy(y):
    """
    Compute the information entropy of a label array
    (assumes the 3 iris classes, encoded as 0, 1, 2).
    """
    if len(y) == 0:
        return 0
    # per-class counts, converted to proportions
    samples = np.array([y.tolist().count(idx) for idx in range(3)])
    # the small epsilon keeps np.log2 from returning NaN when a class count is 0
    p = samples / samples.sum() + 1e-6
    entropy = -(p * np.log2(p)).sum()
    return entropy
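This is the usual entropy formula H(y) = -Σ p_k · log2(p_k) over the class proportions p_k: a pure node has entropy 0 and a node with all three classes equally represented has entropy log2(3) ≈ 1.585. A quick check on toy labels (the label values here are made up purely for illustration):

# Pure node: entropy should be approximately 0
print(get_entropy(np.array([0, 0, 0, 0])))
# Evenly mixed node: entropy should be close to log2(3) ≈ 1.585
print(get_entropy(np.array([0, 1, 2, 0, 1, 2])))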
def get_gini(y):
    """
    Compute the Gini impurity of a label array
    (assumes the 3 iris classes, encoded as 0, 1, 2).
    """
    if len(y) == 0:
        return 0
    samples = np.array([y.tolist().count(idx) for idx in range(3)])
    p = samples / samples.sum()
    gini = (p * (1 - p)).sum()
    return gini
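Gini impurity follows the same pattern: Gini(y) = Σ p_k(1 - p_k) = 1 - Σ p_k², ranging from 0 for a pure node up to 2/3 for three equally likely classes. It is a cheaper alternative to entropy since it avoids the logarithm, and it is the default criterion in DecisionTreeClassifier. A quick comparison on the same toy labels:

# Pure node: gini = 0; evenly mixed node over 3 classes: gini = 2/3
print(get_gini(np.array([0, 0, 0, 0])))        # 0.0
print(get_gini(np.array([0, 1, 2, 0, 1, 2])))  # ≈ 0.667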
"""
遍历每个特征及其分隔值
寻找熵下降最快的分割点
"""
best_split = {"feature_idx" : 0,"feature_value" : 0,"entropy" : np.inf}
for feature_idx in range(4):
for value in set(X_train[:,feature_idx]):
left_idx = X_train[:,feature_idx] <= value
right_idx = X_train[:,feature_idx] > value
n_left = sum(left_idx)
n_right = sum(right_idx)
X_left = X_train[left_idx]
y_left = y_train[left_idx]
entropy_left = get_entropy(y_left)
X_right = X_train[right_idx]
y_right = y_train[right_idx]
entropy_right = get_entropy(y_right)
entropy = n_left / (n_left + n_right) * entropy_left + n_right / (n_left + n_right) * entropy_right
if entropy < best_split["entropy"]:
best_split["feature_idx"] = feature_idx
best_split['feature_value'] = value
best_split["entropy"] = entropy