机器学习-决策树

决策树

1.决策树 - 分类树(DecisionTreeClassifier)

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import numpy as np

# Load the iris dataset as a feature matrix X and a label vector y.
X, y = load_iris(return_X_y=True)

# Hold out 15% of the samples as the test split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=1
)

# Build the classifier with the tree depth capped at 2.
dtc = DecisionTreeClassifier(max_depth=2)

# Fit the tree on the training split.
dtc.fit(X_train, y_train)

# Predict class labels for the held-out samples.
y_pred = dtc.predict(X_test)

# Draw the fitted tree.
plot_tree(dtc)

# Fraction of test samples predicted correctly. Note that this is the
# accuracy, not the error: the error rate would be one minus this value.
np.mean(y_pred == y_test)

2.决策树 - 回归树(DecisionTreeRegressor)

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import numpy as np

# Load the Boston housing dataset as a feature matrix X and a target vector y.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this section only runs on older releases -- confirm the installed
# version or switch to another regression dataset.
X, y = load_boston(return_X_y=True)

# Hold out 15% of the samples as the test split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=1
)

# Build the regressor with the tree depth capped at 2.
dtr = DecisionTreeRegressor(max_depth=2)

# Fit the tree on the training split.
dtr.fit(X_train, y_train)

# Predict target values for the held-out samples.
y_pred = dtr.predict(X_test)

# Draw the fitted tree.
plot_tree(dtr)

# Test error, measured as the mean squared error on the held-out samples.
np.mean((y_test - y_pred) ** 2)

3.实现决策树算法

def get_entropy(y):
    """Return the Shannon entropy (in bits) of the label collection *y*.

    The classes are taken from the distinct values that actually occur in
    *y*, so any number of classes and any label values are supported.

    Args:
        y: 1-D array-like of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed classes.  An
        empty *y* yields 0.0, and a single-class *y* yields exactly 0.0.
    """
    if len(y) == 0:
        return 0.0

    # Only classes that are present get a count, so every probability is
    # strictly positive and log2 is always defined -- no epsilon is needed.
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    entropy = -(p * np.log2(p)).sum()

    # Adding 0.0 turns the -0.0 produced for a pure node into +0.0.
    return entropy + 0.0

def get_gini(y):
    """Return the Gini impurity of the label collection *y*.

    The classes are taken from the distinct values that actually occur in
    *y*, so any number of classes and any label values are supported.

    Args:
        y: 1-D array-like of class labels.

    Returns:
        The impurity ``sum(p * (1 - p))`` over the observed classes.  An
        empty *y* yields 0.0, matching how get_entropy treats empty input.
    """
    # Guard the empty case explicitly; otherwise the normalisation below
    # would divide by zero and produce NaN.
    if len(y) == 0:
        return 0.0

    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()

    return (p * (1 - p)).sum()

# Exhaustive search for the single best split of the training data: try every
# feature and every distinct value of that feature as a threshold, and keep
# the split whose weighted child entropy is lowest.
#
# NOTE: X_train / y_train must hold classification data here (for example the
# iris split from section 1).  Section 2 rebinds both names to the regression
# data, so re-run the section 1 split first if both sections share a session.
best_split = {"feature_idx": 0, "feature_value": 0, "entropy": np.inf}

# Take the feature count from the data instead of hard-coding it.
for feature_idx in range(X_train.shape[1]):
    column = X_train[:, feature_idx]

    # np.unique yields the candidate thresholds in ascending order, so ties
    # are always resolved in favour of the smallest threshold.
    for value in np.unique(column):
        left_idx = column <= value
        right_idx = column > value

        # Vectorised counts of the samples falling on each side.
        n_left = left_idx.sum()
        n_right = right_idx.sum()
        n_split = n_left + n_right

        entropy_left = get_entropy(y_train[left_idx])
        entropy_right = get_entropy(y_train[right_idx])

        # Child entropies weighted by the share of samples in each child.
        entropy = n_left / n_split * entropy_left + n_right / n_split * entropy_right

        if entropy < best_split["entropy"]:
            best_split["feature_idx"] = feature_idx
            best_split["feature_value"] = value
            best_split["entropy"] = entropy

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值