Decision Trees
1. Decision tree - classification tree (DecisionTreeClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np
"""
加载数据
"""
X, y = load_iris(return_X_y=True)
"""
切分数据集
"""
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.15,
random_state=1)
"""
构建模型,将树最大深度设置为3
"""
dtc = DecisionTreeClassifier(max_depth=2)
"""
训练模型
"""
dtc.fit(X=X_train,y=y_train)
"""
模型预测
"""
y_pred = dtc.predict(X=X_test)
"""
画出构建好的树
"""
plot_tree(dtc)
"""
测试误差
"""
(y_pred == y_test).mean()
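Besides plot_tree, the fitted tree can also be inspected as plain text with sklearn.tree.export_text. The snippet below is a small sketch reusing the dtc model and the train/test split from above; the feature_names argument is optional and only makes the printed rules more readable.

from sklearn.tree import export_text

# Text rendering of the fitted tree: one line per node, showing the split rule
rules = export_text(dtc, feature_names=load_iris().feature_names)
print(rules)

# Built-in accuracy on the held-out data (same value as the comparison above)
print(dtc.score(X_test, y_test))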
2. Decision tree - regression tree (DecisionTreeRegressor)
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions, substitute another regression dataset such as fetch_california_housing.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np
"""
加载数据集
"""
X, y = load_boston(return_X_y=True)
"""
切分数据集
"""
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.15,
random_state=1)
"""
构建模型
"""
dtr = DecisionTreeRegressor(max_depth=2)
"""
训练模型
"""
dtr.fit(X=X_train,y=y_train)
"""
模型预测
"""
y_pred = dtr.predict(X=X_test)
"""
画出构建好的树
"""
plot_tree(dtr)
"""
测试误差
"""
((y_test - y_pred) **2).mean()
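The choice of max_depth controls how much the tree can fit the training data. One way to see the trade-off is to compare the test MSE across a few depths; the loop below is a sketch along those lines, reusing the split above (max_depth=None lets the tree grow until every leaf is pure).

# Compare test MSE for a few tree depths (deeper trees fit more, may overfit)
for depth in [1, 2, 4, 8, None]:
    model = DecisionTreeRegressor(max_depth=depth, random_state=1)
    model.fit(X_train, y_train)
    mse = ((y_test - model.predict(X_test)) ** 2).mean()
    print(depth, mse)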
3. Implementing the decision tree algorithm by hand (the snippets below assume the iris X_train and y_train from section 1: 4 features, 3 classes)
def get_entropy(y):
    """
    Compute the information entropy of a label array
    (assumes the 3 iris classes, encoded as 0, 1, 2).
    """
    if len(y) == 0:
        return 0
    # per-class counts, converted to proportions
    samples = np.array([y.tolist().count(idx) for idx in range(3)])
    # the small epsilon keeps np.log2 from returning NaN when a class count is 0
    p = samples / samples.sum() + 1e-6
    entropy = -(p * np.log2(p)).sum()
    return entropy
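This is the usual entropy formula H(y) = -Σ p_k · log2(p_k) over the class proportions p_k: a pure node has entropy 0 and a node with all three classes equally represented has entropy log2(3) ≈ 1.585. A quick check on toy labels (the label values here are made up purely for illustration):

# Pure node: entropy should be approximately 0
print(get_entropy(np.array([0, 0, 0, 0])))
# Evenly mixed node: entropy should be close to log2(3) ≈ 1.585
print(get_entropy(np.array([0, 1, 2, 0, 1, 2])))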
def get_gini(y):
    """
    Compute the Gini impurity of a label array
    (assumes the 3 iris classes, encoded as 0, 1, 2).
    """
    if len(y) == 0:
        return 0
    samples = np.array([y.tolist().count(idx) for idx in range(3)])
    p = samples / samples.sum()
    gini = (p * (1 - p)).sum()
    return gini
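Gini impurity follows the same pattern: Gini(y) = Σ p_k(1 - p_k) = 1 - Σ p_k², ranging from 0 for a pure node up to 2/3 for three equally likely classes. It is a cheaper alternative to entropy since it avoids the logarithm, and it is the default criterion in DecisionTreeClassifier. A quick comparison on the same toy labels:

# Pure node: gini = 0; evenly mixed node over 3 classes: gini = 2/3
print(get_gini(np.array([0, 0, 0, 0])))        # 0.0
print(get_gini(np.array([0, 1, 2, 0, 1, 2])))  # ≈ 0.667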
"""
遍历每个特征及其分隔值
寻找熵下降最快的分割点
"""
best_split = {"feature_idx" : 0,"feature_value" : 0,"entropy" : np.inf}
for feature_idx in range(4):
for value in set(X_train[:,feature_idx]):
left_idx = X_train[:,feature_idx] <= value
right_idx = X_train[:,feature_idx] > value
n_left = sum(left_idx)
n_right = sum(right_idx)
X_left = X_train[left_idx]
y_left = y_train[left_idx]
entropy_left = get_entropy(y_left)
X_right = X_train[right_idx]
y_right = y_train[right_idx]
entropy_right = get_entropy(y_right)
entropy = n_left / (n_left + n_right) * entropy_left + n_right / (n_left + n_right) * entropy_right
if entropy < best_split["entropy"]:
best_split["feature_idx"] = feature_idx
best_split['feature_value'] = value
best_split["entropy"] = entropy