参考资料:
1.github: https://github.com/fengdu78/lihang-code
2.李航 《统计学习方法》
基于sklearn 实现决策树
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
def create_data():
    """Load the first 100 iris samples (classes 0 and 1) as a binary dataset.

    Returns:
        (X, y): X is the first two feature columns (sepal length / width),
        y the corresponding labels, both as float arrays.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # Rows 0-99 cover exactly two classes; keep columns 0, 1 and the label.
    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    features, targets = subset[:, :2], subset[:, -1]
    return features, targets
# Bug fix: DecisionTreeClassifier was used below but never imported,
# which raised NameError at runtime.
from sklearn.tree import DecisionTreeClassifier

X, y = create_data()
# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Fit a CART decision tree on the training split.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# Mean accuracy on the held-out split. The original evaluated this
# expression and discarded the result (notebook leftover); print it so
# the script reports the score as well.
print(clf.score(X_test, y_test))
导入的包(库)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log
import pprint
《统计学习方法》书上数据集
def create_data():
    """Return the loan-application toy dataset from《统计学习方法》Table 5.1.

    Returns:
        datasets: 15 rows of [age, has_job, owns_house, credit, class].
        labels: the five column names matching each row position.
    """
    datasets = [
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Bug fix: the original returned the undefined name `label`,
    # which raised NameError; the list defined above is `labels`.
    return datasets, labels
# Materialize the textbook dataset and wrap it in a DataFrame for inspection.
datasets, labels = create_data()
train_data = pd.DataFrame(data=datasets, columns=labels)
熵: $H(X) = -\sum_{i=1}^{n} p_i \log p_i$
def cal_entropy(datasets):
    """Empirical entropy H(D) of the label column (each row's last element).

    Args:
        datasets: sequence of rows; the last element of each row is the label.
    Returns:
        Entropy in bits (log base 2).
    """
    data_length = len(datasets)
    # Count occurrences of each label. The original indexed rows with
    # `datasets[i-1]`, which still visits every row (the -1 wraps around)
    # but is needlessly confusing; iterate the rows directly instead.
    label_count = Counter(row[-1] for row in datasets)
    # H(D) = -sum_k (|C_k|/|D|) * log2(|C_k|/|D|)
    entropy = -sum((c / data_length) * log(c / data_length, 2)
                   for c in label_count.values())
    return entropy
条件熵: $H(Y|X) = \sum_{i=1}^{n} p_i H(Y|X=x_i)$,其中 $p_i = P(X=x_i)$
def cal_cond_entropy(datasets, axis=0):
    """Empirical conditional entropy H(D|A) for the feature in column `axis`.

    Args:
        datasets: sequence of rows; the last element of each row is the label.
        axis: column index of the conditioning feature.
    Returns:
        H(D|A) = sum_i (|D_i|/|D|) * H(D_i), where D_i groups rows sharing
        the i-th value of the feature.
    """
    total = len(datasets)
    # Partition the rows by their value of feature `axis`.
    partitions = {}
    for row in datasets:
        partitions.setdefault(row[axis], []).append(row)
    # Weighted sum of the entropy of each partition.
    return sum((len(subset) / total) * cal_entropy(subset)
               for subset in partitions.values())
信息增益: $g(D, A) = H(D) - H(D|A)$
def info_gain(entropy, cond_entropy):
    """Information gain g(D, A) = H(D) - H(D|A)."""
    gain = entropy - cond_entropy
    return gain
def info_gain_train(datasets):
    """Pick the root-split feature by maximizing information gain.

    Prints the gain of every feature (feature names come from the
    module-level `labels`) and returns a message naming the best one.
    """
    feature_count = len(datasets[0]) - 1
    base_entropy = cal_entropy(datasets)
    gains = []
    for axis in range(feature_count):
        # Information gain of each candidate feature.
        gain = info_gain(base_entropy, cal_cond_entropy(datasets, axis=axis))
        gains.append((axis, gain))
        print('特征({}) - info_gain - {:.3f}'.format(labels[axis], gain))
    best_axis, _ = max(gains, key=lambda item: item[1])
    return '特征({})的信息增益最大,选择为根节点特征'.format(labels[best_axis])
# Run the information-gain comparison (ID3 criterion) on the textbook dataset.
info_gain_train(np.array(datasets))
训练结果
信息增益比: $g_R(D, A) = \dfrac{g(D, A)}{H_A(D)}$,其中 $H_A(D)$ 是数据集 $D$ 关于特征 $A$ 取值的熵
def feature_entropy(datasets, axis=0):
    """Entropy H_A(D) of dataset D with respect to the values of feature `axis`.

    This is the denominator of the information gain ratio: it measures how
    evenly the feature's values split the dataset, in bits.
    """
    total = len(datasets)
    # Number of rows taking each distinct value of the feature.
    value_counts = Counter(row[axis] for row in datasets)
    # H_A(D) = -sum_i (|D_i|/|D|) * log2(|D_i|/|D|)
    return -sum((n / total) * log(n / total, 2)
                for n in value_counts.values())
def info_gain_ratio(datasets, axis=0):
    """Information gain ratio g_R(D, A) = g(D, A) / H_A(D) (C4.5 criterion).

    Robustness fix: when the feature takes a single value, H_A(D) == 0 and
    the original raised ZeroDivisionError; such a feature also has zero
    gain, so return 0.0 instead.
    """
    i_gain = info_gain(cal_entropy(datasets), cal_cond_entropy(datasets, axis=axis))
    f_entropy = feature_entropy(datasets, axis=axis)
    if f_entropy == 0:
        return 0.0
    return i_gain / f_entropy
def info_gain_ratio_train(datasets):
    """Pick the root-split feature by maximizing the information gain ratio.

    Prints the gain ratio of every feature (names from the module-level
    `labels`) and returns a message naming the best one.
    """
    count = len(datasets[0]) - 1
    # Fix: the original also computed `entropy = cal_entropy(datasets)` here
    # and never used it; info_gain_ratio computes it internally.
    best_feature = []
    for c in range(count):
        # Gain ratio of each candidate feature.
        c_info_gain_ratio = info_gain_ratio(datasets, axis=c)
        best_feature.append((c, c_info_gain_ratio))
        print('特征({}) - info_gain_ratio - {:.3f}'.format(labels[c], c_info_gain_ratio))
    # The feature with the largest gain ratio becomes the root split.
    best = max(best_feature, key=lambda x: x[-1])
    return '特征({})的信息增益比最大,选择为根节点特征'.format(labels[best[0]])
训练结果
基尼指数: $Gini(D) = \sum_{k=1}^{K} p_k (1 - p_k) = 1 - \sum_{k=1}^{K} p_k^2$
def Gini(datasets):
    """Gini index of the label column: Gini(D) = 1 - sum_k p_k^2.

    Args:
        datasets: sequence of rows; the last element of each row is the label.
    Returns:
        Gini impurity in [0, 1); 0.0 for a pure dataset.
    """
    total = len(datasets)
    # Tally the class labels.
    counts = {}
    for row in datasets:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # One minus the sum of squared class probabilities.
    return 1 - sum(math.pow(n / total, 2) for n in counts.values())
# 关于特征的基尼指数计算
def Gini_d(datasets, axis=0):
    """Gini index of D under each binary CART split on feature `axis`.

    For every value v the feature takes, splits D into D1 = {rows == v} and
    D2 = {rows != v} and scores
    Gini(D, A=v) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2).
    Prints each candidate split and the best (minimal) one; feature names
    come from the module-level `labels`.

    Returns:
        (candidates, best): the list of (value, gini) pairs, and a
        (feature_name, best_gini) tuple for the minimal-Gini split.
    """
    total = len(datasets)
    # Group the rows by their value of feature `axis`
    # (e.g. 年龄 -> 青年 / 中年 / 老年).
    groups = {}
    for row in datasets:
        groups.setdefault(row[axis], []).append(row)
    candidates = []
    for value, matching in groups.items():
        # Weighted impurity of the rows taking this feature value...
        same_part = (len(matching) / total) * Gini(matching)
        # ...plus the weighted impurity of all remaining rows.
        rest = [row for other, rows in groups.items() if other != value
                for row in rows]
        rest_part = (len(rest) / total) * Gini(rest)
        split_gini = same_part + rest_part
        # Record this value as a candidate split point.
        candidates.append((value, split_gini))
        print('特征({})为({}) - gini_axis_p - {:.3f}'.format(labels[axis], value, split_gini))
    # The split with minimal Gini index is the best cut point.
    best = min(candidates, key=lambda pair: pair[-1])
    print('特征({})中的({})最优切分点'.format(labels[axis], best[0]))
    return candidates, (labels[axis], best[1])
def gini_d_c_train(datasets):
    """Choose the root feature: the one whose best split has minimal Gini.

    Prints every feature's candidate splits (names from the module-level
    `labels`) and returns a message naming the winning feature.
    """
    feature_count = len(datasets[0]) - 1
    per_feature_best = []
    for axis in range(feature_count):
        all_splits, best_split = Gini_d(datasets, axis)
        per_feature_best.append(best_split)
        print('特征({}) - 基尼指数 - {}'.format(labels[axis], all_splits))
    # Smallest best-split Gini across all features wins.
    winner = min(per_feature_best, key=lambda pair: pair[-1])
    return '特征({})的基尼指数最小,选择为根节点特征'.format(winner[0])
# Run the CART Gini-index comparison on the textbook dataset.
gini_d_c_train(np.array(datasets))