第五章 决策树--部分定义代码实现

参考资料:

1.github: https://github.com/fengdu78/lihang-code
2.李航 《统计学习方法》

基于sklearn 实现决策树
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def create_data():
    """Load the first 100 iris samples and return (features, labels).

    Only the first two feature columns (sepal length / sepal width) are
    kept, so the task becomes a simple two-feature binary classification.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # First 100 rows = classes 0 and 1 only; columns 0, 1 and the label.
    subset = np.array(df.iloc[:100, [0, 1, -1]])
    features = subset[:, :2]
    targets = subset[:, -1]
    return features, targets

# Bug fix: DecisionTreeClassifier was used below without being imported
# anywhere in the file; sklearn is already a dependency of this script.
from sklearn.tree import DecisionTreeClassifier

X, y = create_data()
# 划分数据集 — hold out 30% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 模型训练 — fit a CART decision tree on the training split
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# 模型预测 — mean accuracy on the held-out test split
clf.score(X_test, y_test)

导入的包(库)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import pprint
《统计学习方法》书上数据集
def create_data():
    """Return the loan-application dataset from《统计学习方法》(Table 5.1).

    Returns:
        datasets: list of 15 rows; the last element of each row is the
            class label ('是' / '否').
        labels: the 4 feature column names plus the label column name.
    """
    datasets = [
        ['青年', '否' ,'否' , '一般', '否'],
        ['青年', '否' ,'否' , '好', '否'],
        ['青年', '是' ,'否' , '好', '是'],
        ['青年', '是' ,'是' , '一般', '是'],
        ['青年', '否' ,'否' , '一般', '否'],
        ['中年', '否' ,'否' , '一般', '否'],
        ['中年', '否' ,'否' , '好', '否'],
        ['中年', '是' ,'是' , '好', '是'],
        ['中年', '否' ,'是' , '非常好', '是'],
        ['中年', '否' ,'是' , '非常好', '是'],
        ['老年', '否' ,'是' , '非常好', '是'],
        ['老年', '否' ,'是' , '好', '是'],
        ['老年', '是' ,'否' , '好', '是'],
        ['老年', '是' ,'否' , '非常好', '是'],
        ['老年', '否' ,'否' , '一般', '否']
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Bug fix: the original returned the undefined name `label`
    # (NameError at call time); the defined local is `labels`.
    return datasets, labels
# Materialize the book dataset as a DataFrame for inspection.
datasets, labels = create_data()
train_data = pd.DataFrame(data=datasets, columns=labels)

在这里插入图片描述

熵: $H(X) = -\sum_{i=1}^{n} p_i \log p_i$
def cal_entropy(datasets):
    """Compute the empirical entropy H(D) of the label column.

    Args:
        datasets: sequence of rows; row[-1] is the class label.

    Returns:
        Entropy in bits (log base 2) as a float.
    """
    data_length = len(datasets)
    # Fix: the original indexed rows as `datasets[i-1]`, which only
    # covered every row by accident (index -1 wraps to the last row).
    # Count label frequencies directly instead.
    label_count = Counter(row[-1] for row in datasets)
    # H(D) = -sum_k p_k * log2(p_k)
    entropy = -sum((c / data_length) * log(c / data_length, 2)
                   for c in label_count.values())
    return entropy
条件熵: $H(X|Y) = -\sum_{x,y} P(x,y) \log P(x|y)$(原文缺少负号;对数据集 D 与特征 A 等价于 $H(D|A)=\sum_i \frac{|D_i|}{|D|} H(D_i)$)
def cal_cond_entropy(datasets, axis=0):
    """Conditional entropy H(D|A), where A is the feature at column `axis`.

    Partitions the rows by the feature's value and returns the
    sample-weighted average of each partition's label entropy.
    """
    total = len(datasets)
    # Group rows by the value taken by the chosen feature.
    partitions = {}
    for row in datasets:
        partitions.setdefault(row[axis], []).append(row)
    # Weighted average of per-partition entropies.
    return sum((len(part) / total) * cal_entropy(part)
               for part in partitions.values())
信息增益: $g(D, A) = H(D) - H(D|A)$
def info_gain(entropy, cond_entropy):
    """Information gain g(D, A) = H(D) - H(D|A)."""
    gain = entropy - cond_entropy
    return gain
def info_gain_train(datasets):
    """Select the root feature by the ID3 criterion (largest info gain).

    Relies on the module-level `labels` list for feature names.
    """
    feature_count = len(datasets[0]) - 1
    base_entropy = cal_entropy(datasets)
    gains = []
    for axis in range(feature_count):
        # Information gain for the feature in column `axis`.
        gain = info_gain(base_entropy, cal_cond_entropy(datasets, axis=axis))
        gains.append((axis, gain))
        print('特征({}) - info_gain - {:.3f}'.format(labels[axis], gain))

    best = max(gains, key=lambda pair: pair[-1])
    return '特征({})的信息增益最大,选择为根节点特征'.format(labels[best[0]])
# Run root-feature selection (ID3 criterion) on the book dataset.
info_gain_train(np.array(datasets))
训练结果

在这里插入图片描述

信息增益比: $g_R(D, A) = \dfrac{g(D, A)}{H_A(D)}$(分母为特征 A 在数据集 D 上取值分布的熵)
def feature_entropy(datasets, axis=0):
    """Entropy H_A(D) of the feature at column `axis`.

    Used as the denominator of the information-gain ratio (C4.5).
    """
    total = len(datasets)
    # Frequency of each distinct value taken by the feature.
    value_counts = Counter(row[axis] for row in datasets)
    # Same formula as label entropy, applied to feature values.
    return -sum((c / total) * log(c / total, 2)
                for c in value_counts.values())

def info_gain_ratio(datasets, axis=0):
    """Information-gain ratio g_R(D, A) = g(D, A) / H_A(D)."""
    # Numerator: plain information gain of the feature.
    gain = info_gain(cal_entropy(datasets),
                     cal_cond_entropy(datasets, axis=axis))
    # Denominator: entropy of the feature's own value distribution.
    split_info = feature_entropy(datasets, axis=axis)
    return gain / split_info
def info_gain_ratio_train(datasets):
    """Select the root feature by the C4.5 criterion (largest gain ratio).

    Relies on the module-level `labels` list for feature names.

    Fix: the original computed `entropy = cal_entropy(datasets)` and never
    used it; the dead computation is removed.
    """
    count = len(datasets[0]) - 1
    best_feature = []
    for c in range(count):
        # 计算每个特征的信息增益比 — gain ratio for column c
        c_info_gain_ratio = info_gain_ratio(datasets, axis=c)
        best_feature.append((c, c_info_gain_ratio))
        print('特征({}) - info_gain_ratio - {:.3f}'.format(labels[c], c_info_gain_ratio))
    # 获取信息增益比最大的特征 — feature with the largest gain ratio wins
    best = max(best_feature, key=lambda x: x[-1])
    return '特征({})的信息增益比最大,选择为根节点特征'.format(labels[best[0]])
训练结果

在这里插入图片描述

基尼指数: $Gini(D) = \sum_{k=1}^{K} p_k (1 - p_k) = 1 - \sum_{k=1}^{K} p_k^2$(原文第一个等式误写为含 $\log$ 的形式)
def Gini(datasets):
    """Gini index of the label column: Gini(D) = 1 - sum_k p_k^2.

    Args:
        datasets: sequence of rows; row[-1] is the class label.

    Returns:
        Gini impurity as a float in [0, 1).
    """
    data_length = len(datasets)
    # Count label frequencies of the last column.
    label_count = Counter(row[-1] for row in datasets)
    # Compute 1 - sum(p^2) directly instead of the original's
    # negate-then-add-one two-step (gini = -sum; gini_d = 1 + gini).
    return 1 - sum((c / data_length) ** 2 for c in label_count.values())
# 关于特征的基尼指数计算
def Gini_d(datasets, axis=0):
    """For the feature at column `axis`, compute the Gini index of every
    binary split "value == v" vs "value != v" and report the best one.

    Relies on the module-level `labels` list for feature names and prints
    one line per candidate split value.

    Returns:
        (gini_c, best): gini_c is a list of (value, gini) pairs; best is
        (feature_name, smallest_gini).
    """
    data_length = len(datasets)
    feature_sets = {}
    for i in range(data_length):
        feature = datasets[i][axis]
        if feature not in feature_sets:
            feature_sets[feature] = []
        # Group the rows by the value taken by this feature.
        feature_sets[feature].append(datasets[i])
    # (value, gini) pair for each candidate split value of the feature.
    gini_c = []
    # Iterate over every value the feature takes (e.g. for 年龄:
    # 青年, 中年, 老年), treating each as a binary split point.
    for p in feature_sets:
        p_data = feature_sets[p]
        # Weighted Gini of the subset where the feature equals p.
        gini_d_a = (len(p_data)/data_length) * Gini(p_data)
        # Collect every row whose feature value is NOT p.
        f_remain = []
        for key in feature_sets:
            if key != p:
                f_remain.extend(feature_sets[key])
        # NOTE(review): the reshape only re-packs the rows into a 2-D array
        # of the same shape extend already produced — presumably to
        # normalize list-of-arrays input; confirm it is actually needed.
        f_array_remain = np.array(f_remain)
        f_remain = f_array_remain.reshape((data_length-len(p_data), len(datasets[0])))
  
        # Weighted Gini of the complement subset (feature != p).
        gini_d_drop_a = (len(f_remain) / data_length) * Gini(f_remain)
        # Gini index of the binary split on value p:
        # Gini(D, A=p) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2)
        gini_p = gini_d_a + gini_d_drop_a
        # Record this value's split quality.
        gini_c.append((p, gini_p))
        # Report the Gini index for this candidate split value.
        print('特征({})为({}) - gini_axis_p - {:.3f}'.format(labels[axis], p,  gini_p))
    # The value with the smallest Gini index is the best split point.
    min_split = min(gini_c, key=lambda x:x[-1])
    print('特征({})中的({})最优切分点'.format(labels[axis], min_split[0]))  
    return gini_c, (labels[axis], min_split[1])
def gini_d_c_train(datasets):
    """Select the root feature by the CART criterion (smallest Gini index).

    Relies on the module-level `labels` list for feature names.
    """
    n_features = len(datasets[0]) - 1
    candidates = []
    for axis in range(n_features):
        # Best binary split for this feature plus all candidate splits.
        per_value_gini, best_split = Gini_d(datasets, axis)
        candidates.append(best_split)
        print('特征({}) - 基尼指数 - {}'.format(labels[axis], per_value_gini))
    winner = min(candidates, key=lambda pair: pair[-1])
    return '特征({})的基尼指数最小,选择为根节点特征'.format(winner[0])
# Run root-feature selection (CART criterion) on the book dataset.
gini_d_c_train(np.array(datasets))
训练结果

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值