1.原论文
这篇文章是 2018 年的文章,ICML Workshop on Human Interpretability in Machine Learning (WHI 2018)。
论文地址:https://arxiv.org/pdf/1806.06988.pdf
源码:https://github.com/wOOL/DNDT
2.论文介绍
这个博主讲得非常清晰:https://zhuanlan.zhihu.com/p/273383418
3.完善后的代码(包含与传统决策树的比较):
import numpy as np
import torch
import torch.nn.functional as F
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import time
import random
from sklearn.base import BaseEstimator
import warnings
# One-hot encoding for binary labels
def ont_hot_code(labels):
    """One-hot encode binary labels (0/1) into two indicator columns.

    NOTE: the function name keeps the original typo ("ont" for "one")
    because the script below calls it by this exact name.

    Parameters
    ----------
    labels : array-like of int
        Binary class labels; every entry must be 0 or 1.

    Returns
    -------
    cn_label, mci_label : np.ndarray of float64
        Indicator column for class 0 and class 1 respectively.
    """
    labels = np.asarray(labels)
    # Pure-numpy one-hot: equivalent to sklearn's OneHotEncoder fitted on
    # categories [[0], [1]] (column order follows the sorted categories),
    # without pulling in the extra dependency.
    cn_label = (labels == 0).astype(float)
    mci_label = (labels == 1).astype(float)
    return cn_label, mci_label
class NNDT_Classifier(BaseEstimator):
    """Deep Neural Decision Tree (DNDT) classifier.

    Implements the soft-binning tree of Yang et al., "Deep Neural Decision
    Trees" (WHI 2018): every feature passes through a differentiable binning
    layer, the per-feature bin memberships are combined with Kronecker
    products into leaf memberships, and a leaf-score matrix maps leaves to
    class logits.  Cut points and leaf scores are trained end-to-end with
    Adam.
    """

    def __init__(self, num_cut, num_class, epoch, temperature,
                 num_leaf=None, cut_points_list=None, leaf_score=None,
                 W=None, b=None):
        # num_cut[i] is the number of cut points for feature i; a feature
        # with D cut points is softly assigned to D + 1 bins.
        self.num_cut = num_cut
        self.num_leaf = np.prod(np.array(num_cut) + 1)
        self.num_class = num_class
        # Softmax temperature: smaller values make the binning "harder"
        # (closer to a crisp decision tree).  Fixed hyper-parameter.
        self.temperature = torch.tensor(temperature)
        # Learnable parameters: one cut-point vector per feature, plus the
        # leaf-to-class score matrix.
        self.cut_points_list = [torch.rand([i], requires_grad=True) for i in num_cut]
        self.leaf_score = torch.rand([self.num_leaf, self.num_class], requires_grad=True)
        self.loss_function = torch.nn.CrossEntropyLoss()
        # Bug fix: the original also passed self.temperature to Adam, but it
        # has requires_grad=False, so the optimizer could never update it.
        # It is dropped from the parameter list; training is unchanged.
        self.optimizer = torch.optim.Adam(
            self.cut_points_list + [self.leaf_score], lr=0.01)
        self.epoch = epoch
        # W and b are populated by torch_bin on each forward pass.
        self.W = W
        self.b = b

    def torch_kron_prod(self, a, b):
        """Row-wise Kronecker product: (N, p) x (N, q) -> (N, p*q).

        Used to fold per-feature bin memberships into joint (leaf)
        memberships.
        """
        res = torch.einsum('ij,ik->ijk', [a, b])
        res = torch.reshape(res, [-1, np.prod(res.shape[1:])])
        return res

    def torch_bin(self, x, cut_points, temperature):
        """Soft binning of one feature column.

        x is an (N, 1) column vector and cut_points a length-D vector; the
        result is an (N, D+1) soft one-hot matrix of bin memberships,
        computed with a temperature-scaled softmax.
        """
        D = cut_points.shape[0]
        self.W = torch.reshape(torch.linspace(1.0, D + 1.0, D + 1), [1, -1])
        # Sorting keeps the cut points monotonically increasing, so the
        # bins stay well ordered regardless of the learned values.
        cut_points, _ = torch.sort(cut_points)
        # Cumulative sum of the negated cut points yields the bias term.
        self.b = torch.cumsum(torch.cat([torch.zeros([1]), -cut_points], 0), 0)
        h = torch.matmul(x, self.W) + self.b
        # Bug fix: the original ignored the `temperature` argument and read
        # self.temperature instead.  The caller passes the same value, so
        # behaviour is unchanged, but the signature is now honoured.
        h = h / temperature
        return F.softmax(h, dim=1)

    def nn_decision_tree(self, x):
        """Forward pass: (N, num_features) float tensor -> (N, num_class) logits."""
        # Bin each feature column, then reduce the memberships with
        # Kronecker products to obtain the leaf-membership matrix.
        leaf = reduce(self.torch_kron_prod,
                      map(lambda z: self.torch_bin(x[:, z[0]:z[0] + 1], z[1], self.temperature),
                          enumerate(self.cut_points_list)))
        return torch.matmul(leaf, self.leaf_score)

    def fit(self, X_train, y_train):
        """Train the tree with Adam.

        X_train : np.ndarray (N, num_features); y_train : one-hot
        np.ndarray (N, num_class).  Prints loss/error every 200 epochs and
        returns the training-set logits of the last forward pass.
        """
        x = torch.from_numpy(X_train.astype(np.float32))
        # CrossEntropyLoss expects integer class indices, not one-hot rows.
        y = torch.from_numpy(np.argmax(y_train, axis=1))
        # Robustness: compute once up front so the return value is defined
        # even when self.epoch == 0.
        y_pred = self.nn_decision_tree(x)
        # Bug fix: the original hard-coded range(1000) and ignored the
        # `epoch` constructor argument.
        for i in range(self.epoch):
            self.optimizer.zero_grad()  # reset accumulated gradients
            y_pred = self.nn_decision_tree(x)
            loss = self.loss_function(y_pred, y)
            loss.backward()
            self.optimizer.step()
            if i % 200 == 0:
                print("epoch %d loss= %f" % (i, loss.detach().numpy()))
                print('error rate %.2f' % (
                    1 - np.mean(np.argmax(y_pred.detach().numpy(), axis=1)
                                == np.argmax(y_train, axis=1))))
        return y_pred

    def predict(self, X_test):
        """Return raw class logits for X_test (apply argmax for labels)."""
        x = torch.from_numpy(X_test.astype(np.float32))
        return self.nn_decision_tree(x)

    def get_params(self, deep=True):
        """sklearn-compatible parameter accessor (mirrors BaseEstimator's)."""
        out = dict()
        for key in self._get_param_names():
            try:
                value = getattr(self, key)
            except AttributeError:
                warnings.warn('From version 0.24, get_params will raise an '
                              'AttributeError if a parameter cannot be '
                              'retrieved as an instance attribute. Previously '
                              'it would return None.',
                              FutureWarning)
                value = None
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out
if __name__ == '__main__':
    # ---- Data preparation ----
    np.random.seed(1943)
    torch.manual_seed(1943)
    cancers = datasets.load_breast_cancer()
    x = cancers['data']
    _x = x[:, :8]  # keep only the first 8 features (one cut point each below)
    y = cancers['target']
    cn_hat, mci_hat = ont_hot_code(y)
    y = np.vstack((cn_hat, mci_hat))
    _y = y.T  # (N, 2) one-hot targets
    feature_names = cancers['feature_names']
    # Bug fix: the original wrote `seed = random.seed(1990)`, but
    # random.seed() returns None, so random_state=None was passed below and
    # the intended seed 1990 was silently discarded.  Pass the integer.
    seed = 1990
    X_train, X_test, y_train, y_test = train_test_split(
        _x, _y, train_size=0.70, random_state=seed)
    d = X_train.shape[1]
    # One cut point per feature (comment fixed: this is the breast-cancer
    # dataset, not the iris petal features the original note referred to).
    num_cut = [1] * d
    num_class = 2
    epoch = 1000
    temperature = 0.1
    start_time = time.time()
    # ---- Build and train the DNDT model ----
    nndt = NNDT_Classifier(num_cut, num_class, epoch, temperature)
    nndt.fit(X_train, y_train)
    print("--- %s seconds ---" % (time.time() - start_time))
    # ---- Predict on the held-out set ----
    y_pred = nndt.predict(X_test)
    y_pred = np.argmax(y_pred.detach().numpy(), axis=1)
    print("====================================================================")
    print("NNDT: ", classification_report(y_test[:, 1], y_pred))
    # ---- Compare against a classical decision tree ----
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    start_time = time.time()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("--- %s seconds ---" % (time.time() - start_time))
    print("====================================================================")
    print("DT: ", classification_report(y_test, y_pred))
    # ---- Plot the test set in the first two feature dimensions ----
    plt.figure(figsize=(8, 8))
    plt.scatter(X_test[:, 0],
                X_test[:, 1],
                c=np.argmax(y_test, axis=1),
                marker='o',
                s=50,
                cmap='summer',
                edgecolors='black')
    plt.show()