决策树实现

1 决策树自编程实现

import numpy as np
import pandas as pd
import math
import time
from collections import namedtuple
 
class Node(namedtuple("Node", "children type content feature label")):
    """A decision-tree node.

    Fields:
        children: list of child Nodes (None for a leaf).
        type:     the parent-feature value that leads to this node.
        content:  (X, y) data subset stored at a leaf (None for internal nodes).
        feature:  feature name this internal node splits on (None for a leaf).
        label:    class label at a leaf (None for internal nodes).
    """
    def __repr__(self):
        # Render as a plain tuple for compact printing.
        return tuple(self).__repr__()

class DecisionTree():
    """ID3/C4.5-style decision tree for categorical features.

    Expects pandas DataFrames: X_train with named feature columns and
    y_train as a single-column DataFrame of class labels.

    method: "info_gain"       — select features by information gain (ID3);
            "info_gain_ratio" — by information-gain ratio (C4.5, default).
    """
    def __init__(self, method="info_gain_ratio"):
        self.tree = None
        self.method = method

    def _experience_entropy(self, X):
        """Empirical entropy (base 2) of the first column of DataFrame X."""
        # Relative frequency of each distinct value.
        probs = X.iloc[:, 0].value_counts() / X.shape[0]
        return sum(-p * math.log(p, 2) for p in probs)

    def _conditional_entropy(self, X_train, y_train, feature):
        """Empirical conditional entropy H(y | feature)."""
        # Count and frequency of each value the feature takes.
        value_counts = X_train[feature].value_counts()
        value_probs = value_counts / X_train.shape[0]
        # Entropy of y restricted to each value of the feature.
        entropies = [
            self._experience_entropy(y_train[(X_train[feature] == v).values])
            for v in value_counts.index
        ]
        # Weighted sum over the feature's value distribution.
        return value_probs.mul(entropies).sum()

    def _information_gain(self, X_train, y_train, feature):
        """Information gain g(D, feature) = H(D) - H(D | feature)."""
        return (self._experience_entropy(y_train)
                - self._conditional_entropy(X_train, y_train, feature))

    def _information_gain_ratio(self, X_train, y_train, features, feature):
        """Information-gain ratio: gain divided by the feature's own entropy.

        Selects the feature column by name rather than by its position in
        `features` — the original indexed X_train.iloc with the position in
        the (shrinking) feature list, which points at the wrong column once
        recursion has removed earlier features while X_train keeps all
        columns.
        """
        feature_entropy = self._experience_entropy(X_train[[feature]])
        return self._information_gain(X_train, y_train, feature) / feature_entropy

    def _choose_feature(self, X_train, y_train, features):
        """Return the candidate feature with the highest score under self.method."""
        if self.method == "info_gain_ratio":
            scores = [self._information_gain_ratio(X_train, y_train, features, f)
                      for f in features]
        elif self.method == "info_gain":
            scores = [self._information_gain(X_train, y_train, f) for f in features]
        else:
            raise TypeError("unknown method: {!r}".format(self.method))
        return features[np.argmax(scores)]

    def _built_tree(self, X_train, y_train, features, type=None):
        """Recursively build the tree; return the root Node of the subtree."""
        # Stop when only one feature remains or y is already pure;
        # label the leaf with the majority class.
        if len(features) == 1 or len(np.unique(y_train)) == 1:
            label = y_train.iloc[:, 0].value_counts().index[0]
            return Node(children=None, type=type, content=(X_train, y_train),
                        feature=None, label=label)
        # Pick the best split feature for this subset.
        feature = self._choose_feature(X_train, y_train, features)
        # Give every child its own reduced copy of the feature list.
        # The original mutated the shared list with features.remove(...),
        # leaking one subtree's removals into its siblings' candidate sets.
        remaining = [f for f in features if f != feature]
        children = []
        for value in np.unique(X_train[feature]):
            mask = (X_train[feature] == value).values
            children.append(
                self._built_tree(X_train[mask], y_train[mask], remaining,
                                 type=value))
        return Node(children=children, type=type, content=None,
                    feature=feature, label=None)

    def _prune(self):
        """Post-pruning (not implemented)."""
        pass

    def fit(self, X_train, y_train, features):
        """Train on DataFrames X_train / y_train with the given feature names."""
        self.tree = self._built_tree(X_train, y_train, features)
        # self.tree = self._prune(tree)  # pruning hook, not implemented

    def _search(self, X_new):
        """Walk the tree following X_new's feature values; return the label."""
        node = self.tree
        # Descend while there are children; stop at a leaf and read its label.
        while node.children:
            value = X_new[node.feature].iloc[0]
            for child in node.children:
                if value == child.type:
                    node = child
                    break
            else:
                # No branch carries this value: it never appeared in the
                # training data. The original spun forever here; fail loudly.
                raise ValueError("unseen value {!r} for feature {!r}".format(
                    value, node.feature))
        return node.label

    def predict(self, X_new):
        """Predict the class label for a single-row DataFrame X_new."""
        return self._search(X_new)
 
def main():
    """Train the hand-rolled DecisionTree on the loan dataset and predict."""
    start = time.time()
    # Training data: the loan-application dataset from Li Hang,
    # "Statistical Learning Methods", Table 5.1. The original post's
    # single-character values (是/否/好) were stripped to "" when the page
    # was scraped; they are restored here from the canonical table.
    features = ["年龄", "有工作", "有自己的房子", "信贷情况"]
    X_train = np.array([
        ["青年", "否", "否", "一般"],
        ["青年", "否", "否", "好"],
        ["青年", "是", "否", "好"],
        ["青年", "是", "是", "一般"],
        ["青年", "否", "否", "一般"],
        ["中年", "否", "否", "一般"],
        ["中年", "否", "否", "好"],
        ["中年", "是", "是", "好"],
        ["中年", "否", "是", "非常好"],
        ["中年", "否", "是", "非常好"],
        ["老年", "否", "是", "非常好"],
        ["老年", "否", "是", "好"],
        ["老年", "是", "否", "好"],
        ["老年", "是", "否", "非常好"],
        ["老年", "否", "否", "一般"],
    ])
    y_train = np.array(["否", "否", "是", "是", "否",
                        "否", "否", "是", "是", "是",
                        "是", "是", "是", "是", "否"])
    # Convert to pandas DataFrames, which DecisionTree expects.
    X_train = pd.DataFrame(X_train, columns=features)
    y_train = pd.DataFrame(y_train)
    # Train (pass a copy: fit historically mutated the feature list).
    clf = DecisionTree(method="info_gain")
    clf.fit(X_train, y_train, features.copy())
    # Predict a new applicant.
    X_new = np.array([["青年", "否", "否", "一般"]])
    X_new = pd.DataFrame(X_new, columns=features)
    y_predict = clf.predict(X_new)
    print(y_predict)
    print("time:{:.4f}s".format(time.time() - start))

if __name__=="__main__":
    main()

 

2 调用sklearn实现

 

from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import numpy as np
import pandas as pd
import time
 
from IPython.display import Image
from sklearn import tree
import pydotplus
 
def show(clf, features, y_types):
    """Export the fitted tree *clf* as a PNG via graphviz/pydotplus.

    features: column names used for node labels; y_types: class names.
    Writes the image to 'DT_show.png' in the working directory.
    """
    dot_source = tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=features,
        class_names=y_types,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_source)
    # Image(graph.create_png()) displays inline in Jupyter but not in
    # PyCharm, so write a PNG file instead.
    graph.write_png(r'DT_show.png')
 
def main():
    """Fit sklearn's DecisionTreeClassifier on the loan dataset and predict."""
    start = time.time()
    # Loan-application dataset (Li Hang, "Statistical Learning Methods",
    # Table 5.1). The single-character values (是/否/好) were stripped to ""
    # when the page was scraped; restored here from the canonical table.
    features = ["age", "work", "house", "credit"]
    X_train = pd.DataFrame([
        ["青年", "否", "否", "一般"],
        ["青年", "否", "否", "好"],
        ["青年", "是", "否", "好"],
        ["青年", "是", "是", "一般"],
        ["青年", "否", "否", "一般"],
        ["中年", "否", "否", "一般"],
        ["中年", "否", "否", "好"],
        ["中年", "是", "是", "好"],
        ["中年", "否", "是", "非常好"],
        ["中年", "否", "是", "非常好"],
        ["老年", "否", "是", "非常好"],
        ["老年", "否", "是", "好"],
        ["老年", "是", "否", "好"],
        ["老年", "是", "否", "非常好"],
        ["老年", "否", "否", "一般"],
    ])
    y_train = pd.DataFrame(["否", "否", "是", "是", "否",
                            "否", "否", "是", "是", "是",
                            "是", "是", "是", "是", "否"])
    # Preprocessing: one shared LabelEncoder over every categorical value
    # in X, and a separate one for the class labels.
    le_x = preprocessing.LabelEncoder()
    le_x.fit(np.unique(X_train))
    X_train = X_train.apply(le_x.transform)
    print(X_train)
    le_y = preprocessing.LabelEncoder()
    le_y.fit(np.unique(y_train))
    y_train = y_train.apply(le_y.transform)
    # Build and train the sklearn decision tree on the encoded data.
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    # Visualization (writes DT_show.png).
    show(clf, features, [str(k) for k in np.unique(y_train)])
    # Predict a new applicant (must be encoded with the same encoder).
    X_new = pd.DataFrame([["青年", "否", "否", "一般"]])
    X = X_new.apply(le_x.transform)
    y_predict = clf.predict(X)
    # Pretty-print the input next to the decoded prediction.
    X_show = [{features[i]: X_new.values[0][i]} for i in range(len(features))]
    print("{0}被分类为:{1}".format(X_show, le_y.inverse_transform(y_predict)))
    print("time:{:.4f}s".format(time.time() - start))

if __name__=="__main__":
    main()

 

参考:

[1] 《统计学习方法》李航

[2] 深度之眼统计学习方法集训营课后练习(http://www.deepshare.net/)

转载于:https://www.cnblogs.com/WJZheng/p/11298866.html

决策树实现可以使用Java编程语言。在Java中,有许多开源的机器学习库可以帮助实现决策树模型,例如Weka、Apache Spark MLlib等(J48是Weka中的一种决策树算法实现)。这些库提供了丰富的功能和API,可以用于构建、训练和评估决策树模型。 其中,Weka是一个非常流行的机器学习库,它提供了各种算法和工具,包括决策树算法。使用Weka,你可以通过加载数据集、选择合适的属性和设置参数来构建决策树模型。然后,你可以使用训练集对模型进行训练,并使用测试集对模型进行评估。最后,你可以使用训练好的模型进行预测。 以下是一个使用Weka库实现决策树的简单示例代码: ```java import weka.core.Instances; import weka.core.converters.ConverterUtils.DataSource; import weka.classifiers.trees.J48; public class DecisionTreeExample { public static void main(String[] args) throws Exception { // 加载数据集 DataSource source = new DataSource("path/to/dataset.arff"); Instances data = source.getDataSet(); // 设置类别属性 data.setClassIndex(data.numAttributes() - 1); // 构建决策树模型 J48 tree = new J48(); tree.buildClassifier(data); // 输出决策树模型 System.out.println(tree); } } ``` 在上述代码中,你需要将"path/to/dataset.arff"替换为你的数据集文件的路径。然后,你可以使用J48类构建决策树模型,并使用buildClassifier方法对数据进行训练。最后,你可以使用System.out.println方法输出决策树模型的信息。 请注意,这只是一个简单的示例,实际的决策树实现可能需要更多的步骤和参数设置。你可以根据具体的需求和数据集进行调整和扩展。 引用: [ID3算法决策树(java实现)](https://blog.csdn.net/kiround/article/details/127018844)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值