Implementing a Decision Tree in Python

This article shows how to implement a decision tree in Python. By computing information entropy, choosing the best splitting feature, and building the tree structure recursively, we construct a decision tree model that can classify new samples. The article also walks through saving the tree to disk and loading it back.

Task summary: we have n training samples, each of the form [attribute 1, attribute 2, …, attribute k, class], so the data form an n×(k+1) matrix.

From these n samples we grow a decision tree; when a new sample arrives, its k attribute values are fed through the tree to predict its class.

The model is a tree: leaf nodes hold class labels, and every internal node is a decision node that tests one attribute.

To pick the attribute to split on at each node, we compare, for every candidate attribute, the information entropy before and after the split, and choose the attribute with the largest drop in entropy, i.e. the largest information gain. A small worked example follows.
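The entropy of a label set is H = -Σ p_i · log2(p_i), where p_i is the fraction of samples in class i. Here is a minimal worked example; the three toy labels are made up purely for illustration:

import math

labels = ['yes', 'yes', 'no']   # toy labels, made up for illustration
probs = [labels.count(v) / len(labels) for v in set(labels)]
entropy = -sum(p * math.log(p, 2) for p in probs)
print(entropy)   # -(2/3)*log2(2/3) - (1/3)*log2(1/3) ≈ 0.918 bits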

trees.py

import math

import operator

# Compute the information entropy of a data set
# (the class label is the last column of each row)
def calcInformationEntropy(dataSet):
    numOfDataset = float(len(dataSet))
    labels = [onepice[-1] for onepice in dataSet]
    uniqueLabels = set(labels)
    entropy = 0.0
    for value in uniqueLabels:
        countValue = labels.count(value)
        prob = float(countValue) / numOfDataset
        entropy += -prob * math.log(prob, 2)
    return entropy
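As a quick sanity check (run interactively after defining the function; the toy rows are made up), two equally likely classes give exactly one bit of entropy:

print(calcInformationEntropy([[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]))  # 1.0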

# Select the rows whose featureIndex-th feature equals value,
# dropping that feature column from the returned rows
def splitDataSet(dataSet, featureIndex, value):
    subDataSet = []
    for line in dataSet:
        if line[featureIndex] == value:
            newline = line[:featureIndex]
            newline.extend(line[featureIndex+1:])
            subDataSet.append(newline)
    return subDataSet
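For example (same kind of made-up toy rows), splitting on feature 0 with value 1 keeps the matching rows and removes column 0:

print(splitDataSet([[1, 'yes'], [1, 'yes'], [0, 'no']], 0, 1))  # [['yes'], ['yes']]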

# Choose the best feature to split on, i.e. the one with the largest
# information gain (entropy before the split minus the weighted entropy after)
def chooseBestFeatureToSplit(dataSet):
    preEntropy = calcInformationEntropy(dataSet)
    numOfFeatures = len(dataSet[0]) - 1
    bestFeature = -1
    maxEntropyGain = 0.0
    for i in range(numOfFeatures):
        featureList = [example[i] for example in dataSet]
        uniqueFeatures = set(featureList)
        postEntropy = 0.0
        for value in uniqueFeatures:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = float(len(subDataSet)) / float(len(dataSet))
            postEntropy += prob * calcInformationEntropy(subDataSet)
        entropyGain = preEntropy - postEntropy
        if entropyGain > maxEntropyGain:
            maxEntropyGain = entropyGain
            bestFeature = i
    return bestFeature
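On a toy set where feature 0 alone determines the class (rows made up for illustration), the function picks feature 0: splitting on it yields two pure subsets with zero entropy, so its gain is maximal.

toySet = [[1, 1, 'yes'], [1, 0, 'yes'], [0, 1, 'no'], [0, 0, 'no']]
print(chooseBestFeatureToSplit(toySet))  # 0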

# When no features are left, decide the remaining samples by majority vote
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
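A one-line check:

print(majorityCnt(['yes', 'no', 'yes']))  # 'yes'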

# Build the tree recursively from the training samples
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # All samples share the same class: return a leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    features = [example[bestFeature] for example in dataSet]
    uniqueFeatures = set(features)
    curLabel = labels[bestFeature]
    dcTree = {}
    subLabels = labels[:]                # copy so the caller's list is untouched
    del subLabels[bestFeature]
    for value in uniqueFeatures:
        dcTree[value] = createTree(splitDataSet(dataSet, bestFeature, value), subLabels)
    myTree = {curLabel: dcTree}
    return myTree
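The returned tree is a nested dict, keyed first by attribute name and then by attribute value. On the toy set from above:

toySet = [[1, 1, 'yes'], [1, 0, 'yes'], [0, 1, 'no'], [0, 0, 'no']]
print(createTree(toySet, ['f0', 'f1']))  # {'f0': {0: 'no', 1: 'yes'}}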

# Recursive helper: follow the branch matching each attribute value until a leaf
def decision(Tree, inputFeature):
    firstNode = list(Tree.keys())[0]
    value = inputFeature[firstNode]
    if isinstance(Tree[firstNode][value], dict):
        return decision(Tree[firstNode][value], inputFeature)
    return Tree[firstNode][value]

# Given one input sample, predict its class
def prediction(Tree, inputFeatureVec, labelsVec):
    if len(labelsVec) != len(inputFeatureVec):
        return "error input"
    lenVec = len(labelsVec)
    inputDict = {}
    for i in range(lenVec):
        inputDict[labelsVec[i]] = inputFeatureVec[i]
    return decision(Tree, inputDict)
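Continuing the toy example, a sample with f0 = 1 is classified as 'yes':

toyTree = {'f0': {0: 'no', 1: 'yes'}}
print(prediction(toyTree, [1, 0], ['f0', 'f1']))  # 'yes'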

# Persist the tree to disk (pickle needs binary mode in Python 3)
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

# Load a stored tree back from disk
def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
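A round trip looks like this, continuing with toyTree from the prediction example (the file name is arbitrary):

storeTree(toyTree, 'myTree.pkl')
print(grabTree('myTree.pkl'))  # {'f0': {0: 'no', 1: 'yes'}}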

test.py

import trees
import pandas as pd

df = pd.read_csv("lenses.txt", header=None, sep='\t')
labels = ['age', 'prescript', 'astigmstic', 'tearRate']
dataSet = []
for i in range(len(df)):
    dataSet.append(list(df.loc[i][:]))

# Train on all rows except the last, then predict the class of the last row
myTree = trees.createTree(dataSet[:-1], labels)
result = trees.prediction(myTree, dataSet[-1][:-1], labels)
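For reference, lenses.txt here is the tab-separated contact-lenses data set used in Machine Learning in Action: four attribute columns (age, prescript, astigmatic, tearRate) followed by the class. A line looks roughly like the following; check it against your own copy of the file:

young	myope	no	reduced	no lenses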
