模块化决策树函数的python实现
标准的决策树不适合处理连续型数据,CART可以处理连续型数据,但是前提假设决策树是二叉树,也就是说不适合处理特征取值3个及以上的离散数据生成多叉树。本文代码实现的决策树结合两者的优点,既能处理离散型和连续型数据;且生成的决策树可以同时存在二叉和多叉;同时支持基于信息熵的信息增益、信息增益比和基尼指数的特征划分函数的选择。
为了在递归生成决策树时提高代码复用,本文对选定的最优连续型特征,根据最佳切分点进行了二值化处理,使之成为取值为0和1的离散特征。为了代码的紧凑性,本文也考虑了一些细节,如对连续型特征不支持信息增益比的划分选择,因为连续型数据根据最佳划分点划分后就两个子集,也就是说离散后该特征就两种取值。而信息增益比是为了解决信息增益在划分特征时,存在偏向于选择取值较多的特征的问题。
虽然如此,但毕竟本人第一次写模块化的函数,无论从对原理的理解,还是代码书写的规范性和紧凑性方面都还存在很多不足,欢迎各位大佬拍砖,毕竟有讨论才能有思想的火花,也是促进自己进步的一种方式吧。
from numpy import *
from math import log
import pandas as pd
def MajorClass(datasets):
    """Return the majority class label found in the last column of *datasets*."""
    counts = {}
    for label in datasets[:, -1]:
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    # Ascending stable sort by count; the final entry is the majority class
    # (ties resolved the same way as the original sort-based selection).
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    return ranked[-1][0]
def Gini(datasets):
    """Compute the Gini impurity of the class labels (last column) of *datasets*.

    Gini(D) = 1 - sum_k p_k^2, where p_k is the relative frequency of class k.
    Returns 0.0 for a pure node.
    """
    label_counts = {}
    for label in datasets[:, -1]:
        label_counts[label] = label_counts.get(label, 0) + 1
    total = float(len(datasets))
    # Fix: the original bound a local named `Gini`, shadowing this function.
    impurity = 1.0
    for count in label_counts.values():
        prob = count / total
        impurity -= prob * prob
    return impurity
def CalcShannonent(datasets):
    """Shannon entropy (base 2) of the class labels in the last column."""
    counts = {}
    for cls in datasets[:, -1]:
        counts[cls] = counts.get(cls, 0) + 1
    total = float(len(datasets))
    entropy = 0.0
    # H(D) = -sum_k p_k * log2(p_k)
    for cnt in counts.values():
        p = cnt / total
        entropy -= p * log(p, 2)
    return entropy
def FeatValueEntropy(datasets, featlist):
    """Entropy of a feature's value distribution (gain-ratio denominator).

    H_A(D) = -sum_v (|D_v|/|D|) * log2(|D_v|/|D|), where *featlist* holds the
    feature's value for every sample in *datasets*.
    """
    value_counts = {}
    for value in featlist:
        value_counts[value] = value_counts.get(value, 0) + 1
    total = float(len(datasets))
    entropy = 0.0
    for count in value_counts.values():
        # Bug fix: `count` is already an int; the original called len() on it,
        # which raised TypeError on every invocation.
        prob = count / total
        entropy -= prob * log(prob, 2)
    return entropy
def SplitScatteredDataset(datasets, featindex, value):
    """Select the samples whose discrete feature *featindex* equals *value*,
    removing that feature's column from each selected row.

    Returns a list of reduced feature vectors (plain lists).  Fix: rows of a
    numpy array (e.g. df.values) have no `.extend`, so the original crashed on
    the very data the script loads; building the reduced row with
    list(...) + list(...) accepts both lists of lists and numpy arrays.
    """
    retdataset = []
    for featvec in datasets:
        if featvec[featindex] == value:
            reduced = list(featvec[:featindex]) + list(featvec[featindex + 1:])
            retdataset.append(reduced)
    return retdataset
def SplitContinuousDataset(datasets, featindex, value):
    """Binary-split samples on continuous feature *featindex* at threshold
    *value*, removing that feature's column from each row.

    Returns (subdataset0, subdataset1): rows with feature < value, and the
    rest.  The column-removal logic is hoisted out of the two branches (the
    original duplicated it verbatim), and list(...) + list(...) lets numpy
    rows (which lack `.extend`) work as well as lists of lists.
    """
    subdataset0, subdataset1 = [], []
    for featvec in datasets:
        reduced = list(featvec[:featindex]) + list(featvec[featindex + 1:])
        if featvec[featindex] < value:
            subdataset0.append(reduced)
        else:
            subdataset1.append(reduced)
    return subdataset0, subdataset1
def ChooseBestSplit(datasets, errtype, feature, gainratio=False):
    """Choose the best feature (and, for a continuous feature, the best
    threshold) to split *datasets* on.

    Parameters
    ----------
    datasets : 2-D array-like; the last column holds class labels.
        NOTE(review): rows must support item assignment for the in-place
        binarisation below — numpy rows do, tuples would not.
    errtype : impurity function — CalcShannonent (maximise information gain)
        or Gini (minimise weighted impurity).
    feature : list of feature names; a chosen continuous feature's name is
        rewritten in place to "name<threshold".
    gainratio : if True, divide a discrete feature's information gain by its
        value entropy (C4.5 gain ratio).  Not applied to continuous features,
        which are always binarised into two values.

    Returns the column index of the best feature.  When that feature is
    continuous its column is binarised in place (1 if value < threshold
    else 0) so the caller can treat it as discrete.
    """
    n = shape(datasets)[1]
    # Bug fix: the original compared `errtype == CalcShannonent()`, *calling*
    # the function with no arguments (TypeError).  Compare the object itself.
    use_entropy = errtype == CalcShannonent
    if use_entropy:
        baseinfo = errtype(datasets)
        bestinfo = 0.0   # best information gain so far (maximise)
    else:
        bestinfo = inf   # best weighted Gini so far (minimise)
    bestindex = 0
    bestvalue = 0.0
    continuous = 0
    total = float(len(datasets))
    for featindex in range(n - 1):
        featlist = [instance[featindex] for instance in datasets]
        if type(featlist[0]).__name__ == 'float':
            # Continuous feature: candidate thresholds are midpoints of
            # consecutive sorted values.
            sortfeatlist = sorted(featlist)
            splitlist = [(sortfeatlist[j] + sortfeatlist[j + 1]) / 2.0
                         for j in range(len(sortfeatlist) - 1)]
            for value in splitlist:
                subdataset0, subdataset1 = \
                    SplitContinuousDataset(datasets, featindex, value)
                # Bug fix: the original reset newinfo only once per feature in
                # the entropy branch, accumulating impurity across thresholds;
                # it must be recomputed per candidate threshold.
                newinfo = (len(subdataset0) / total) * errtype(subdataset0) \
                    + (len(subdataset1) / total) * errtype(subdataset1)
                if use_entropy:
                    gaininfo = baseinfo - newinfo
                    if gaininfo > bestinfo:
                        bestinfo = gaininfo
                        bestindex = featindex
                        bestvalue = value
                        continuous = 1
                elif newinfo < bestinfo:
                    bestinfo = newinfo
                    bestindex = featindex
                    bestvalue = value
                    continuous = 1
        else:
            # Discrete feature: weighted impurity over each value's subset.
            newinfo = 0.0
            for value in set(featlist):
                subdataset = SplitScatteredDataset(datasets, featindex, value)
                # Bug fix: the original added prob * errtype(datasets) — the
                # impurity of the WHOLE set — instead of the subset's.
                newinfo += (len(subdataset) / total) * errtype(subdataset)
            if use_entropy:
                gaininfo = baseinfo - newinfo
                if gainratio:
                    # Gain ratio penalises features with many values.
                    gaininfo = gaininfo / FeatValueEntropy(datasets, featlist)
                if gaininfo > bestinfo:
                    bestinfo = gaininfo
                    bestindex = featindex
                    continuous = 0   # bug fix: original typo `continues = 0`
            elif newinfo < bestinfo:
                bestinfo = newinfo
                bestindex = featindex
                continuous = 0
    if continuous:
        # Binarise the winning continuous column in place so the tree builder
        # can recurse on it like a two-valued discrete feature.
        feature[bestindex] = feature[bestindex] + '<' + str(bestvalue)
        for instance in datasets:
            instance[bestindex] = 1 if instance[bestindex] < bestvalue else 0
    return bestindex
def CreatTree(datasets, featurelist, errtype, regression=False):
    """Recursively build a decision tree as nested dicts.

    Parameters
    ----------
    datasets : 2-D array-like; last column holds class labels.
    featurelist : mutable list of feature names; the chosen feature is
        deleted in place at each level (pass a copy if the caller needs it).
    errtype : impurity function handed through to ChooseBestSplit.
    regression : reserved flag, forwarded unchanged to recursive calls.

    Returns either a class label (leaf) or {feature_name: {value: subtree}}.
    """
    # Use per-row indexing so both numpy arrays (first call) and the plain
    # lists of lists produced by SplitScatteredDataset (recursive calls)
    # are accepted; `datasets[:, -1]` broke on the latter.
    classlist = [row[-1] for row in datasets]
    if len(set(classlist)) == 1:
        # Pure node: every sample has the same class.
        return classlist[0]
    if len(featurelist) == 1:
        # No features left to split on: majority vote.
        return MajorClass(datasets)
    bestfeatindex = ChooseBestSplit(datasets, errtype, featurelist)
    bestfeatname = featurelist[bestfeatindex]
    myTree = {bestfeatname: {}}
    # Read the feature values AFTER ChooseBestSplit, which may have
    # binarised a continuous column in place.
    featvalue = [instance[bestfeatindex] for instance in datasets]
    del featurelist[bestfeatindex]
    for value in set(featvalue):
        subfeaturelist = featurelist[:]
        subdatasets = SplitScatteredDataset(datasets, bestfeatindex, value)
        # Bug fix: the original recursed via `myTree(...)`, i.e. calling the
        # dict (TypeError), and dropped errtype's companion flag; recurse
        # through CreatTree and forward `regression`.
        myTree[bestfeatname][value] = CreatTree(subdatasets, subfeaturelist,
                                                errtype, regression)
    return myTree
# Script entry: load the training table (placeholder path); the last column
# is assumed to be the class label.
df = pd.read_csv('xxx/xxx.txt')
datasets = df.values        # sample matrix handed to CreatTree
featlist = list(df.columns)  # feature names, including the label column