# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 19:01:40 2018

@author: markli

采用信息增益作为特征选择原则构建决策树
"""
from collections import Counter

import numpy as np
import pandas as pd
class DecisionTree(object):
    """ID3 decision tree: selects split features by information gain and
    builds the tree as a nested dict {feature_name: {feature_value: subtree}}.
    """

    def __init__(self, features):
        """
        features: names of the sample features, e.g. with two features A1, A2
        pass features=['A1', 'A2']. Order must match the feature columns
        of the training data.
        """
        self.features = features

    def fit(self, TrainData):
        """
        Build the tree from TrainData, an m*n 2-D array:
        m samples, n-1 feature columns, last column is the class label.
        Returns the nested-dict tree.
        """
        # Pass a copy: GetDicTree deletes entries from the feature list it
        # receives, and must not destroy self.features on the instance.
        return self.GetDicTree(TrainData, list(self.features))

    def SetEntropy(self, Data):
        """
        Empirical entropy H(D) of the data set, computed over the class
        labels in the last column of Data.
        """
        labels = Data[:, -1]
        N = len(labels)
        if N == 0:  # empty set carries no information
            return 0.0
        entropy = 0.0
        # Counter over the actual labels: every key is present, so no
        # KeyError handling is needed (the old except branch was dead code).
        for Ck in Counter(labels).values():
            p = Ck / N
            entropy -= p * np.log2(p)
        return entropy

    def ConditionEntropy(self, Data, index):
        """
        Empirical conditional entropy H(D|A) of the feature in column
        `index`: the weighted SUM over feature values a of
        |D_a|/|D| * H(D_a).  (The old code multiplied the terms together
        and negated the product, which is not the entropy formula.)
        """
        N = len(Data)
        con_entropy = 0.0
        for a in set(Data[:, index]):
            d = Data[np.where(Data[:, index] == a)]
            # Each a comes from the data itself, so d is never empty.
            con_entropy += (len(d) / N) * self.SetEntropy(d)
        return con_entropy

    def SelectBestFeature(self, Data):
        """
        Return the column index of the feature with the largest
        information gain H(D) - H(D|A).
        """
        entropy = self.SetEntropy(Data)  # empirical entropy of the set
        feature_num = len(Data[0]) - 1   # last column is the label
        gains = [entropy - self.ConditionEntropy(Data, i)
                 for i in range(feature_num)]
        return int(np.argmax(gains))

    def VoteClass(self, classlist):
        """
        Majority vote: return the most frequent class label in classlist.
        Used when the features are exhausted but the leaf is still impure.
        """
        # Count occurrences in the full list (the old code iterated over
        # set(classlist), so every count stayed 0 and max() returned the
        # lexicographically largest label, not the majority one).
        return Counter(classlist).most_common(1)[0][0]

    def GetDicTree(self, TrainData, features):
        """
        Recursively build the dict tree.
        TrainData: training set (features + label column);
        features: names of the remaining feature columns (mutated here).
        """
        classlabel = list(set(TrainData[:, -1]))
        if len(classlabel) == 1:
            # Pure node: all samples share one class.
            return classlabel[0]
        if len(TrainData[0]) == 1:
            # No features left, only the label column: majority vote.
            return self.VoteClass(TrainData[:, -1])

        bestfeature_index = self.SelectBestFeature(TrainData)
        bestfeature = features[bestfeature_index]  # best feature's name
        dictree = {bestfeature: {}}  # subtree rooted at the best feature
        del features[bestfeature_index]  # feature is consumed at this node

        # Split on each observed value of the best feature and recurse.
        for value in set(TrainData[:, bestfeature_index]):
            sub_features = features[:]  # each branch gets its own copy
            subdata = self.SplitData(TrainData, bestfeature_index, value)
            dictree[bestfeature][value] = self.GetDicTree(subdata, sub_features)

        return dictree

    def SplitData(self, Data, feature_index, feature_value):
        """
        Rows of Data whose `feature_index` column equals feature_value,
        with that column removed.
        """
        subdata = Data[np.where(Data[:, feature_index] == feature_value)]
        return np.delete(subdata, feature_index, axis=1)
132
133
134
135