决策树(离散数据)的python实现
1.计算熵和信息增益率等,来确定按特征进行分类的顺序
2.利用多重字典一层一层建立决策树
熵:Ent(D) = −∑ pᵢ·log₂(pᵢ)(以 2 为底,与下面实现中的 log2 一致)
条件熵:条件熵 H(X|Y) 相当于联合熵 H(X,Y) 减去单独的熵 H(Y),即:H(X|Y) = H(X,Y) − H(Y)
信息增益率:信息增益/分裂信息度量
1.创建特征和分类数据
## 矩阵、列表皆可
# Sample training set: 5 samples, 2 numeric features each.
# (numpy matrices would work just as well as nested lists here.)
dataSet = [
    [1.0, 2.1],
    [2.0, 1.1],
    [1.3, 1.0],
    [1.0, 1.0],
    [2.0, 1.0],
]
# Class label of each sample, aligned with the rows above.
classSet = ["a", "a", "b", "b", "a"]
2.计算信息熵
from numpy import *
def entropy(dataSet, classSet):
    """Shannon entropy (base 2) of the class distribution.

    Parameters
    ----------
    dataSet : list-of-lists or numpy matrix
        Feature rows; only its row count (the number of samples) is used.
    classSet : list or 1xN matrix of labels
        Class label of each sample.

    Returns
    -------
    float
        Ent(D) = -sum(p_i * log2(p_i)) over the distinct labels.
    """
    # Route both inputs through mat() so plain lists, arrays and matrices
    # are all accepted; the labels end up as one flat python list.
    labels = asarray(mat(classSet)).tolist()[0]
    # len() of a matrix is its row count, i.e. the number of SAMPLES
    # (the original comment calling this the "feature count" was wrong).
    num_samples = len(mat(dataSet))
    # Frequency of each label, e.g. {'a': 3, 'b': 2}.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    # Accumulate -p*log2(p); only labels that occur appear in counts,
    # so p is never 0 and log2 is safe.
    ent = 0.0
    for cnt in counts.values():
        p = cnt / float(num_samples)
        ent -= p * log2(p)
    return ent
# Demo: entropy of the sample labels {a: 3, b: 2}.
entropy(dataSet,classSet)
## -(2/5)*log2(2/5)-(3/5)*log2(3/5)=0.9709505944546686
熵为:0.9709505944546686
3.计算条件熵
def splitDataSet(dataSet, classSet, axis, value):
    """Sub-dataset whose feature `axis` equals `value`, with that column removed.

    The features and labels are merged through a numpy matrix, which casts
    every cell to a string — so cells are compared via str(). Each returned
    row is [remaining features..., label], all as strings.

    e.g. [[1,2,'a'],[1,3,'a'],[2,3,'c']] split on axis 0, value 1
    yields [['2','a'],['3','a']].
    """
    # Glue the label row under the transposed feature columns, then
    # transpose back so each row reads [feat0, feat1, ..., label].
    columns = asarray(mat(dataSet).T).tolist()
    labels = asarray(mat(classSet)).tolist()[0]
    columns.append(list(labels))
    rows = asarray(mat(columns).T).tolist()
    target = str(value)
    subset = []
    for row in rows:
        if str(row[axis]) == target:
            # Keep everything except the matched feature column.
            subset.append(row[:axis] + row[axis + 1:])
    return subset
def conditional_entropy(dataSet, classSet):
    """Per-feature conditional entropy H(class | feature) and split information.

    Parameters
    ----------
    dataSet : list-of-lists or numpy matrix of feature rows.
    classSet : list or 1xN matrix of class labels.

    Returns
    -------
    (conditional_entlist, entropylist) : two lists, one entry per feature:
        * conditional_entlist[i] = sum_v P(feature_i = v) * Ent(D_v)
        * entropylist[i] = split information, -sum_v P(v) * log2(P(v))
    """
    data = mat(asarray(dataSet).tolist())
    num_samples = float(len(data))
    conditional_entlist = []
    entropylist = []
    # Walk the features column by column (transpose -> one list per feature).
    # NOTE: the original used `np.asarray` here, but only `from numpy import *`
    # is in scope, so `np` was an undefined name — use the bare `asarray`.
    for i, feature_values in enumerate(asarray(data.T).tolist()):
        unique_vals = set(feature_values)  # distinct values of this feature
        split_info = 0.0
        cond_ent = 0.0
        for value in unique_vals:
            # Samples whose i-th feature equals `value`, with that column removed;
            # the label survives as the subset's last column.
            subset = splitDataSet(data, classSet, i, value)
            prob = len(subset) / num_samples
            cond_ent += prob * entropy(mat(subset)[:, :-1], mat(subset)[:, -1].T)
            split_info += -prob * log2(prob)
        conditional_entlist.append(cond_ent)
        entropylist.append(split_info)
    return conditional_entlist, entropylist
# Demo: per-feature conditional entropy and split information.
conditional_entropy(dataSet,classSet)
## conditional entropy: 2/5*(-(1/2)*log2(1/2)-(1/2)*log2(1/2))+2/5*(-(1)*log2(1)-(0)*log2(0))+1/5*(-(1)*log2(1)-(0)*log2(0))=0.4
## split information: -(2/5)*log2(2/5)-(2/5)*log2(2/5)-(1/5)*log2(1/5)=1.5219280948873621
条件熵、分裂信息度量为:([0.4, 0.5509775004326937], [1.5219280948873621, 1.3709505944546687])
4.计算信息增益和信息增益率
def information_gain(dataSet, classSet):
    """Pick the feature with the highest information-gain ratio (C4.5 style).

    Prints the per-feature information gains and gain ratios, then returns
    the 0-based index of the best feature. Features whose split information
    is 0 (a single constant value) are skipped to avoid division by zero.
    """
    base_entropy = entropy(dataSet, classSet)
    conditional_entlist, entropylist = conditional_entropy(dataSet, classSet)
    # Gain(i) = Ent(D) - H(D | feature_i); asarray lets numpy broadcast
    # the scalar subtraction over the whole list at once.
    gains = base_entropy - asarray(conditional_entlist)
    print("信息增益:", gains)
    print("信息增益率:")
    best_feature = 0  # default so we never return an unbound name
    max_gain_ratio = 0
    for i, split_info in enumerate(entropylist):
        if split_info == 0:  # constant feature: ratio undefined, skip
            continue
        gain_ratio = gains[i] / split_info
        print(gain_ratio)
        # Keep the feature with the largest gain ratio seen so far.
        if gain_ratio > max_gain_ratio:
            max_gain_ratio = gain_ratio
            best_feature = i
    return best_feature
# Demo: index of the feature with the best gain ratio (also prints the gains).
information_gain(dataSet,classSet)
信息增益:[0.57095059 0.41997309]
信息增益率:0.37514952012034747 、 0.30633714717416943
返回的信息增益率最好的特征是第0个
5.创建树(多重字典)
def createTree(dataSet, feat_name):
    """Recursively build a decision tree of nested dicts.

    `dataSet` carries the feature columns with the class label appended as
    the last column (cells become strings via the numpy-matrix round trip);
    `feat_name` lists the remaining feature names and is consumed (mutated)
    as features are used up. A leaf is a class label; exhausting every
    feature yields the sentinel string "End of the classification".
    Example result: {'t1': {'1.0': {'t2': {'1.0': 'b', '2.1': 'a'}},
                            '1.3': 'b', '2.0': 'a'}}
    """
    rows = asarray(mat(dataSet)).tolist()
    class_list = [row[-1] for row in rows]
    # Every sample shares one label: this branch is a pure leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Nothing left to split on.
    if len(rows[0]) == 0:
        return "End of the classification"
    # Feature with the best information-gain ratio (features vs. last column).
    best = information_gain(mat(rows)[:, :-1], mat(rows)[:, -1].T)
    best_name = feat_name[best]
    tree = {best_name: {}}
    del feat_name[best]  # this feature is consumed at this level
    # One subtree per distinct value of the chosen feature,
    # e.g. values = {'1.0', '1.3', '2.0'}.
    values = set(row[best] for row in rows)
    for value in values:
        remaining = feat_name[:]  # each branch works on its own copy
        subset = splitDataSet(mat(rows)[:, :-1], mat(rows)[:, -1].T, best, value)
        tree[best_name][value] = createTree(subset, remaining)
    return tree
# Stack the label column onto the feature matrix and build the tree.
c = hstack((mat(dataSet),mat(classSet).T)) ## merge feature and class matrices
o=createTree(c,["t1","t2"]) ## decision-tree dict
# NOTE(review): `drawtree` is a project-local plotting module not shown in
# this file — presumably it renders the nested-dict tree; confirm it exists.
drawtree.createPlot(o) ## draw the decision tree with the custom drawtree module
决策树如下:
6.利用树进行分类
def treeclass(dm_tree, t_data, feat_name):
    """Classify one sample by walking a nested-dict decision tree.

    Parameters
    ----------
    dm_tree : dict
        Tree as built by createTree, e.g.
        {'t1': {'1.0': {'t2': {'1.0': 'b', '2.1': 'a'}}, '2.0': 'a'}};
        branch keys are the *string* form of the feature values.
    t_data : list
        The sample's feature values (mutated during descent).
    feat_name : list
        Feature names aligned with t_data (also mutated).

    Returns
    -------
    The leaf label, or None when no branch matches the sample.
    """
    for i in range(len(t_data)):
        # Only one feature is tested at each tree level; skip the others
        # (the original used a bare `except: pass` for this — too broad).
        node = dm_tree.get(feat_name[i])
        if node is None:
            continue
        key = str(t_data[i])
        if key not in node:
            continue
        child = node[key]
        if isinstance(child, dict):  # was: str(type(...)) == "<class 'dict'>"
            # Internal node: consume this feature and recurse on the subtree.
            # The original discarded the recursive result; return it.
            t_data.pop(i)
            feat_name.pop(i)
            return treeclass(child, t_data, feat_name)
        return child  # leaf label reached
    return None