设置好属性字典
# Attribute dictionary: maps each discrete attribute name to the list of
# values it can take. Key order is the order written here.
D_keys = {
    '色泽': ['青绿', '乌黑', '浅白'],
    '根蒂': ['蜷缩', '硬挺', '稍蜷'],
    '敲声': ['清脆', '沉闷', '浊响'],
    '纹理': ['稍糊', '模糊', '清晰'],
    '脐部': ['凹陷', '稍凹', '平坦'],
    '触感': ['软粘', '硬滑'],
}
读取数据:
# Load the comma-separated data file into a 2-D array of strings.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used (same as before) -- confirm the file's encoding before pinning one.
x = []
# The context manager closes the file even if reading raises.
with open(r'C:\Users\dell\Desktop\xiguaji3.0.txt', 'r') as f:
    lines = f.readlines()
for line in lines:
    line = line.replace('\n', '')
    if not line:
        # Skip blank lines (e.g. a trailing empty line); a one-field row
        # would make the array ragged and break the column slicing below.
        continue
    x.append(line.split(","))
x = np.array(x)
titles = list(x[0, 1:-3])  # header cells of columns 1 .. -4 (attribute names)
x = np.delete(x, 0, axis=0)  # drop the first row (the header)
# Labels are taken AFTER the header row is removed so that the header cell
# of the last column is not mistaken for a sample label.
y = x[:, -1]
# Drop columns 0, 7 and 8 -- presumably the sample id and the two continuous
# attributes; confirm against the data file.
x = np.delete(x, [0, 7, 8], axis=1)
计算数据中的熵
# Base-2 logarithm with the convention log(0) == 0, so that terms of the
# form p * log(p) vanish when p is 0 instead of raising.
def log(x):
    """Return ``math.log(x, 2)``, or ``0`` when ``x`` equals zero."""
    if x == 0:
        return 0
    return math.log(x, 2)
# Compute, for every attribute in `title`, the entropy that remains after
# splitting the samples on that attribute.
def culEnt(x, title):
    """Group samples by attribute value and score each attribute.

    Args:
        x: 2-D array of samples; the last column is the label, where
            '好瓜' counts as positive and anything else as negative.
        title: dict mapping attribute name -> list of candidate values.
            The position of a key in ``title`` is used as the column index
            of that attribute in ``x``.

    Returns:
        A pair ``(indexnumber, Ent)``:
        ``indexnumber[attr][value]`` is the list of row indices whose
        ``attr`` column equals ``value``; ``Ent[attr]`` is the weighted sum
        of the binary entropies of those groups (weights are the group's
        share of all rows).
    """
    # Pass 1: bucket row indices by (attribute, value).
    indexnumber = {}
    for column, attribute in enumerate(title):
        indexnumber[attribute] = {
            value: [idx for idx, sample in enumerate(x) if sample[column] == value]
            for value in title[attribute]
        }

    # Pass 2: accumulate the weighted entropy of each attribute's buckets.
    Ent = {}
    for attribute, buckets in indexnumber.items():
        weighted_sum = 0.0
        for members in buckets.values():
            if len(members) == 0:
                # An empty bucket contributes nothing.
                continue
            positives = sum(1 for idx in members if x[idx][-1] == '好瓜')
            p_positive = float(positives / len(members))
            share = float(len(members) / (x.shape[0]))
            # `log` maps 0 to 0, so pure buckets contribute exactly zero.
            weighted_sum += share * (
                p_positive * log(p_positive)
                + (1 - p_positive) * log((1 - p_positive))
            )
        Ent[attribute] = -weighted_sum
    return indexnumber, Ent
输出的indexnumber包含title中每个属性的每个取值所对应的样本编号;Ent包含按每个属性划分后的条件熵(各取值子集的熵按样本占比加权求和),该值越小,对应属性的信息增益越大。
输出结果如下:
{'色泽': {'青绿': [0, 3, 5, 9, 12, 16], '乌黑': [1, 2, 6, 7, 8, 14], '浅白': [4, 10, 11, 13, 15]},
'根蒂': {'蜷缩': [0, 1, 2, 3, 4, 11, 15, 16], '硬挺': [9, 10], '稍蜷': [5, 6, 7, 8, 12, 13, 14]},
'敲声': {'清脆': [9, 10], '沉闷': [1, 3, 8, 13, 16], '浊响': [0, 2, 4, 5, 6, 7, 11, 12, 14, 15]},
'纹理': {'稍糊': [6, 8, 12, 13, 16], '模糊': [10, 11, 15], '清晰': [0, 1, 2, 3, 4, 5, 7, 9, 14]},
'脐部': {'凹陷': [0, 1, 2, 3, 4, 12, 13], '稍凹': [5, 6, 7, 8, 14, 16], '平坦': [9, 10, 11, 15]},
'触感': {'软粘': [5, 6, 9, 11, 14], '硬滑': [0, 1, 2, 3, 4, 7, 8, 10, 12, 13, 15, 16]}}
{'色泽': 0.88937738110375, '根蒂': 0.8548275868023224, '敲声': 0.8567211127541194,
'纹理': 0.6169106490008467, '脐部': 0.7083437635274363, '触感': 0.9914560571925497}
下面进行决策树递归编程,按照书上的算法:
def calculateShannonEnt(x, title, table):
    """Recursively print a decision tree built from the samples in ``x``.

    At each node the attribute with the smallest value returned by
    ``culEnt`` (the entropy remaining after the split, i.e. the largest
    information gain) is chosen, and the function recurses into each of
    its values.

    Args:
        x: 2-D NumPy array of strings, one row per sample. The last column
            is the label ("好瓜" is positive, anything else negative); the
            other columns are attribute values in the order of ``table``.
        title: dict mapping each remaining attribute name to its candidate
            values. Key order must match the column order of ``x`` because
            ``culEnt`` indexes columns by key position.
        table: list of the remaining attribute names, in column order.

    Returns:
        None. The tree is written to standard output.
    """
    yes = sum(1 for row in x if row[-1] == "好瓜")  # number of positive samples
    p_yes = float(yes / x.shape[0])
    # Leaf text used whenever a node must fall back to the majority class.
    majority_leaf = "leaf节点:好瓜" if p_yes >= 0.5 else "leaf节点:坏瓜"

    # All samples share one class: emit a leaf and stop recursing.
    if p_yes == 1:
        print("leaf节点:好瓜")
        return
    if p_yes == 0:
        print("leaf节点:坏瓜")
        return

    # True when every sample has the same value on every attribute. Only the
    # attribute columns are inspected: the label column is known to be mixed
    # here, so including it would make this check always fail.
    same_on_all_attributes = all(
        len(set(x[:, i])) == 1 for i in range(x.shape[1] - 1)
    )
    # Logical `or` is required: `len(title) == 0 | judge` would be parsed as
    # `len(title) == (0 | judge)` because `|` binds tighter than `==`.
    if len(title) == 0 or same_on_all_attributes:
        print(majority_leaf)
        return

    # indexnumber: row indices per attribute value; Ent: entropy remaining
    # after splitting on each attribute (smaller is better).
    indexnumber, Ent = culEnt(x, title)
    key_name = min(Ent, key=Ent.get)  # best attribute to split on
    print("节点:{0}:分支{1}".format(key_name, indexnumber[key_name]))

    # Attribute list/dict and column position are the same for every branch,
    # so they are computed once instead of once per branch.
    remaining_table = [name for name in table if name != key_name]
    remaining_title = {
        name: values for name, values in title.items() if name != key_name
    }
    split_column = table.index(key_name)

    for key, members in indexnumber[key_name].items():
        # The branch name is printed for every branch so that each leaf in
        # the output can be attributed to a value.
        print("分支:", key)
        if len(members) == 0:
            # No sample takes this value: label the branch with the parent's
            # majority class and move on. `continue` (not `return`) keeps the
            # remaining branches of this node from being dropped.
            print(majority_leaf)
            continue
        subset = np.delete(np.array(x[members]), split_column, axis=1)
        calculateShannonEnt(subset, remaining_title, remaining_table)
最后输出结果与书上一致
节点:纹理:分支{'稍糊': [6, 8, 12, 13, 16], '模糊': [10, 11, 15], '清晰': [0, 1, 2, 3, 4, 5, 7, 9, 14]}
分支: 稍糊
节点:触感:分支{'软粘': [0], '硬滑': [1, 2, 3, 4]}
分支: 软粘
leaf节点:好瓜
分支: 硬滑
leaf节点:坏瓜
分支: 模糊
leaf节点:坏瓜
分支: 清晰
节点:根蒂:分支{'蜷缩': [0, 1, 2, 3, 4], '硬挺': [7], '稍蜷': [5, 6, 8]}
分支: 蜷缩
leaf节点:好瓜
分支: 硬挺
leaf节点:坏瓜
分支: 稍蜷
节点:色泽:分支{'青绿': [0], '乌黑': [1, 2], '浅白': []}
分支: 青绿
leaf节点:好瓜
分支: 乌黑
节点:触感:分支{'软粘': [1], '硬滑': [0]}
分支: 软粘
leaf节点:坏瓜
分支: 硬滑
leaf节点:好瓜
leaf节点:好瓜 (对应“浅白”分支:该分支没有样本,按父节点中占多数的类别标记)