基于信息增益的决策树算法的个人简单实现
数据如下(保存为 data.py,供下文代码通过 `from data import Mytreedata` 导入):
class Mytreedata:
    """Container for the watermelon sample table used by the decision tree.

    ``datalist`` maps a sample id to a row of nine strings: six categorical
    attribute values, two numeric values kept as text, and the class label
    in the last position.
    """

    datalist = {
        1: ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.697', '0.460', '是'],
        2: ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '0.774', '0.376', '是'],
        3: ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.634', '0.264', '是'],
        4: ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '0.608', '0.318', '是'],
        5: ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '0.556', '0.215', '是'],
        6: ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '0.403', '0.237', '是'],
        7: ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '0.481', '0.149', '是'],
        8: ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '0.437', '0.211', '是'],
        9: ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '0.666', '0.091', '否'],
        10: ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '0.243', '0.267', '否'],
        11: ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '0.245', '0.057', '否'],
        12: ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '0.343', '0.099', '否'],
        13: ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '0.639', '0.161', '否'],
        14: ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '0.657', '0.198', '否'],
        15: ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '0.360', '0.370', '否'],
        16: ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '0.593', '0.042', '否'],
        17: ['青绿', '蜷缩', '沉闷', '模糊', '稍凹', '硬滑', '0.719', '0.103', '否'],
    }
注意点:
1、算法递归返回的三个条件
(1)当前结点包含的样本全属于同一类别,无需划分
(2)当前属性集为空,或是所有样本在所有属性上取值相同,无法划分
(3)当前结点包含的样本集为空,不能划分
2、算法打印树的部分比较乱,可能需要个人改进
以下是代码部分
from data import Mytreedata
from collections import Counter
import math
# Sample table: sample id -> row (list of strings) indexed through Atrr_index.
alldata = Mytreedata().datalist
# Root node of the tree; 'nodes' holds the ids of every sample in the table.
raw_D = {}
raw_D['nodes'] = list(alldata.keys())
# Column position of each attribute, and of the class label, inside a row.
Atrr_index = {'色泽': 0, '根蒂': 1, '敲声': 2, '纹理': 3, '脐部': 4, '触感': 5, '密度': 6, '含糖率': 7, '标签': 8}
# Attributes offered to the tree as split candidates (the label is excluded).
raw_A = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率']
# Attributes that select_attr parses with float() and splits on a threshold.
# NOTE(review): "lisan" reads like "discrete", yet these are the numeric
# attributes; the name is kept because other functions reference it.
lisan_A = ['密度', '含糖率']
def D_onelabel(D):
    """Check whether every sample in ``D`` carries the same class label.

    Args:
        D: Node dict whose ``'nodes'`` entry is an iterable of sample ids
            (keys of the module-level ``alldata`` table).

    Returns:
        ``(True, label)`` when all samples share a single label, where
        ``label`` is that label string. ``(False, None)`` otherwise,
        including when ``D`` contains no samples.
    """
    label_column = Atrr_index['标签']
    labels = {alldata[sample_id][label_column] for sample_id in D['nodes']}
    if len(labels) == 1:
        # Return the label itself, not the one-element set, so callers can
        # store it directly as a leaf label.
        return True, next(iter(labels))
    # Mixed (or empty) nodes have no single label to report.
    return False, None
def D_A_onevalue(D, A):
    """Check whether the samples in ``D`` are indistinguishable on ``A``.

    Args:
        D: Node dict whose ``'nodes'`` entry is an iterable of sample ids
            (keys of the module-level ``alldata`` table).
        A: List of attribute names (keys of ``Atrr_index``) to compare.

    Returns:
        ``(False, None)`` if at least one attribute in ``A`` takes more than
        one value among the samples. Otherwise ``(True, label)``, where
        ``label`` is ``'是'`` when it strictly outnumbers ``'否'`` among the
        samples and ``'否'`` in every other case (ties included). An empty
        ``A`` counts as indistinguishable.
    """
    sample_ids = list(D['nodes'])
    for atrr in A:
        column = Atrr_index[atrr]
        distinct_values = {alldata[sample_id][column] for sample_id in sample_ids}
        if len(distinct_values) > 1:
            # A single varying attribute is enough to allow a split.
            return False, None

    label_column = Atrr_index['标签']
    label_counts = Counter(alldata[sample_id][label_column] for sample_id in sample_ids)
    majority = '是' if label_counts['是'] > label_counts['否'] else '否'
    return True, majority
def calculate_entropy(data):
    """Return the Shannon entropy (base 2) of the values in ``data``.

    Args:
        data: Sized iterable of hashable values, e.g. a list of labels.

    Returns:
        The entropy in bits as a float; ``0.0`` for empty input or when all
        values are identical.
    """
    total_count = len(data)
    entropy = 0.0
    # One counting pass instead of a separate data.count() scan per value.
    for value_count in Counter(data).values():
        probability = value_count / total_count
        entropy -= probability * math.log2(probability)
    return entropy
def select_attr(D, A):
    """Choose the attribute in ``A`` whose split of ``D`` maximises information gain.

    Attributes listed in the module-level ``lisan_A`` are parsed as floats
    and split in two at the midpoint between each pair of neighbouring
    distinct values; every other attribute is split into one subset per
    distinct value.

    Args:
        D: Node dict whose ``'nodes'`` entry is an iterable of sample ids
            (keys of the module-level ``alldata`` table).
        A: List of candidate attribute names (keys of ``Atrr_index``).

    Returns:
        A dict with the keys:
            ``'selectA'``: name of the winning attribute, or ``None`` if no
                candidate split exists.
            ``'select_index'``: position of that attribute in ``A``, or ``None``.
            ``'max_gain'``: information gain of the winning split
                (``-inf`` if no candidate split exists).
            ``'real_sub'``: list of sample-id lists, one per branch.
            ``'real_bz_list'``: branch descriptions aligned with ``'real_sub'``
                (``attr==value``, ``attr<=t`` or ``attr>t``).
        On equal gain the earliest candidate wins.
    """
    label_column = Atrr_index['标签']
    sample_ids = list(D['nodes'])

    def partition_entropy(subsets):
        # Label entropy of each subset, weighted by the subset's share of samples.
        total = sum(len(subset) for subset in subsets)
        return sum(
            len(subset) / total
            * calculate_entropy([alldata[sample_id][label_column] for sample_id in subset])
            for subset in subsets
        )

    ent_D = calculate_entropy([alldata[sample_id][label_column] for sample_id in sample_ids])

    selectA = None
    select_index = None
    max_gain = float('-inf')
    real_sub = []
    real_bz_list = []

    for attr_index, attr in enumerate(A):
        column = Atrr_index[attr]
        # Each candidate is a (subsets, branch descriptions) pair.
        candidates = []
        if attr in lisan_A:
            # Sort numerically: sorting the raw strings would order '10'
            # before '5' and skip valid thresholds.
            numeric_values = sorted({float(alldata[sample_id][column]) for sample_id in sample_ids})
            for lower, upper in zip(numeric_values, numeric_values[1:]):
                binary_value = (lower + upper) / 2
                at_or_below = []
                above = []
                for sample_id in sample_ids:
                    if float(alldata[sample_id][column]) <= binary_value:
                        at_or_below.append(sample_id)
                    else:
                        above.append(sample_id)
                candidates.append((
                    [at_or_below, above],
                    [attr + '<=' + str(binary_value), attr + '>' + str(binary_value)],
                ))
        else:
            # dict.fromkeys keeps first-appearance order, so the branch order
            # does not depend on string hashing.
            distinct_values = list(dict.fromkeys(alldata[sample_id][column] for sample_id in sample_ids))
            subsets = [
                [sample_id for sample_id in sample_ids if alldata[sample_id][column] == value]
                for value in distinct_values
            ]
            candidates.append((subsets, [attr + '==' + value for value in distinct_values]))

        for subsets, descriptions in candidates:
            gain = ent_D - partition_entropy(subsets)
            if gain > max_gain:
                max_gain = gain
                selectA = attr
                select_index = attr_index
                real_sub = subsets
                real_bz_list = descriptions

    return {'selectA': selectA, 'select_index': select_index, 'max_gain': max_gain, 'real_sub': real_sub,
            'real_bz_list': real_bz_list}
def TreeGenerate(D, A):
    """Recursively grow a decision tree for the samples in ``D``.

    ``D`` is annotated in place and returned: a leaf gets its ``'label'``
    set, an internal node gets one child dict per non-empty branch appended
    to its ``'subnode'`` list. Every finished node is printed.

    Recursion stops when
      1. all samples in ``D`` share one label, or
      2. ``A`` is empty or the samples agree on every attribute in ``A``, or
      3. no split could be produced for ``D``.
    Empty branches are skipped rather than turned into children.

    Args:
        D: Node dict with at least a ``'nodes'`` entry (list of sample ids).
        A: List of attribute names still available for splitting.

    Returns:
        The same dict object as ``D``, now describing the subtree.
    """
    node = D

    is_pure, pure_label = D_onelabel(D)
    if is_pure:
        node['label'] = pure_label
        print(node)
        return node

    cannot_split, majority_label = D_A_onevalue(D, A)
    if len(A) == 0 or cannot_split:
        node['label'] = majority_label
        print(node)
        return node

    result_dict = select_attr(D, A)
    select_index = result_dict['select_index']
    real_sub = result_dict['real_sub']
    real_bz_list = result_dict['real_bz_list']

    if len(real_sub) == 0:
        # No split was produced: make this a majority-vote leaf, using the
        # same tie rule as D_A_onevalue, instead of leaving the label unset.
        label_column = Atrr_index['标签']
        label_counts = Counter(alldata[sample_id][label_column] for sample_id in D['nodes'])
        node['label'] = '是' if label_counts['是'] > label_counts['否'] else '否'
        print(node)
        return node

    assert len(real_sub) == len(real_bz_list), "子集和子集标签数目相等"

    # The chosen attribute is removed for the whole subtree below this node.
    sub_A = A[:select_index] + A[select_index + 1:]
    children = node.setdefault('subnode', [])
    for itemlist, real_bz in zip(real_sub, real_bz_list):
        if len(itemlist) == 0:
            continue
        # Build a fresh dict for every branch: sharing one dict would make
        # all children alias the same object, each showing the last branch.
        sub_D = {
            'nodes': itemlist,
            'zhibiao': real_bz,
            'label': 'null',
            'subnode': [],
        }
        children.append(TreeGenerate(sub_D, sub_A))

    print(node)
    return node
def main():
    """Prepare the root node in ``raw_D``, grow the tree over ``raw_A`` and print it."""
    # Same three fields, in the same insertion order, as separate assignments.
    raw_D.update(label='tree', subnode=[], zhibiao=None)
    root = TreeGenerate(raw_D, raw_A)
    print(root)
# Build and print the tree as soon as this file is executed or imported.
main()