好的,以下是用Python实现决策树算法的基本步骤:
1. 准备数据集:构建一个包含目标变量和自变量的数据集。
2. 划分数据集:根据某些规则将数据集划分成更小的子集,直到所有的数据都属于同一类别。
3. 计算熵:计算划分后的子集的熵,根据信息增益选择最优的划分特征。
4. 递归构建决策树:根据选择的最优特征递归构建决策树,直到所有的数据都属于同一类别或者可用的划分特征已全部用完。
5. 预测新数据:使用构建好的决策树预测新数据的类别。
代码实现:
```
import math
def create_dataset():
    """Return the toy fish-identification dataset and its feature names.

    Each record is [no surfacing, flippers, class label]; the label
    ('yes'/'no') is always the last element.
    """
    records = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return records, feature_names
def calc_shannon_ent(dataset):
    """Return the Shannon entropy of the class labels in *dataset*.

    The class label is taken to be the last element of each record.
    Entropy is H = -sum(p * log2(p)) over all class probabilities p.
    """
    total = len(dataset)
    tally = {}
    for record in dataset:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    return -sum(
        (count / float(total)) * math.log(count / float(total), 2)
        for count in tally.values()
    )
def split_dataset(dataset, axis, value):
    """Return the records whose feature at index *axis* equals *value*.

    The matched feature column itself is removed from each returned
    record, so the result is ready for the next level of splitting.
    """
    return [
        record[:axis] + record[axis + 1:]
        for record in dataset
        if record[axis] == value
    ]
def choose_best_feature_to_split(dataset):
    """Return the feature index with the highest information gain.

    Returns -1 when no feature yields a strictly positive gain.
    Assumes the last element of each record is the class label.
    """
    feature_count = len(dataset[0]) - 1
    base_entropy = calc_shannon_ent(dataset)
    best_gain, best_index = 0.0, -1
    for idx in range(feature_count):
        # Weighted entropy of the partition induced by feature `idx`.
        weighted_entropy = 0.0
        for val in {record[idx] for record in dataset}:
            subset = split_dataset(dataset, idx, val)
            weight = len(subset) / float(len(dataset))
            weighted_entropy += weight * calc_shannon_ent(subset)
        gain = base_entropy - weighted_entropy
        if gain > best_gain:
            best_gain, best_index = gain, idx
    return best_index
def majority_cnt(class_list):
    """Return the most frequent label in *class_list*.

    Ties are broken in favour of the label seen first, matching the
    behaviour of a stable descending sort on counts.
    """
    tally = {}
    for label in class_list:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key reaching the maximum count, which is
    # the first-inserted (first-seen) label among the tied ones.
    return max(tally, key=tally.get)
def create_tree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    The tree has the form {feature_label: {feature_value: subtree_or_class}}.
    Recursion stops when a node is pure (single class) or when no
    features remain, in which case the majority class is returned.

    Parameters:
        dataset: list of records; last element of each is the class label.
        labels:  feature names aligned with the feature columns.

    Fix: the original deleted entries from the caller's *labels* list
    (``del labels[best_feat]``), so after building a tree the caller's
    list no longer matched the tree and a later ``classify`` call using
    the same list failed. We now work on a private copy.
    """
    labels = labels[:]  # defensive copy: never mutate the caller's list
    class_list = [example[-1] for example in dataset]
    # Pure node: every record has the same class.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # No features left (only the class column remains): vote.
    if len(dataset[0]) == 1:
        return majority_cnt(class_list)
    best_feat = choose_best_feature_to_split(dataset)
    best_feat_label = labels[best_feat]
    my_tree = {best_feat_label: {}}
    del labels[best_feat]  # safe: operates on our local copy
    for value in {example[best_feat] for example in dataset}:
        my_tree[best_feat_label][value] = create_tree(
            split_dataset(dataset, best_feat, value), labels
        )
    return my_tree
def classify(input_tree, feat_labels, test_vec):
    """Predict the class of *test_vec* by walking *input_tree*.

    Parameters:
        input_tree:  nested dict {feature_label: {value: subtree_or_class}}.
        feat_labels: feature names, used to map a tree node to the index
                     of the corresponding entry in *test_vec*.
        test_vec:    feature values for the sample to classify.
    """
    root_label = next(iter(input_tree))
    branches = input_tree[root_label]
    branch = branches[test_vec[feat_labels.index(root_label)]]
    if isinstance(branch, dict):
        # Internal node: keep descending.
        return classify(branch, feat_labels, test_vec)
    # Leaf node: the stored value is the class label.
    return branch
if __name__ == '__main__':
    dataset, labels = create_dataset()
    # Pass a copy: the original create_tree deletes entries from the list
    # it receives, which would leave `labels` out of sync with the tree
    # and make the classify() call below raise a ValueError.
    my_tree = create_tree(dataset, labels[:])
    print(my_tree)
    print(classify(my_tree, labels, [1, 1]))
```
以上就是基本的决策树算法的Python实现,代码中的 create_dataset 函数用于创建数据集,calc_shannon_ent 函数用于计算熵,split_dataset 函数用于划分数据集,choose_best_feature_to_split 函数用于选择最优的划分特征,create_tree 函数用于构建决策树,classify 函数用于预测新数据的类别。