from math import log
from collections import Counter

# Toy weather / "play" dataset. It is tiny, so it is hard-coded here
# instead of being loaded from a file.
data = [
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE', 'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'TRUE', 'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'yes'},
    {'outlook': 'sunny', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE', 'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
]


def calc_entropy(data):
    """Return the Shannon entropy (base 2) of the 'play' labels in *data*.

    An empty dataset has entropy 0.0 (guards the division below).
    """
    num_instances = len(data)
    if num_instances == 0:
        return 0.0
    label_counts = Counter(instance['play'] for instance in data)
    entropy = 0.0
    for count in label_counts.values():
        prob = count / num_instances
        entropy -= prob * log(prob, 2)
    return entropy


def split_data(data, attribute):
    """Partition *data* into a dict mapping each value of *attribute*
    to the list of instances that have that value.
    """
    # NOTE: the original used a local variable named `split_data`,
    # shadowing this function; renamed to `groups`.
    groups = {}
    for instance in data:
        groups.setdefault(instance[attribute], []).append(instance)
    return groups


def choose_attribute(data, attributes):
    """Return the attribute with the highest information gain on *data*,
    or None if no attribute yields a strictly positive gain.
    """
    base_entropy = calc_entropy(data)
    best_info_gain = 0.0
    best_attribute = None
    for attribute in attributes:
        groups = split_data(data, attribute)
        # Entropy after the split, weighted by subset size.
        weighted_entropy = sum(
            (len(subset) / len(data)) * calc_entropy(subset)
            for subset in groups.values()
        )
        info_gain = base_entropy - weighted_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_attribute = attribute
    return best_attribute


def create_decision_tree(data, attributes):
    """Recursively build an ID3 decision tree over *data*.

    Leaf nodes:     {'attribute': None, 'class': <label>}
    Internal nodes: {'attribute': <name>, 'children': {...}, 'class': <majority label>}

    Every node carries the majority 'class' of its data so that predict()
    has a sound fallback when it meets an attribute value that was never
    seen during training (the original crashed with KeyError there).
    Returns None for an empty dataset.
    """
    if len(data) == 0:
        return None
    counts = Counter(instance['play'] for instance in data)
    # most_common is stable, so ties resolve to the first-seen label,
    # matching the original strictly-greater scan.
    majority_label = counts.most_common(1)[0][0]
    # Pure node, or no attributes left, or no attribute helps -> leaf.
    best_attribute = None
    if len(counts) > 1 and attributes:
        best_attribute = choose_attribute(data, attributes)
    if best_attribute is None:
        return {'attribute': None, 'class': majority_label}
    remaining = [a for a in attributes if a != best_attribute]
    children = {
        value: create_decision_tree(subset, remaining)
        for value, subset in split_data(data, best_attribute).items()
    }
    return {'attribute': best_attribute, 'children': children, 'class': majority_label}


def predict(decision_tree, test_data):
    """Return the list of predicted 'play' labels for *test_data*.

    If an instance has an attribute value with no matching branch,
    fall back to the majority class stored at the current node
    (BUG FIX: the original indexed node['children'][None], which always
    raised KeyError because None is never a child key).
    """
    predictions = []
    for instance in test_data:
        node = decision_tree
        while node['attribute'] is not None:
            child = node['children'].get(instance[node['attribute']])
            if child is None:
                break  # unseen value: use this node's majority class
            node = child
        predictions.append(node['class'])
    return predictions


def evaluate(predictions, expected):
    """Return the fraction of *predictions* that match *expected*."""
    correct = sum(1 for p, e in zip(predictions, expected) if p == e)
    return correct / len(predictions)


def main():
    # sklearn is only needed for the train/test split demo; imported
    # lazily so the functions above are usable without it installed.
    from sklearn.model_selection import train_test_split

    attributes = ['outlook', 'temperature', 'humidity', 'windy']

    # Train and evaluate on the full dataset (resubstitution accuracy).
    decision_tree = create_decision_tree(data, attributes)
    test_data = data[:]
    predictions = predict(decision_tree, test_data)
    accuracy = evaluate(predictions, [instance['play'] for instance in test_data])
    print(decision_tree)
    print(predictions)
    print(accuracy)

    # Hold out one third of the data for testing.
    train_data, test_data1 = train_test_split(data, test_size=0.33, random_state=0)
    decision_tree1 = create_decision_tree(train_data, attributes)
    predictions1 = predict(decision_tree1, test_data1)
    # BUG FIX: labels must come from test_data1 (the held-out split);
    # the original compared against the full dataset's labels.
    accuracy1 = evaluate(predictions1, [instance['play'] for instance in test_data1])
    print(decision_tree1)
    print(predictions1)
    print(accuracy1)


if __name__ == "__main__":
    main()