import csv
import operator
from collections import Counter
from math import log, log2

import matplotlib.pyplot as plt
import numpy as np
def readDataset(filename):
    """Load a CSV data file and split it into sample rows and feature names.

    The first row is treated as the header. The first column of every row is
    dropped: header columns 1-8 become the feature names, and columns 1-9 of
    each data row become one sample. A sample therefore keeps one more column
    than there are feature names (presumably the class value, since the rest
    of the file reads the class from the last element of each sample).

    :param filename: path of the CSV file to read
    :return: ``(dataset, labels)`` - a list of samples (each a list of
        strings) and the list of feature names
    :raises StopIteration: if the file contains no header row
    """
    # newline="" is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed the same way on every platform.
    with open(filename, newline="") as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:9]
        # csv.reader yields [] for blank lines; skip them so that no empty
        # sample ends up in the dataset.
        dataset = [line[1:10] for line in reader if line]
    return dataset, labels
def infoEnt(dataset):
    """Compute the base-2 information entropy of a dataset's class column.

    The class of each sample is taken from its last element.

    :param dataset: sequence of samples; each sample is a sequence whose last
        item is the (hashable) class value
    :return: the entropy in bits, always as a float; ``0.0`` for an empty
        dataset
    """
    numdata = len(dataset)
    # Count how many samples fall into each class.
    class_counts = Counter(featVec[-1] for featVec in dataset)
    entropy = 0.0
    for count in class_counts.values():
        prop = count / numdata
        # log2 is documented as usually more accurate than log(x, 2).
        entropy -= prop * log2(prop)
    return entropy
def bestFeatureSplit(dataset):
'''
最优属性划分
:param dataset: 输入需要划分的数据集
:return: 返回最优划分属性的下标
'''
numFeature = len(dataset[0]) - 1
baseInfoEnt = infoEnt(dataset)
bestInfoGain = 0
bestFeature = -1
bestSplitPoint = None
continuous = False
for i in range(numFeature):
featList = [example[i] for example in dataset]
newEnt = 0
if all(c in "0123456789.-" for c in featList[0]): # 连续属性
continuous = True
featList.sort()
tempFeatList = [float(feat) for feat in featList] # 字符串转换成数字,用set(featList)会出现结果不稳定
mediumPoints = []
for index in range(len(tempFeatList) - 1):
mediumPoints.append((tempFeatList[index] + tempFeatList[index + 1]) / 2)
for point in mediumPo
西瓜书 课后习题4.3 基于信息熵决策树,连续和离散属性,并验证模型
最新推荐文章于 2022-09-22 21:43:31 发布
该博客介绍了如何使用信息熵构建决策树,特别讨论了处理连续和离散属性的方法。通过西瓜数据集进行实战,生成了一棵决策树,虽然结构与书中示例略有差异,但不影响其准确率。提供了数据集下载链接以及相关参考资源。
摘要由CSDN通过智能技术生成