下面是一段示例代码，用于计算信息增益：
import numpy as np
def entropy(labels):
    """Compute the Shannon entropy (base 2) of a collection of class labels.

    Args:
        labels: Sequence or 1-D array of class labels. Any label type that
            NumPy can sort and compare (e.g. ints or strings) is accepted.

    Returns:
        The entropy in bits as a float. The result is 0.0 when there are
        fewer than two labels or when every label belongs to the same class.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0

    # Count only the classes that actually occur. Counting with np.bincount
    # would create zero-count bins for unused integer values (e.g. labels
    # [0, 2, 2]), and 0 * log2(0) evaluates to nan, poisoning the sum.
    _, counts = np.unique(labels, return_counts=True)
    if counts.size <= 1:
        return 0.0

    probs = counts / n_labels
    return float(-np.sum(probs * np.log2(probs)))
def partition(data, labels, attribute):
    """Group labels by the value each record holds for ``attribute``.

    Args:
        data: Iterable of records; each record is indexed with ``attribute``.
        labels: Iterable of labels, paired with ``data`` position by position.
        attribute: Index or key used to look up the splitting value in a record.

    Returns:
        A dict mapping each distinct attribute value to the list of labels of
        the records carrying that value, in the order they were encountered.
    """
    groups = {}
    for record, label in zip(data, labels):
        # setdefault creates the bucket the first time a value is seen.
        groups.setdefault(record[attribute], []).append(label)
    return groups
def information_gain(data, labels, attribute):
    """Compute the information gain obtained by splitting on ``attribute``.

    The gain is the entropy of ``labels`` minus the size-weighted entropy of
    the label subsets produced by ``partition(data, labels, attribute)``.

    Args:
        data: Iterable of records; each record is indexed with ``attribute``.
        labels: Sequence of class labels, paired with ``data`` by position.
        attribute: Index or key of the attribute to split on.

    Returns:
        The information gain of the split.
    """
    gain = entropy(labels)
    n_total = len(labels)
    subsets = partition(data, labels, attribute)
    # The loop variable must not be named `partition`: assigning to that name
    # anywhere in this function makes it local, so the call to the
    # module-level partition() above would raise UnboundLocalError.
    for subset_labels in subsets.values():
        gain -= len(subset_labels) / n_total * entropy(subset_labels)
    return gain