决策树的Gini系数是一种衡量数据集纯度的指标,用于选择最佳的划分特征。下面是一个简单的示例代码,用于计算决策树节点的Gini系数:
```python
import numpy as np
def gini_index(groups, classes):
    """Compute the weighted Gini index of a candidate split.

    Parameters
    ----------
    groups : list[list]
        The partitions produced by the split. Each group is a list of
        rows, and a row's last element is its class label.
    classes : iterable
        The distinct class labels to evaluate.

    Returns
    -------
    float
        Weighted Gini impurity; 0.0 means every non-empty group is pure.
    """
    # Total row count across all groups, used to weight each group's impurity.
    total_samples = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # Skip empty groups to avoid division by zero.
        if size == 0:
            continue
        # Extract the labels once per group (the original rebuilt this list
        # for every class value).
        labels = [row[-1] for row in group]
        # Sum of squared class proportions within this group.
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of the total samples.
        gini += (1.0 - score) * (size / total_samples)
    return gini
# Example data set: two numeric features followed by a class label (0 or 1).
dataset = [[2.771244718, 1.784783929, 0],
           [1.728571309, 1.169761413, 0],
           [3.678319846, 2.81281357, 0],
           [3.961043357, 2.61995032, 0],
           [2.999208922, 2.209014212, 0],
           [7.497545867, 3.162953546, 1],
           [9.00220326, 3.339047188, 1],
           [7.444542326, 0.476683375, 1],
           [10.12493903, 3.234550982, 1],
           [6.642287351, 3.319983761, 1]]

# Candidate split: feature index and threshold value.
split_feature = 0
split_value = 6.642287351

# Partition the rows by the declared split instead of hard-coding duplicate
# copies of the dataset (the original never used split_feature/split_value).
# Rows strictly below the threshold go left; the rest, including ties, go
# right — this reproduces the original hand-written partition exactly.
left_group = [row for row in dataset if row[split_feature] < split_value]
right_group = [row for row in dataset if row[split_feature] >= split_value]

# Evaluate the split: a perfect class separation yields a Gini index of 0.0.
classes = np.unique([row[-1] for row in dataset])
gini = gini_index([left_group, right_group], classes)
print('Gini Index:', gini)
```