# Exhaustive (brute-force) search for the best decision-tree split:
# try every feature and every distinct value of that feature as a
# threshold, and keep the split that minimizes the weighted average
# entropy of the two resulting child nodes.
#
# Relies on names defined elsewhere in this file:
#   X_train (numpy 2-D array), y_train (labels), get_entrepy(labels)
#   (entropy helper — its name is misspelled where it is defined, so the
#   spelling is kept here).
#
# best_split records the winning feature index, threshold value, and the
# entropy achieved; entropy starts at +inf so any real split improves it.
best_split = {"feature_idx": 0, "feature_value": 0, "entropy": np.inf}

# Generalized: iterate over every feature column instead of hard-coding 4.
for feature_idx in range(X_train.shape[1]):
    # set() deduplicates the candidate thresholds, reducing the work done
    # per feature.
    for value in set(X_train[:, feature_idx]):
        # Boolean masks partitioning the samples at this threshold.
        left_mask = X_train[:, feature_idx] <= value
        right_mask = X_train[:, feature_idx] > value

        # Child sizes: sum the boolean masks, NOT len() — both masks
        # always have the full length of X_train.
        n_left = left_mask.sum()
        n_right = right_mask.sum()

        # Entropy of each child's label distribution.
        entropy_left = get_entrepy(y_train[left_mask])
        entropy_right = get_entrepy(y_train[right_mask])

        # Weighted average of the child entropies (weights = child sizes).
        entropy = (n_left * entropy_left + n_right * entropy_right) / (n_left + n_right)

        # Keep the split with the lowest weighted entropy seen so far.
        if entropy < best_split["entropy"]:
            best_split["feature_idx"] = feature_idx
            best_split["feature_value"] = value
            best_split["entropy"] = entropy
# Study notes - Bayes
# First published 2022-10-11 23:59:41