"""决策树模型:让计算机自动构建逐层的 if-else 模型

优点:
- 可解释性
- 可提取到重要特征

缺点:
- 分类简单,解决不了复杂的问题

信息熵 参考: https://blog.csdn.net/weixin_39826984/article/details/111269019
基尼系数 参考: https://blog.csdn.net/weixin_41855010/article/details/110312523
"""
import numpy as np
from collections import Counter
from icecream import ic
from functools import lru_cache
# Probability helper (the original "信息熵"/entropy label here was wrong:
# this builds a frequency-lookup closure, not an entropy value).
def pr(es):
    """Return a lookup function mapping an element to its relative frequency in *es*.

    *es* is any sized iterable (list, tuple, str).  Elements never seen in
    *es* map to 0.0, because ``Counter`` returns 0 for missing keys.

    NOTE: the former ``@lru_cache(maxsize=2**10)`` decorator was removed —
    caching keys on the argument requires it to be hashable, so every call
    with a list (as done throughout this module) raised
    ``TypeError: unhashable type: 'list'``.
    """
    counter = Counter(es)
    total = len(es)  # hoisted: computed once, not on every lookup

    def _wrap(e):
        return counter[e] / total

    return _wrap
def entropy(elements):
    """Shannon entropy (in nats) of the value distribution in *elements*.

    Computes ``-sum(p_i * ln(p_i))`` over the distinct values.  Accepts any
    sized iterable of hashable items.  Implemented directly on ``Counter``
    so that plain lists work (the previous version routed through an
    ``lru_cache``-wrapped helper that raised ``TypeError`` on list input).
    """
    total = len(elements)
    counts = Counter(elements)
    # Every count is >= 1, so p > 0 and log(p) is always defined.
    probs = np.fromiter((c / total for c in counts.values()), dtype=float)
    return -np.sum(probs * np.log(probs))
# Gini impurity
def gini(elements):
    """Gini impurity ``1 - sum(p_i ** 2)`` over distinct values in *elements*.

    0.0 means the sample is pure (one distinct value); higher means more
    mixed.  Implemented directly on ``Counter`` so plain lists work (the
    previous version went through an ``lru_cache``-wrapped helper that
    raised ``TypeError`` on list input).
    """
    total = len(elements)
    counts = Counter(elements)
    probs = np.fromiter((c / total for c in counts.values()), dtype=float)
    return 1 - np.sum(probs ** 2)
# Impurity measure used by the demo below; swap in `entropy` to compare.
pure_func = gini

if __name__ == "__main__":
    # Demo: purer samples (fewer distinct values) should score lower.
    # Guarded so importing this module no longer triggers the printouts.
    ic(pure_func([1, 1, 1, 1, 1, 0]))
    ic(pure_func([1, 1, 1, 1, 1, 1]))
    ic(pure_func([1, 2, 3, 4, 5, 8]))
    ic(pure_func([1, 2, 3, 4, 5, 9]))
    ic(pure_func(['a', 'b', 'c', 'c', 'c', 'c', 'c']))
    ic(pure_func(['a', 'b', 'c', 'c', 'c', 'c', 'd']))