Decision Tree Algorithm

```
import numpy as np

def calcGini(y):
    """Compute the Gini impurity of a label array."""
    label_counts = {}
    for label in y:
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1

    gini = 1.0
    total_count = len(y)
    for label in label_counts:
        probability = float(label_counts[label]) / total_count
        gini -= probability ** 2  # Gini = 1 - sum(p_k^2)

    return gini
```
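A quick sanity check: for the five labels used in the example further down, the class frequencies are 2/5 and 3/5, so the impurity is 1 - (0.4^2 + 0.6^2) = 0.48.

```
print(calcGini([0, 0, 1, 1, 1]))  # 0.48
```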

 

```
def splitDataSet(data_X, data_Y, index, value):
    """Split the dataset into two parts on feature `index` at threshold `value`."""
    left_X, left_Y, right_X, right_Y = [], [], [], []
    for i in range(len(data_X)):
        if data_X[i][index] < value:  # below the threshold: left partition
            left_X.append(data_X[i])
            left_Y.append(data_Y[i])
        else:  # at or above the threshold: right partition
            right_X.append(data_X[i])
            right_Y.append(data_Y[i])
    return np.array(left_X), np.array(left_Y), np.array(right_X), np.array(right_Y)
```
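For instance, splitting the example dataset defined below on feature 0 at 5.5 sends the two class-0 samples left and the three class-1 samples right:

```
left_X, left_Y, right_X, right_Y = splitDataSet(data_X, data_Y, 0, 5.5)
print(left_Y, right_Y)  # [0 0] [1 1 1]
```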

 

```
def chooseBestFeature(data_X, data_Y):
    """Return the (feature index, threshold) pair with the lowest weighted Gini impurity."""
    m, n = data_X.shape
    bestFeature = -1
    bestFeaVal = -1
    minFeaGini = np.inf

    for i in range(n):  # iterate over every feature
        fea_cls = np.unique(data_X[:, i])  # all distinct values of this feature
        for j in fea_cls:  # try each value as a split threshold
            newEqDataX, newEqDataY, newNeqDataX, newNeqDataY = splitDataSet(data_X, data_Y, i, j)

            # weighted Gini impurity of the two partitions
            feaGini = 0
            if len(newEqDataY) > 0:
                feaGini += newEqDataY.size / m * calcGini(newEqDataY)
            if len(newNeqDataY) > 0:
                feaGini += newNeqDataY.size / m * calcGini(newNeqDataY)

            if feaGini < minFeaGini:
                bestFeature = i
                bestFeaVal = j
                minFeaGini = feaGini

    return bestFeature, bestFeaVal
```

 

```
# Example dataset: two numeric features, binary labels
data_X = np.array([
    [5.1, 3.5],
    [4.9, 3.0],
    [6.7, 3.1],
    [6.0, 3.0],
    [5.5, 2.5]
])
data_Y = np.array([0, 0, 1, 1, 1])

bestFeature, bestFeaVal = chooseBestFeature(data_X, data_Y)
print(f"Best Feature Index: {bestFeature}, Best Feature Value: {bestFeaVal}")
# Prints: Best Feature Index: 0, Best Feature Value: 5.5 (this split separates the classes perfectly)
```

 

//todo
Pre-pruning: prune by stopping tree construction early (for example, once a depth or node-size limit is reached).
Post-pruning: grow the full tree first, then prune branches back afterwards.
Purpose: avoid the overfitting that arises when a tree has so many branches that quirks of the training set are mistaken for general properties of all data.
Code outline: Gini coefficient, tree construction, the decision tree's classification function, pre-pruning, post-pruning. A sketch of the pieces not shown above follows, and a C4.5-style construction in Scala comes after it.
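The post stops at an outline here. As a minimal sketch of the missing pieces, the following reuses calcGini, splitDataSet, and chooseBestFeature from above to grow a CART-style tree with two simple pre-pruning rules (a depth limit and a minimum node size) and to classify a sample. The names buildTree, classify, majorityLabel, max_depth, and min_samples are illustrative assumptions, not from the original post.

```
from collections import Counter

def majorityLabel(y):
    """Most common label in a node; used wherever growth stops."""
    return Counter(list(y)).most_common(1)[0][0]

def buildTree(data_X, data_Y, max_depth=3, min_samples=2, depth=0):
    """Grow a CART-style tree; max_depth and min_samples are pre-pruning rules."""
    # Pre-pruning: stop on a pure node, at the depth limit, or on a tiny node.
    if calcGini(data_Y) == 0 or depth >= max_depth or len(data_Y) <= min_samples:
        return majorityLabel(data_Y)
    feature, value = chooseBestFeature(data_X, data_Y)
    left_X, left_Y, right_X, right_Y = splitDataSet(data_X, data_Y, feature, value)
    if len(left_Y) == 0 or len(right_Y) == 0:  # no split actually separates the node
        return majorityLabel(data_Y)
    return {
        "feature": feature,
        "value": value,
        "left": buildTree(left_X, left_Y, max_depth, min_samples, depth + 1),
        "right": buildTree(right_X, right_Y, max_depth, min_samples, depth + 1),
    }

def classify(tree, x):
    """The classification function: walk the tree until a leaf (a bare label)."""
    while isinstance(tree, dict):
        tree = tree["left"] if x[tree["feature"]] < tree["value"] else tree["right"]
    return tree

tree = buildTree(data_X, data_Y)
print(classify(tree, [5.0, 3.4]))  # falls on the left branch -> class 0
```

Post-pruning can then be sketched as a bottom-up pass over a grown tree: a split is collapsed into a leaf whenever the leaf misclassifies no more of the held-out samples reaching that node than the subtree does (reduced-error pruning). pruneTree is likewise an illustrative name, not from the original post.

```
def pruneTree(tree, data_X, data_Y):
    """Reduced-error post-pruning against held-out data reaching this node."""
    if not isinstance(tree, dict) or len(data_Y) == 0:
        return tree
    left_X, left_Y, right_X, right_Y = splitDataSet(data_X, data_Y, tree["feature"], tree["value"])
    tree["left"] = pruneTree(tree["left"], left_X, left_Y)
    tree["right"] = pruneTree(tree["right"], right_X, right_Y)
    subtree_errors = sum(classify(tree, x) != y for x, y in zip(data_X, data_Y))
    leaf = majorityLabel(data_Y)
    leaf_errors = sum(leaf != y for y in data_Y)
    return leaf if leaf_errors <= subtree_errors else tree  # collapse if no worse
```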

```
import scala.collection.immutable.ListMap
import scala.math.*

enum Node:
  case Leaf(value: Any)
  case Branch(label: String, children: ListMap[Any, Node])

// Note: despite the name, features are chosen by raw information gain
// (the ID3 criterion); C4.5 proper normalises by the gain ratio.
object C45DecisionTree:

  def createTree(dataSet: List[List[Any]], labels: List[String]): Node =
    val classList = dataSet.map(_.last)

    if classList.forall(_ == classList.head) then
      Node.Leaf(classList.head)            // pure node: stop
    else if dataSet.head.size == 1 then
      Node.Leaf(majorityCnt(classList))    // no features left: majority vote
    else
      chooseBestFeatureToSplit(dataSet) match
        case -1 => Node.Leaf(majorityCnt(classList))
        case bestFeat =>
          val bestFeatLabel = labels(bestFeat)
          val subLabels = labels.patch(bestFeat, Nil, 1)
          val children = dataSet
            .map(_(bestFeat))
            .distinct
            .map: value =>
              value -> createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
            .to(ListMap)
          Node.Branch(bestFeatLabel, children)

  private def majorityCnt(classList: List[Any]): Any =
    classList.groupMapReduce(identity)(_ => 1)(_ + _)
      .maxBy(_._2)
      ._1

  private def chooseBestFeatureToSplit(dataSet: List[List[Any]]): Int =
    if dataSet.isEmpty || dataSet.head.size <= 1 then -1
    else
      val baseEntropy = calculateEntropy(dataSet)
      (0 until (dataSet.head.size - 1))
        .map: i =>
          i -> (baseEntropy - calculateWeightedEntropy(dataSet, i)) // information gain
        .maxByOption(_._2)
        .map(_._1)
        .getOrElse(-1)

  private def calculateEntropy(dataSet: List[List[Any]]): Double =
    if dataSet.isEmpty then 0.0
    else
      val classList = dataSet.map(_.last)
      val total = classList.size.toDouble
      classList
        .groupMapReduce(identity)(_ => 1.0 / total)(_ + _) // class probabilities
        .values
        .map: p =>
          -p * log2(p)
        .sum

  private def calculateWeightedEntropy(dataSet: List[List[Any]], featureIndex: Int): Double =
    if dataSet.isEmpty then 0.0
    else
      val total = dataSet.size.toDouble
      dataSet
        .groupBy(_(featureIndex))
        .view
        .mapValues: subset =>
          val p = subset.size.toDouble / total
          p * calculateEntropy(subset.map(_.patch(featureIndex, Nil, 1)))
        .values
        .sum

  private def splitDataSet(dataSet: List[List[Any]], featureIndex: Int, value: Any): List[List[Any]] =
    dataSet
      .filter(_(featureIndex) == value)
      .map(_.patch(featureIndex, Nil, 1))

  private def log2(x: Double): Double = log(x) / log(2.0) // scala.math has no log2
```
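A short usage sketch for the Scala version; the weather-style dataset and column names are made up for illustration (the last element of each row is the class label):

```
@main def demo(): Unit =
  val dataSet = List(
    List("sunny", "hot", "no"),
    List("sunny", "mild", "yes"),
    List("rainy", "hot", "no"),
    List("rainy", "mild", "no")
  )
  val labels = List("outlook", "temperature") // names for the two feature columns
  println(C45DecisionTree.createTree(dataSet, labels))
```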

 
