```python
import numpy as np

def calcGini(y):
    """Compute the Gini impurity of a label array."""
    label_counts = {}
    for label in y:
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1
    gini = 1.0
    total_count = len(y)
    for label in label_counts:
        probability = float(label_counts[label]) / total_count
        gini -= probability ** 2  # Gini = 1 - sum(p_k^2)
    return gini
```
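As a quick sanity check (an added example, not part of the original listing): a label array with class proportions 2/5 and 3/5 gives 1 - (0.4² + 0.6²) = 0.48.

```python
print(calcGini(np.array([0, 0, 1, 1, 1])))  # ≈ 0.48
```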
```python
def splitDataSet(data_X, data_Y, index, value):
    """Binary split: rows whose feature `index` is < `value` go left, the rest go right."""
    left_X, left_Y, right_X, right_Y = [], [], [], []
    for i in range(len(data_X)):
        if data_X[i][index] < value:
            left_X.append(data_X[i])
            left_Y.append(data_Y[i])
        else:
            right_X.append(data_X[i])
            right_Y.append(data_Y[i])
    return np.array(left_X), np.array(left_Y), np.array(right_X), np.array(right_Y)
```
```python
def chooseBestFeature(data_X, data_Y):
    """Try every (feature, threshold) pair and return the split with the lowest weighted Gini."""
    m, n = data_X.shape
    bestFeature = -1
    bestFeaVal = -1
    minFeaGini = np.inf
    for i in range(n):  # iterate over all features
        fea_cls = np.unique(data_X[:, i])  # candidate thresholds: the distinct values of this feature
        for j in fea_cls:  # iterate over all candidate thresholds
            left_X, left_Y, right_X, right_Y = splitDataSet(data_X, data_Y, i, j)  # binary split
            feaGini = 0.0  # weighted Gini index of this split
            if len(left_Y) > 0:
                feaGini += left_Y.size / m * calcGini(left_Y)
            if len(right_Y) > 0:
                feaGini += right_Y.size / m * calcGini(right_Y)
            if feaGini < minFeaGini:
                bestFeature = i
                bestFeaVal = j
                minFeaGini = feaGini
    return bestFeature, bestFeaVal
```
```python
# Example dataset: five samples with two numeric features
data_X = np.array([
    [5.1, 3.5],
    [4.9, 3.0],
    [6.7, 3.1],
    [6.0, 3.0],
    [5.5, 2.5]
])
data_Y = np.array([0, 0, 1, 1, 1])

bestFeature, bestFeaVal = chooseBestFeature(data_X, data_Y)
print(f"Best Feature Index: {bestFeature}, Best Feature Value: {bestFeaVal}")
# -> Best Feature Index: 0, Best Feature Value: 5.5
```
// TODO
- Pre-pruning: stop growing the tree early, pruning while it is being built (see the sketch right after this list).
- Post-pruning: prune the fully grown tree afterwards (see the sketch after the Scala block below).
- Purpose: avoid the overfitting that arises when too many branches treat quirks of the training set as general properties of all data.
- Code to cover:
  - Gini index
  - Tree construction
  - Classification function for the decision tree
  - Pre-pruning
  - Post-pruning
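A minimal pre-pruning sketch, reusing majorityClass from above and assuming the same dict-based nodes: the usual early-stop criteria (maximum depth, minimum node size, minimum impurity decrease) are checked before each split. The parameter names max_depth, min_samples, and min_gain are illustrative choices, not from the original text.

```python
def createTreePrePruned(data_X, data_Y, max_depth=3, min_samples=2, min_gain=1e-7, depth=0):
    """CART growth with pre-pruning: stop early when a node is too deep,
    too small, or the best split barely reduces the Gini impurity."""
    if len(np.unique(data_Y)) == 1:
        return data_Y[0]
    # Early-stop checks (pre-pruning): depth and node-size limits.
    if depth >= max_depth or len(data_Y) < min_samples:
        return majorityClass(data_Y)
    bestFeature, bestFeaVal = chooseBestFeature(data_X, data_Y)
    left_X, left_Y, right_X, right_Y = splitDataSet(data_X, data_Y, bestFeature, bestFeaVal)
    if len(left_Y) == 0 or len(right_Y) == 0:
        return majorityClass(data_Y)
    # Impurity-decrease check: keep the split only if it reduces Gini enough.
    m = len(data_Y)
    weighted = len(left_Y) / m * calcGini(left_Y) + len(right_Y) / m * calcGini(right_Y)
    if calcGini(data_Y) - weighted < min_gain:
        return majorityClass(data_Y)
    return {
        'feature': bestFeature,
        'value': bestFeaVal,
        'left': createTreePrePruned(left_X, left_Y, max_depth, min_samples, min_gain, depth + 1),
        'right': createTreePrePruned(right_X, right_Y, max_depth, min_samples, min_gain, depth + 1),
    }
```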
```scala
import scala.collection.immutable.ListMap
import scala.math.*
import scala.util.chaining.*

enum Node:
  case Leaf(value: Any)
  case Branch(label: String, children: ListMap[Any, Node])

object C45DecisionTree:

  // scala.math provides no log2, so define it via the natural logarithm
  private def log2(x: Double): Double = log(x) / log(2.0)

  def createTree(dataSet: List[List[Any]], labels: List[String]): Node =
    val classList = dataSet.map(_.last)
    if classList.forall(_ == classList.head) then
      Node.Leaf(classList.head)             // all samples share one class: pure leaf
    else if dataSet.head.size == 1 then
      Node.Leaf(majorityCnt(classList))     // no features left: majority-vote leaf
    else
      chooseBestFeatureToSplit(dataSet, labels) match
        case -1 => Node.Leaf(majorityCnt(classList))
        case bestFeat =>
          val bestFeatLabel = labels(bestFeat)
          val subLabels = labels.patch(bestFeat, Nil, 1)
          val children = dataSet
            .map(_(bestFeat))
            .distinct
            .map: value =>
              value -> createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
            .to(ListMap)
            .tap: m =>
              // debug aid: print a short preview of each child subtree
              if m.size > 1 then m.view.mapValues(_.toString.take(20)).foreach(println)
          Node.Branch(bestFeatLabel, children)

  private def majorityCnt(classList: List[Any]): Any =
    classList.groupMapReduce(identity)(_ => 1)(_ + _)
      .maxBy(_._2)
      ._1

  private def chooseBestFeatureToSplit(dataSet: List[List[Any]], labels: List[String]): Int =
    if dataSet.isEmpty || dataSet.head.size <= 1 then -1
    else
      val baseEntropy = calculateEntropy(dataSet)
      (0 until (dataSet.head.size - 1))
        .map: i =>
          i -> (baseEntropy - calculateWeightedEntropy(dataSet, i)) // information gain
        .maxByOption(_._2)
        .map(_._1)
        .getOrElse(-1)

  private def calculateEntropy(dataSet: List[List[Any]]): Double =
    if dataSet.isEmpty then 0.0
    else
      val classList = dataSet.map(_.last)
      val total = classList.size.toDouble
      classList
        .groupMapReduce(identity)(_ => 1.0 / total)(_ + _) // class probabilities
        .values
        .map: p =>
          -p * log2(p)
        .sum

  private def calculateWeightedEntropy(dataSet: List[List[Any]], featureIndex: Int): Double =
    if dataSet.isEmpty then 0.0
    else
      val total = dataSet.size.toDouble
      dataSet
        .groupBy(_(featureIndex))
        .view
        .mapValues: subset =>
          val p = subset.size.toDouble / total
          p * calculateEntropy(subset.map(_.patch(featureIndex, Nil, 1)))
        .values
        .sum

  private def splitDataSet(dataSet: List[List[Any]], featureIndex: Int, value: Any): List[List[Any]] =
    dataSet
      .filter(_(featureIndex) == value)
      .map(_.patch(featureIndex, Nil, 1))
```
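Post-pruning is the one outline item neither listing implements. Below is a minimal reduced-error pruning sketch for the dict-based CART tree from the Python section; the helper name prune and the held-out validation arrays val_X/val_Y are assumptions, and the leaf label is taken as the majority of the validation samples reaching the node (classic reduced-error pruning would use the training majority stored at that node).

```python
def prune(tree, val_X, val_Y):
    """Bottom-up reduced-error pruning: replace a subtree with a leaf whenever
    the leaf does no worse on the validation samples that reach it."""
    if not isinstance(tree, dict):
        return tree  # already a leaf
    if len(val_Y) == 0:
        return tree  # no validation data reaches this node; leave it unchanged
    # Route the validation samples through this node and prune the children first.
    left_X, left_Y, right_X, right_Y = splitDataSet(val_X, val_Y, tree['feature'], tree['value'])
    tree['left'] = prune(tree['left'], left_X, left_Y)
    tree['right'] = prune(tree['right'], right_X, right_Y)
    # Compare subtree error against a single majority-vote leaf.
    subtree_errors = sum(classify(tree, x) != y for x, y in zip(val_X, val_Y))
    leaf_label = majorityClass(val_Y)
    leaf_errors = sum(leaf_label != y for y in val_Y)
    return leaf_label if leaf_errors <= subtree_errors else tree
```

Usage would look like tree = prune(tree, val_X, val_Y) after createTree: the call prunes in place and returns the (possibly collapsed) root.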