《机器学习》课后题8.5
同 8.3。第一次写 bagging,模仿了一位大牛的写法。
Python 代码
import math
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 单层决策树
class DTStump(object):
"""docstring for DTStump"""
def __init__(self, X, Y):
    """Store the training data and immediately fit the stump.

    X: feature DataFrame; Y: label Series aligned on the same index as X.
    """
    self.X, self.Y = X, Y
    # Training happens eagerly at construction time.
    self.build()
# 叶节点选择其类别为D中样本最多的类
def choose_largest_example(self, X):
D = self.Y.loc[X.index]
Count = D.value_counts()
Max = -1
for key, value in Count.items():
if Max < value:
label = key
Max = value
return label
# 计算给定数据集的熵
def calc_Ent(self, X):
D = self.Y.loc[X.index]
numEntries = D.shape[0]
Count = D.value_counts()
Ent = 0.0
for key, value in Count.items():
# print(Count[key])
prob = Count[key] / numEntries
Ent -= prob * math.log(prob, 2)
return Ent
# 生成连续值属性的候选划分点集合T
def candidate_T(self, key, n):
L = set(self.X[key])
T = []
a, Sum = 0, 0
for value in L:
Sum += value
a += 1
if a == n:
T.append(Sum / n)
a, Sum = 0, 0
if a > 0:
T.append(Sum / a)
return T
# 计算样本D基于划分点t二分后的连续值属性信息增益
def calc_Gain_t(self, key, t, Ent_D):
Ent = 0.0
D_size = self.X.shape[0]
Dv = self.X.loc[self.X[key] <= t]
Dv_size =