juce.py
import numpy as np
class TreeNode:
    """One node of a binary decision tree.

    An internal node carries the splitting attribute ``index``, the
    threshold ``t`` and two children ``lch``/``rch``; a leaf carries only
    the class ``label`` (+1 or -1).
    """

    def __init__(self):
        # Every field starts unset; the tree builder fills them in
        # through the setters below.
        self.index = self.lch = self.rch = self.t = self.label = None

    def set_index(self, index):
        self.index = index

    def set_lch(self, lch):
        self.lch = lch

    def set_rch(self, rch):
        self.rch = rch

    def set_t(self, t):
        self.t = t

    def set_label(self, l):
        self.label = l
class JueCe:
    """Weighted binary decision tree (labels are +1 / -1).

    Serves as the weak learner for AdaBoost: split quality and leaf
    labels take the per-sample weight vector ``w`` into account.  Splits
    are axis-aligned thresholds on continuous attributes chosen to
    minimise the weighted 0/1 error.  (An information-entropy criterion
    was previously tried and removed — it brought nothing for this
    weighted ensemble setting.)

    Branch convention: a sample goes to the low/left branch when its
    attribute value is < t and to the high/right branch when it is >= t.
    The original code compared strictly on both sides, so a sample whose
    value equals the threshold disappeared from both children (possible
    whenever a column contains duplicate values); ties are now routed to
    the high branch, matching ``predict``'s else-branch.
    """

    def get_e(self, ai, set_d, train_data, w, flag, t):
        """Weighted error of one branch of the candidate split (ai, t).

        :param ai: attribute (column) index used for the split
        :param set_d: indices of the samples under consideration
        :param train_data: data matrix whose last column is the +1/-1 label
        :param w: per-sample weights
        :param flag: 0 -> low branch (value < t); otherwise high branch (value >= t)
        :param t: split threshold
        :return: summed weight of the branch samples whose label differs
                 from the branch's majority label
        """
        if flag == 0:
            where_arr = np.where(train_data[set_d, ai] < t)
        else:
            # ">=" (not ">") so samples exactly at the threshold stay in the split.
            where_arr = np.where(train_data[set_d, ai] >= t)
        temp_d = set_d[where_arr[0]]
        count_y = 0
        for k in temp_d:
            if train_data[k, -1] == 1:
                count_y += 1
        # Majority label of the branch by plain counts; ties fall to -1.
        label = 1 if count_y > len(temp_d) / 2 else -1
        e = 0
        for k in temp_d:
            if train_data[k, -1] != label:
                e += w[k]
        return e

    def get_t(self, a_arr, a_i, set_d, train_data, w):
        """Best threshold for attribute ``a_i``.

        Candidate thresholds are the midpoints of consecutive entries of
        the sorted value array ``a_arr``.

        :return: (best threshold, its total weighted error);
                 (-1, 100000) when there is no candidate at all.
        """
        t_arr = [(a_arr[i] + a_arr[i + 1]) / 2 for i in range(len(a_arr) - 1)]
        min_gain = 100000
        res_t = -1
        for t in t_arr:
            # Split quality = weighted error of the two branches combined.
            e = (self.get_e(a_i, set_d, train_data, w, 0, t)
                 + self.get_e(a_i, set_d, train_data, w, 1, t))
            if e < min_gain:
                min_gain = e
                res_t = t
        return res_t, min_gain

    def get_opt(self, set_d, set_a, train_data, w):
        """Best (attribute, threshold) pair over the candidate attributes.

        NOTE(review): threshold candidates are generated from the *whole*
        column (all rows), not only the rows in ``set_d`` — kept as in
        the original; the extra candidates are harmless, only slower.
        """
        res_t = -1
        res_a = -1
        min_gain = 100000
        for a in set_a:
            temp_arr = np.array(train_data[:, a], dtype=float)
            sort_arr = np.sort(temp_arr)
            t, gain = self.get_t(sort_arr, a, set_d, train_data, w)
            if gain < min_gain:
                min_gain = gain
                res_t = t
                res_a = a
        return res_a, res_t

    def get_typename(self, set_d, train_data, w):
        """Class label (+1/-1) for a leaf holding the samples in ``set_d``.

        Majority class by count; on a tie, the class with the smaller
        weighted error wins (a weight tie again falls to -1).
        """
        where_arr1 = np.where(train_data[set_d, -1] == 1)
        where_arr2 = np.where(train_data[set_d, -1] == -1)
        temp1 = set_d[where_arr1[0]]
        temp2 = set_d[where_arr2[0]]
        # Clear positive majority.
        if len(temp1) > len(temp2):
            return 1
        # Count tie: pick the label giving the smaller weighted error.
        if len(temp1) == len(temp2):
            count1 = 0
            for k in set_d:
                if train_data[k, -1] != 1:
                    count1 += w[k]
            count2 = 0
            for k in set_d:
                if train_data[k, -1] != -1:
                    count2 += w[k]
            if count1 >= count2:
                return -1
            return 1
        # Negative majority.
        return -1

    def create_node(self, set_d, set_a, train_data, w):
        """Recursively build the subtree for samples ``set_d`` using
        attributes ``set_a``; returns its root ``TreeNode``.

        A used attribute is removed from ``set_a`` for the subtrees
        (as in the original implementation).
        """
        node = TreeNode()
        type_name = self.get_typename(set_d, train_data, w)
        if len(np.unique(train_data[set_d, -1])) == 1:
            # All samples share one label -> pure leaf.
            node.set_label(train_data[set_d[0], -1])
        elif len(set_a) == 0:
            # No attributes left -> majority leaf.
            node.set_label(type_name)
        else:
            a_hit, t = self.get_opt(set_d, set_a, train_data, w)
            node.set_index(a_hit)
            node.set_t(t)
            # Low branch: value < t.  High branch: value >= t — matching
            # get_e, so threshold-valued samples are no longer dropped.
            low_set = set_d[np.where(train_data[set_d, a_hit] < t)[0]]
            high_set = set_d[np.where(train_data[set_d, a_hit] >= t)[0]]
            lch = TreeNode()
            rch = TreeNode()
            if len(low_set) == 0:
                lch.set_label(type_name)
            else:
                lch = self.create_node(low_set, np.setdiff1d(set_a, a_hit), train_data, w)
            if len(high_set) == 0:
                rch.set_label(type_name)
            else:
                rch = self.create_node(high_set, np.setdiff1d(set_a, a_hit), train_data, w)
            node.set_lch(lch)
            node.set_rch(rch)
        return node

    def predict(self, root, test_data):
        """Predict a +1/-1 label for every row of ``test_data``.

        Walks the tree from ``root``: go left when value < t, right
        otherwise — consistent with the >= convention used when building.
        """
        labels = np.array([0] * len(test_data))
        for i in range(len(test_data)):
            temp = root
            while temp.index is not None:
                if test_data[i][temp.index] < temp.t:
                    temp = temp.lch
                else:
                    temp = temp.rch
            labels[i] = temp.label
        return labels
adaboost.py
import numpy as np
from juce import JueCe
def comput_wucha(train_data, h_t, w):
    """Weighted training error of one weak classifier.

    Sums the weights of the samples whose true label (last column of
    ``train_data``) differs from the prediction in ``h_t``.
    """
    total = 0
    for k, pred in enumerate(h_t):
        if train_data[k, -1] != pred:
            total += w[k]
    return total
def sign(fx):
    """Elementwise sign function, applied in place.

    Entries > 0 become 1, everything else (including 0) becomes -1.
    Mutates ``fx`` and returns it.
    """
    for j, value in enumerate(fx):
        fx[j] = 1 if value > 0 else -1
    return fx
def Fx(a_t, roots, test_data):
    """Strong-classifier prediction H(x) = sign(sum_t alpha_t * h_t(x)).

    :param a_t: weak-classifier coefficients alpha_t
    :param roots: root TreeNode of each weak decision tree
    :param test_data: samples to classify, one row per sample
    :return: +1/-1 predictions (via sign(), so applied in place on the score array)
    """
    # (T, n) matrix of per-tree predictions.  np.zeros replaces the
    # original np.random.randint(0, 1, ...), which could only ever
    # produce zeros anyway — same values, clearer intent.
    Hx = np.zeros((len(a_t), len(test_data)), dtype=float)
    for k in range(len(roots)):
        # NOTE: relies on the module-level ``juece`` JueCe instance.
        Hx[k] = juece.predict(roots[k], test_data)
    fx = np.dot(a_t, Hx)
    return sign(fx)
# --- AdaBoost training script ---
load_data = np.loadtxt('juece3.0.txt', dtype=float, delimiter=',', encoding='utf8')
w = np.array([1 / len(load_data)] * len(load_data))  # initial uniform sample weights
set_d = np.arange(0, len(load_data))  # indices of the training samples
set_a = np.arange(0, len(load_data[0]) - 1)  # attribute indices (all but the label column)
juece = JueCe()  # decision-tree weak-learner helper
T = 3  # number of weak classifiers
a_t = np.array([0] * T, dtype=float)  # per-classifier coefficients alpha_t
# Predictions of each weak classifier on the training set.  np.zeros
# replaces the original np.random.randint(0, 1, ...), which could only
# ever produce zeros — same values, clearer intent.
H = np.zeros((T, len(load_data)), dtype=int)
roots = []
for t in range(T):
    # Build a tree on the (re)weighted training set and keep its root.
    root = juece.create_node(set_d, set_a, load_data, w)
    roots.append(root)
    H[t] = juece.predict(root, load_data)  # predict on the training set itself
    e_t = comput_wucha(load_data, H[t], w)  # weighted training error
    if e_t > 0.5:  # worse than random guessing: stop boosting
        break
    # NOTE(review): e_t == 0 would make the log blow up (alpha -> inf and
    # the weights turn NaN).  The datasets used here never reach zero
    # error, but a cap would be needed for other data — confirm before reuse.
    a_t[t] = 0.5 * np.log((1 - e_t) / e_t)  # classifier coefficient alpha_t
    for i in range(len(w)):
        # Reweight: misclassified samples (y_i * h_t(x_i) < 0) gain weight.
        w[i] = w[i] * np.exp(-a_t[t] * load_data[i, -1] * H[t][i])
    z_t = np.sum(w)
    w = w / z_t  # normalize so the weights remain a distribution
print("======预测=====")
print("测试集:" + str(load_data))
print("测试结果:" + str(Fx(a_t, roots, load_data)))
The training set is the density and sugar-content attributes of watermelon dataset 3.0:
juece3.0.txt
0.697,0.460,1
0.774,0.376,1
0.634,0.264,1
0.608,0.318,1
0.556,0.215,1
0.403,0.237,1
0.481,0.149,1
0.437,0.211,1
0.666,0.091,-1
0.243,0.267,-1
0.245,0.057,-1
0.343,0.099,-1
0.639,0.161,-1
0.657,0.198,-1
0.360,0.370,-1
0.593,0.042,-1
0.719,0.103,-1
A simpler training set can also be used for testing.
A recommended walkthrough article: https://zhuanlan.zhihu.com/p/27126737
The training set it uses is:
0,1
1,1
2,1
3,-1
4,-1
5,-1
6,1
7,1
8,1
9,-1
Tested: this program's output matches the results computed in that article.