# AdaBoost worked example from p. 140 of Li Hang's "Statistical Learning Methods" (李航《统计学习方法》).
import numpy as np
import math
# 《统计学习方法》p140例子
def read_data(path="dataSet\\adaBoostData.txt"):
    """Load the training set from a whitespace-separated text file.

    Every non-blank line must start with two integer fields: the sample
    value followed by its label.

    Args:
        path: Location of the data file. Defaults to the location the script
            has always used, so existing zero-argument calls keep working.

    Returns:
        A ``(sample, label)`` tuple of two equally long lists of ints, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    sample = []
    label = []
    # The context manager closes the file even when a line fails to parse.
    with open(path) as fp:
        for line in fp:
            arr = line.split()
            if not arr:
                # Tolerate blank lines such as a trailing empty line.
                continue
            sample.append(int(arr[0]))
            label.append(int(arr[1]))
    return sample, label
# Update the base-classifier coefficient and the training-data weight distribution.
def update(error, thro, dir):
    """Record the newest stump's coefficient and re-weight the samples.

    The coefficient is ``alpha_m = 0.5 * ln((1 - error) / error)``; it is
    appended to the module-level ``alpha`` list.  Each entry of the
    module-level ``weight`` is then multiplied by
    ``exp(-alpha_m * label[i] * G(sample[i]))`` and the weights are divided by
    their sum so they form a distribution again.  Here ``G(x)`` is ``dir``
    when ``x < thro`` and ``-dir`` otherwise.

    Reads the module-level ``sample``, ``label`` and ``sample_n``; mutates
    ``weight`` in place and appends to ``alpha``.

    Args:
        error: Weighted error rate of the stump being added.
        thro: Threshold of the stump.
        dir: Orientation of the stump (+1 or -1).  The name shadows the
            built-in ``dir`` but is kept because it is part of the signature.
    """
    # A perfect (error == 0) or perfectly wrong (error == 1) stump would make
    # the logarithm blow up: ZeroDivisionError / ValueError for Python floats,
    # inf and then NaN weights for NumPy floats.  Clamping keeps alpha_m
    # finite; any error strictly inside the bounds is left untouched.
    eps = 1e-10
    error = min(max(error, eps), 1 - eps)

    # First the coefficient of this base classifier.
    alpha_m = 0.5 * math.log((1 - error) / error)
    alpha.append(alpha_m)

    # Then the new weights; zm accumulates the normalisation factor.
    zm = 0
    for i in range(sample_n):
        gm = 1 if sample[i] < thro else -1
        weight[i] = weight[i] * math.exp(-alpha_m * label[i] * gm * dir)
        zm += weight[i]
    for i in range(sample_n):
        weight[i] /= zm
def cal_error(thro, dir):
    """Return the weighted error of the stump described by ``thro`` and ``dir``.

    A sample counts as misclassified when "sample value is below ``thro``"
    and "``dir * label`` is positive" disagree; the weights of those samples
    are added up.  Reads the module-level ``sample``, ``label``, ``weight``
    and ``sample_n``.

    Args:
        thro: Threshold of the stump.
        dir: Orientation of the stump (+1 or -1).

    Returns:
        The sum of ``weight[i]`` over the misclassified samples.
    """
    total = 0
    for idx in range(sample_n):
        is_below = sample[idx] < thro
        expects_below = dir * label[idx] > 0
        # The boolean acts as a 0/1 factor, so correctly classified samples
        # contribute exactly zero.
        total += weight[idx] * (is_below != expects_below)
    return total
def get_score():
    """Return the negated weighted sum of the labels.

    Computes ``-sum(weight[i] * label[i])`` over the first ``sample_n``
    entries of the module-level ``weight`` and ``label``, accumulating in
    index order.

    Returns:
        The accumulated value (``0.0`` when ``sample_n`` is 0).
    """
    total = 0.0
    for idx in range(sample_n):
        contribution = weight[idx] * label[idx]
        total = total - contribution
    return total
# Train the base classifier, i.e. pick its threshold (there may be a simpler
# way to find the threshold than this scan).
def base_classifier():
    """Choose the threshold and orientation of the next decision stump.

    ``scores[k]`` starts from ``get_score()`` for ``k == 0`` and grows by
    ``2 * weight[k-1] * label[k-1]`` per step, so it is a signed weighted
    agreement score for the cut placed in front of sample ``k`` (not an error
    rate).  The largest score selects orientation ``+1``; when the magnitude
    of the smallest score is at least as large, the smallest score selects
    orientation ``-1`` instead.  Ties resolve to the first matching index.

    Reads the module-level ``weight``, ``label`` and ``sample_n``.

    NOTE(review): the returned threshold is ``k - 0.5`` with ``k`` an index,
    which lines up with the ``sample[i] < thro`` tests elsewhere only if the
    samples are the sorted integers 0..n-1 -- confirm before using other data.

    Returns:
        ``(thro, dir)``: the threshold as a float and the orientation as
        ``1`` or ``-1``.
    """
    scores = np.zeros(sample_n + 1)
    scores[0] = get_score()
    for k in range(1, sample_n + 1):
        # Kept as a sequential recurrence: tie-breaking between equally good
        # cuts depends on the exact floating-point rounding of this sum.
        scores[k] = scores[k - 1] + 2 * weight[k - 1] * label[k - 1]

    score_list = scores.tolist()
    highest = max(scores)
    lowest = min(scores)

    if highest <= math.fabs(lowest):
        # dir = -1: below the threshold predicts -1, above predicts +1.
        return score_list.index(lowest) - 0.5, -1
    # dir = +1: below the threshold predicts +1, above predicts -1.
    return score_list.index(highest) - 0.5, 1
if __name__ == '__main__':
    # These module-level names are read (and weight/alpha mutated) by the
    # functions above, so they must stay globals with exactly these names.
    sample, label = read_data()
    sample_n = len(label)
    # Start from the uniform distribution 1/N.  The former hard-coded 0.1 was
    # only a valid distribution for exactly 10 samples; for N == 10 the value
    # is bit-identical.
    weight = np.ones(sample_n) / sample_n
    thro = []
    alpha = []
    # Train 3 base classifiers; their coefficients are collected in alpha.
    for _ in range(3):
        thro_tmp, dir_tmp = base_classifier()
        thro.append(thro_tmp)
        error = cal_error(thro_tmp, dir_tmp)
        update(error, thro_tmp, dir_tmp)
    print('thro:')
    print(thro)
    print('alpha:')
    print(alpha)
# Contents of adaBoostData.txt (one "sample label" pair per line):
# 0 1
# 1 1
# 2 1
# 3 -1
# 4 -1
# 5 -1
# 6 1
# 7 1
# 8 1
# 9 -1