李航《统计学习方法第二版》学习笔记
知识点:
- 提升树是以分类树或回归树为基本分类器的提升(Boosting)方法
- 采用加法模型与前向分步算法
- 对分类问题,基本分类器是二叉分类树;对回归问题,基本分类器是二叉回归树
- 加法模型:
$f_M(x)=\sum_{m=1}^{M} T(x;\Theta_m)$, 其中 $T(x;\Theta_m)$ 表示决策树,M代表树的个数,Θ表示树的参数
- 前向分步算法:
$f_0(x)=0$, $f_m(x)=f_{m-1}(x)+T(x;\Theta_m)$, 其中
$f_{m-1}(x)$ 是当前模型
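- 通过经验风险最小化确定下一棵决策树的参数Θ: $\hat{\Theta}_m=\arg\min_{\Theta_m}\sum_{i=1}^{N} L\left(y_i,\ f_{m-1}(x_i)+T(x_i;\Theta_m)\right)$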
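- 当采用平方误差损失函数时,损失变为 $L\left(y,\ f_{m-1}(x)+T(x;\Theta_m)\right)=\left[r-T(x;\Theta_m)\right]^2$,其中 $r=y-f_{m-1}(x)$ 是当前模型拟合数据的残差,因此下一棵回归树只需拟合当前模型的残差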
- 对于二类分类问题,提升树算法只需将AdaBoost算法中的基本分类器限制为二类分类树即可
例8.2 python代码实现并随便输入x值进行预测:
'''数据准备'''
import numpy as np
# Training set of Example 8.2: ten inputs 1..10 and their regression targets.
data_x = list(range(1, 11))
data_y = [
    5.56, 5.70, 5.91, 6.40, 6.80,
    7.05, 8.90, 8.70, 9.00, 9.05,
]
'''寻找最优切分点'''
# Squared-error loss of one region around its constant prediction.
def compute_s(s_list, c):
    """Return the sum of squared deviations of the values in s_list from c.

    s_list: iterable of numbers (the targets that fall in one region).
    c: the constant the region predicts.
    An empty s_list yields 0.
    """
    total = 0
    for value in s_list:
        deviation = value - c
        # Accumulate left to right so the rounding order stays fixed.
        total = total + deviation ** 2
    return total
# Find the best split point for a single-split regression tree (stump).
def find_bestS(data_x, data_y):
    """Fit a one-split regression stump by minimising the squared error.

    data_x: sequence of numeric inputs.
    data_y: sequence of targets; data_y[i] belongs to data_x[i].

    Returns a dict {"s": threshold, "c_left": c1, "c_right": c2}: the stump
    predicts c_left when x < s and c_right otherwise. Both constants are the
    region means rounded to two decimals.

    Raises ValueError when the two sequences differ in length or when there
    are fewer than two distinct x values (no split is possible).
    """
    if len(data_x) != len(data_y):
        raise ValueError("data_x and data_y must have the same length")

    # Candidate thresholds are the midpoints between adjacent distinct x
    # values, so both regions are always non-empty. For inputs 1..n this
    # gives exactly 1.5, 2.5, ..., n - 0.5.
    distinct_x = sorted(set(data_x))
    thresholds = [(lo + hi) / 2 for lo, hi in zip(distinct_x, distinct_x[1:])]
    if not thresholds:
        raise ValueError("need at least two distinct x values to find a split")

    best = None
    for threshold in thresholds:
        # Pair each x with its own target by position instead of using the
        # x value as an index into data_y.
        R1 = [y_i for x_i, y_i in zip(data_x, data_y) if x_i < threshold]
        R2 = [y_i for x_i, y_i in zip(data_x, data_y) if not x_i < threshold]
        c_left = np.round(np.average(R1), 2)
        c_right = np.round(np.average(R2), 2)
        loss = compute_s(R1, c_left) + compute_s(R2, c_right)
        # Strict "<" keeps the first threshold on ties.
        if best is None or loss < best[0]:
            best = (loss, threshold, c_left, c_right)

    _, min_s, c_left, c_right = best
    # T takes the value c_left when x < s, otherwise c_right.
    T = {"s": min_s, "c_left": c_left, "c_right": c_right}
    return T
'''计算残差'''
def compute_residual(T, xs=None, ys=None):
    """Return the residuals left after subtracting stump T's prediction.

    T: dict with keys 's', 'c_left' and 'c_right'; the stump predicts
       c_left when x < s and c_right otherwise.
    xs, ys: optional inputs and targets, paired by position. When omitted,
       the module-level data_x / data_y are used, so compute_residual(T)
       keeps working as before.

    Returns a list with one residual per sample, each rounded to two
    decimals.
    """
    if xs is None:
        xs = data_x
    if ys is None:
        ys = data_y

    s = T['s']
    c_left = T['c_left']
    c_right = T['c_right']

    r_lists = []
    # Pair each x with its own target by position; using the x value as an
    # index into the targets only works when the inputs are exactly 1..n.
    for x_i, y_i in zip(xs, ys):
        prediction = c_left if x_i < s else c_right
        r_lists.append(np.round(y_i - prediction, 2))
    return r_lists
# Boosting loop: fit six stumps in sequence. After each fit, data_y is
# rebound to the residuals so the next stump is fitted to what is left.
T_lists = []
for tree_no in range(1, 7):
    T = find_bestS(data_x, data_y)
    T_lists += [T]
    print("第{}棵树:{}".format(tree_no, T))
    data_y = compute_residual(T)
    print("残差为:", data_y)
输出
第1棵树:{'s': 6.5, 'c_left': 6.24, 'c_right': 8.91}
残差为: [-0.68, -0.54, -0.33, 0.16, 0.56, 0.81, -0.01, -0.21, 0.09, 0.14]
第2棵树:{'s': 3.5, 'c_left': -0.52, 'c_right': 0.22}
残差为: [-0.16, -0.02, 0.19, -0.06, 0.34, 0.59, -0.23, -0.43, -0.13, -0.08]
第3棵树:{'s': 6.5, 'c_left': 0.15, 'c_right': -0.22}
残差为: [-0.31, -0.17, 0.04, -0.21, 0.19, 0.44, -0.01, -0.21, 0.09, 0.14]
第4棵树:{'s': 4.5, 'c_left': -0.16, 'c_right': 0.11}
残差为: [-0.15, -0.01, 0.2, -0.05, 0.08, 0.33, -0.12, -0.32, -0.02, 0.03]
第5棵树:{'s': 6.5, 'c_left': 0.07, 'c_right': -0.11}
残差为: [-0.22, -0.08, 0.13, -0.12, 0.01, 0.26, -0.01, -0.21, 0.09, 0.14]
第6棵树:{'s': 2.5, 'c_left': -0.15, 'c_right': 0.04}
残差为: [-0.07, 0.07, 0.09, -0.16, -0.03, 0.22, -0.05, -0.25, 0.05, 0.1]
[{'s': 6.5, 'c_left': 6.24, 'c_right': 8.91}, {'s': 3.5, 'c_left': -0.52, 'c_right': 0.22}, {'s': 6.5, 'c_left': 0.15, 'c_right': -0.22}, {'s': 4.5, 'c_left': -0.16, 'c_right': 0.11}, {'s': 6.5, 'c_left': 0.07, 'c_right': -0.11}, {'s': 2.5, 'c_left': -0.15, 'c_right': 0.04}]
预测
'''当x = 6.6时,预测值为多少'''
# Read a query point, then add up what every fitted stump predicts for it.
x = float(input("请输入x的值:"))
# Each stump contributes the constant on the side of its split where x falls.
y_predict_lists = [
    T_i['c_left'] if x < T_i['s'] else T_i['c_right']
    for T_i in T_lists
]
y_predict = np.sum(y_predict_lists)
print("预测值为:", y_predict)
输出
请输入x的值:6.6
预测值为: 8.95