I. Exercises
1. Exercise 1
Solution:
Because …, it follows that …
2. Exercise 2
Solution:
Assume …; then we have:
3. Exercise 3
Solution: taking $K=2$, one finds that the SAMME algorithm coincides exactly with the binary AdaBoost algorithm in Section 8.1.2 of Li Hang's Statistical Learning Methods (《统计学习方法》).
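As a quick check (my own sketch): with $K=2$, the SAMME estimator weight reduces to
$$\alpha_m = \log\frac{1-e_m}{e_m} + \log(K-1) \;\overset{K=2}{=}\; \log\frac{1-e_m}{e_m} = 2\cdot\frac{1}{2}\log\frac{1-e_m}{e_m},$$
which is twice the binary AdaBoost coefficient; since every round is scaled by the same factor of 2 and the sample-weight updates are renormalized, the resulting classifier is identical.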
4. Exercise 4
Solution: the corresponding sklearn source code is at:
https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_weight_boosting.py
5. Exercise 5
Solution: the results may differ slightly.
6. Exercise 6
Solution: because …, and when …
7. Exercise 7
Solution:
Construct the Lagrangian function of the constrained objective.
Setting the partial derivatives of the Lagrangian with respect to each $h_k$ to zero yields $K$ equations.
Solving these equations simultaneously, together with the constraint that the $h_i$ sum to zero, gives the result.
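A worked version of this derivation (my sketch, following the SAMME.R derivation of Zhu et al., writing $p_k$ for $P(c=k\mid x)$; the objective below is assumed):
$$
\begin{aligned}
&\min_{h}\ \sum_{k=1}^{K} p_k\, e^{-h_k/(K-1)} \quad \text{s.t.}\ \sum_{k=1}^{K} h_k = 0,\\
&L(h,\lambda) = \sum_{k=1}^{K} p_k\, e^{-h_k/(K-1)} + \lambda \sum_{k=1}^{K} h_k,\\
&\frac{\partial L}{\partial h_k} = -\frac{p_k}{K-1}\, e^{-h_k/(K-1)} + \lambda = 0
\;\Rightarrow\; h_k = (K-1)\bigl(\log p_k - \log(K-1)\lambda\bigr).
\end{aligned}
$$
Summing over $k$ and using the constraint eliminates $\lambda$, giving
$$h_k^* = (K-1)\Bigl(\log p_k - \frac{1}{K}\sum_{k'=1}^{K}\log p_{k'}\Bigr),$$
which matches the per-round output `(K-1)*(log_proba - log_proba.mean(1))` computed in the SAMME.R code below.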
8. Exercise 8
Solution:
Because the expression for $h_k^*(x)$ is
$$h_k^*(x) = (K-1)\Bigl(\log p_k(x) - \frac{1}{K}\sum_{k'=1}^{K}\log p_{k'}(x)\Bigr),$$
where the term $\frac{1}{K}\sum_{k'}\log p_{k'}(x)$ is shared by all $K$ components, it acts as a common constant in the weight update and can be removed: the label code $\boldsymbol{y}$ sums to zero, so this term contributes nothing to $\boldsymbol{y}^{\top} h^*(x)$.
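A quick numeric check of this claim (illustrative code of my own):

import numpy as np

rng = np.random.default_rng(0)
K, n = 4, 5
p = rng.dirichlet(np.ones(K), size=n)        # mock class-probability estimates
log_p = np.log(p)
y = np.full((n, K), -1/(K-1))                # SAMME label coding, each row sums to 0
y[np.arange(n), rng.integers(0, K, n)] = 1

h_full = (K-1) * (log_p - log_p.mean(1, keepdims=True))  # h* with the common term
h_drop = (K-1) * log_p                                   # h* with the common term removed

# the weight-update exponents y^T h agree, so the updates are identical
print(np.allclose((y*h_full).sum(1), (y*h_drop).sum(1)))  # True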
9. Exercise 9
Solution:
1. Exactly … is the median, and ….
2. It must be …, because the weights are accumulated one at a time.
3. …
4. The result leans toward the values with larger weights, which are of course somewhat more trustworthy.
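The answers above concern the weighted median used by Adaboost.R2; a small illustration of point 4 (the function and data are my own):

import numpy as np

def weighted_median(values, weights):
    # smallest value whose cumulative weight reaches half of the total weight
    order = np.argsort(values)
    v, w = np.asarray(values)[order], np.asarray(weights)[order]
    return v[np.searchsorted(np.cumsum(w), 0.5 * w.sum())]

vals = [1.0, 2.0, 3.0, 4.0, 5.0]
print(weighted_median(vals, [1, 1, 1, 1, 1]))   # 3.0, the plain median
print(weighted_median(vals, [1, 1, 1, 1, 10]))  # 5.0, pulled toward the heavy weight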
II. Knowledge Review
1. In binary classification, how does AdaBoost adjust the sample weights?
Solution: it decreases the weights of correctly predicted samples and increases the weights of misclassified samples.
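Concretely, in the binary case the round-$m$ update (in the notation of Statistical Learning Methods) is
$$w_{m+1,i} = \frac{w_{m,i}}{Z_m}\exp\bigl(-\alpha_m\, y_i\, G_m(x_i)\bigr),\qquad \alpha_m = \frac{1}{2}\log\frac{1-e_m}{e_m},$$
so a correct prediction ($y_i G_m(x_i) = 1$) shrinks the weight by $e^{-\alpha_m}$, a mistake grows it by $e^{\alpha_m}$, and $Z_m$ renormalizes the weights to sum to one.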
2. Sample A is misclassified in the current round and sample B is classified correctly in the current round. After the weight update, is A's weight necessarily larger than B's?
Solution: in round 1 it is; in later rounds it is not guaranteed, e.g. if a sample was first classified correctly and only later misclassified (see the numeric illustration below).
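A two-round numeric illustration (the alpha values are made up):

import numpy as np

# round 1: A is classified correctly, B is misclassified (alpha_1 = 1.2)
# round 2: A is misclassified, B is classified correctly (alpha_2 = 0.3)
wA = wB = 1.0
wA *= np.exp(-1.2); wB *= np.exp(+1.2)
wA *= np.exp(+0.3); wB *= np.exp(-0.3)
print(wA > wB)  # False: A was just misclassified, yet its weight is still the smaller one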
3. For classification problems, what is AdaBoost's loss function? Explain why its design is reasonable.
Solution:
The exponential loss $L(y, f(x)) = e^{-y f(x)}$ with $y \in \{-1, +1\}$ (binary case).
This loss function satisfies the Bayes-optimal decision condition.
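Why it satisfies the condition (standard derivation, sketched): minimizing the conditional risk pointwise,
$$\frac{\partial}{\partial f}\,\mathbb{E}\bigl[e^{-yf}\mid x\bigr] = -P(y{=}1\mid x)\,e^{-f} + P(y{=}{-}1\mid x)\,e^{f} = 0 \;\Rightarrow\; f^*(x) = \frac{1}{2}\log\frac{P(y{=}1\mid x)}{P(y{=}{-}1\mid x)},$$
so $\operatorname{sign} f^*(x)$ returns the more probable class, i.e. the Bayes-optimal decision.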
4. How does AdaBoost handle regression problems?
Solution:
Use the Adaboost.R2 algorithm, which outputs the weighted median of the base learners' predictions.
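For reference, the weighted median is the smallest base prediction whose accumulated estimator weight reaches half of the total:
$$\hat{y}(x) = y_{(t^*)},\qquad t^* = \min\Bigl\{t : \sum_{s \le t} \alpha_{(s)} \ge \tfrac{1}{2}\sum_{s=1}^{M}\alpha_{(s)}\Bigr\},$$
where $y_{(1)} \le \cdots \le y_{(M)}$ are the sorted base predictions, $\alpha_{(s)}$ their estimator weights, and $\alpha_t = \log(1/\beta_t)$ in Adaboost.R2.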
5. Using a trained AdaBoost classification model and a trained AdaBoost regression model to predict the label of a new sample, describe concretely the flow from input to output label in each case.
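A minimal sketch of the two flows (my own illustration, mirroring the predict methods implemented in Section III below):

import numpy as np

def adaboost_classify(x, estimators, alphas, K):
    # classification: each weak learner votes for its predicted class with weight alpha;
    # the output label is the class with the largest accumulated vote
    score = np.zeros(K)
    for est, a in zip(estimators, alphas):
        score[est.predict(x.reshape(1, -1))[0]] += a
    return int(np.argmax(score))

def adaboost_regress(x, estimators, alphas):
    # regression (Adaboost.R2): collect all base predictions, sort them,
    # and output the alpha-weighted median
    preds = np.array([est.predict(x.reshape(1, -1))[0] for est in estimators])
    order = np.argsort(preds)
    cum = np.cumsum(np.asarray(alphas)[order])
    return preds[order][np.searchsorted(cum, 0.5 * cum[-1])]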
6. Watch Zhou Zhihua's lecture video "Boosting 25年" (25 Years of Boosting) and share your thoughts.
Solution:
Judging from Zhou Zhihua's talk, AdaBoost surprisingly is not prone to overfitting. How impressive is that!
III. Code Implementation
1. SAMME and SAMME.R
(1) Using GYH's code
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
class AdaboostClassifier:
def __init__(self, base_estimator, n_estimators, algorithm):
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.algorithm = algorithm
self.boostors = []
if self.algorithm == "SAMME":
self.boostor_weights = []
self.classes = None
def fit(self, X, y, **kwargs):
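        # start from uniform sample weights and record the number of classes K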
w = np.repeat(1/X.shape[0], X.shape[0])
self.classes = np.unique(y.reshape(-1)).shape[0]
output = 0
for n in range(self.n_estimators):
cur_boostor = self.base_estimator(**kwargs)
cur_boostor.fit(X, y, w)
if self.algorithm == "SAMME":
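                # discrete SAMME: estimator weight alpha = log((1-err)/err) + log(K-1)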
y_pred = cur_boostor.predict(X)
err = (w*(y != y_pred)).sum()
alpha = np.log((1-err)/err) + np.log(self.classes-1)
temp_output = np.full(
(X.shape[0], self.classes), -1/(self.classes-1))
temp_output[np.arange(X.shape[0]), y_pred] = 1
self.boostors.append(cur_boostor)
self.boostor_weights.append(alpha)
w *= np.exp(alpha * (y != y_pred))
w /= w.sum()
output += temp_output * alpha
elif self.algorithm == "SAMME.R":
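                # SAMME.R: use class probabilities; per-round output h = (K-1)(log p - mean log p)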
y_pred = cur_boostor.predict_proba(X)
log_proba = np.log(y_pred + 1e-6)
temp_output = (
self.classes-1)*(log_proba-log_proba.mean(1).reshape(-1,1))
temp_y = np.full(
(X.shape[0], self.classes), -1/(self.classes-1))
temp_y[np.arange(X.shape[0]), y] = 1
self.boostors.append(cur_boostor)
w *= np.exp(
(1-self.classes)/self.classes * (temp_y*log_proba).sum(1))
w /= w.sum()
output += temp_output
#acc = accuracy_score(y, np.argmax(output, axis=1))
#print(acc)
def predict(self, X):
result = 0
if self.algorithm == "SAMME":
for n in range(self.n_estimators):
cur_pred = self.boostors[n].predict(X)
temp_output = np.full(
(X.shape[0], self.classes), -1/(self.classes-1))
temp_output[np.arange(X.shape[0]), cur_pred] = 1
result += self.boostor_weights[n] * temp_output
elif self.algorithm == "SAMME.R":
for n in range(self.n_estimators):
y_pred = self.boostors[n].predict_proba(X)
log_proba = np.log(y_pred + 1e-6)
temp_output = (
self.classes-1)*(log_proba-log_proba.mean(1).reshape(-1,1))
result += temp_output
return np.argmax(result, axis=1)
if __name__ == "__main__":
X, y = make_classification(
n_samples=10000, n_features=10,
n_informative=5, random_state=0, n_classes=2
)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0
)
from sklearn.ensemble import AdaBoostClassifier as ABC
clf = ABC(
DecisionTreeClassifier(max_depth=1),
n_estimators=20, algorithm="SAMME"
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("sklearn中SAMME的验证集得分为: ", accuracy_score(y_test, result))
clf = AdaboostClassifier(
DecisionTreeClassifier,
20, "SAMME"
)
clf.fit(X_train, y_train, max_depth=1)
result = clf.predict(X_test)
print("使用SAMME.R集成的验证集得分为: ", accuracy_score(y_test, result))
clf = ABC(
DecisionTreeClassifier(max_depth=1),
n_estimators=20, algorithm="SAMME.R"
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("sklearn中SAMME.R的验证集得分为: ", accuracy_score(y_test, result))
clf = AdaboostClassifier(
DecisionTreeClassifier,
20, "SAMME.R"
)
clf.fit(X_train, y_train, max_depth=1)
result = clf.predict(X_test)
print("使用SAMME.R集成的验证集得分为: ", accuracy_score(y_test, result))
clf = DecisionTreeClassifier(max_depth=1)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("使用决策树桩的验证集得分为: ", accuracy_score(y_test, result))
(2) SAMME: my own code
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 30 22:54:41 2021
@author: shzy
"""
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
class AdaboostClassifier:
def __init__(self, numBoostingIters=100, maxTreeDepth=3):
self.clfs = []
self.betas = []
self.numBoostingIters = numBoostingIters
self.maxTreeDepth = maxTreeDepth
        self.K = 0
        self.classes = []
def fit(self, X, y, random_state=None):
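        # SAMME training loop: fit a weighted tree, compute the weighted error,
        # derive the estimator weight beta, and re-weight the samples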
self.classes = np.unique((y))
self.K = len(self.classes)
n,d = X.shape
weight = np.full((n,),1/n)
for i in range(self.numBoostingIters):
clf = DecisionTreeClassifier(max_depth=self.maxTreeDepth).fit(X,y,sample_weight=weight)
prediction = clf.predict(X)
e = 1 - accuracy_score(y, prediction, sample_weight=weight)
beta = np.log((1-e)/e) + np.log(self.K - 1)
match = prediction==y
weight[~match] *= np.exp(beta)
weight /= weight.sum()
self.clfs.append(clf)
self.betas.append(beta)
def predict(self, X):
n = len(X)
pred = np.zeros((n,self.K))
for beta,clf in zip(self.betas, self.clfs):
yp = clf.predict(X).astype(int)
pred[range(n),yp] += beta
pred = np.argmax(pred,axis=1)
return pred
if __name__ == "__main__":
X, y = make_classification(
n_samples=10000, n_features=10,
n_informative=5, random_state=0, n_classes=2
)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0
)
from sklearn.ensemble import AdaBoostClassifier as ABC
clf = ABC(
DecisionTreeClassifier(max_depth=1),
n_estimators=20, algorithm="SAMME"
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("sklearn中SAMME的验证集得分为: ", accuracy_score(y_test, result))
clf = AdaboostClassifier(
numBoostingIters=20, maxTreeDepth=1
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("使用SAMME集成的验证集得分为: ", accuracy_score(y_test, result))
clf = DecisionTreeClassifier(max_depth=1)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("使用决策树桩的验证集得分为: ", accuracy_score(y_test, result))
2. Adaboost.R2
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
class AdaboostReg:
def __init__(self, base_estimator, n_estimator):
self.base_estimator=base_estimator
self.n_estimator=n_estimator
self.booster=[]
self.weight=[]
def fit(self, X, y, **kwargs):
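        # Adaboost.R2: normalized absolute error e, beta = err/(1-err), estimator weight alpha = log(1/beta)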
w=np.ones(X.shape[0])/X.shape[0]
for n in range(self.n_estimator):
cur_reg=self.base_estimator(**kwargs)
            cur_reg.fit(X, y, sample_weight=w)  # fit the base regressor on the current sample weights
y_pred=cur_reg.predict(X)
e=np.abs(y-y_pred)
e/=e.max()
err=(w*e).sum()
beta=err/(1-err)
alpha=np.log((1-err)/err+1e-6)
w*=np.power(beta,1-e)
w/=w.sum()
self.booster.append(cur_reg)
self.weight.append(alpha)
def predict(self,X):
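        # the prediction is the alpha-weighted median of the base regressors' outputs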
y_result=[]
for n in range(X.shape[0]):
all_pred=[]
            for m in range(self.n_estimator):
                cur_pred = self.booster[m].predict(X[n].reshape(1, -1))[0]
all_pred.append(cur_pred)
all_result={"alpha":self.weight,"y":all_pred}
df_all=pd.DataFrame(all_result)
df_all.sort_values("y",ascending=True,inplace=True)
alpha_sum=df_all["alpha"].sum()
half_alpha_sum=0
i=0
while half_alpha_sum<0.5*alpha_sum and i<len(df_all):
half_alpha_sum+=df_all["alpha"].iloc[i]
i=i+1
y_result.append(df_all["y"].iloc[i-1])
        return np.array(y_result)
if __name__ == "__main__":
X,y=make_regression(
n_samples=1000,n_features=8,n_informative=4,random_state=0
)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0
)
from sklearn.ensemble import AdaBoostRegressor as ABR
clf = ABR(
DecisionTreeRegressor(max_depth=1),
n_estimators=20
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("sklearn中的验证集得分为: ", r2_score(y_test, result))
clf = DecisionTreeRegressor(max_depth=1)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("使用决策树桩的验证集得分为: ", r2_score(y_test, result))
clf = AdaboostReg(
DecisionTreeRegressor(max_depth=1), 20
)
clf.fit(X_train, y_train)
result = clf.predict(X_test)
print("使用Adaboost.R2集成的验证集得分为: ", r2_score(y_test, result))