Boosting 族算法大多根据前一个学习器的训练效果调整样本分布(样本权重),再按新的样本分布训练下一个学习器;样本权重的调整方式不同,就得到 AdaBoost、Gradient Boosting、XGBoost 等不同算法。
推导过程可以参考以下博客,有不懂的地方也可以留言交流,这里只给出代码。基学习器基于 CART(单层决策树桩),AdaBoost 部分完全按照推导过程编写,对照公式可以看得很明白。
https://www.cnblogs.com/pinard/p/6133937.html
https://www.cnblogs.com/massquantity/p/9063033.html
class weakLearner():
    """Weighted decision stump (a single CART split) used as a boosting base learner.

    The stump chooses the (feature, value) pair that minimises the
    weight-scaled Gini impurity of the two resulting leaves. Each leaf then
    predicts the most frequent training label that fell into it.

    NOTE: the leaf label is the *unweighted* majority of the leaf (ties go to
    the label seen first), while the split itself is chosen with weights.

    Attributes:
        max_categories: a feature with fewer distinct training values than
            this is treated as discrete (equality split); otherwise it is
            treated as continuous (threshold split).
        n_thresholds: number of evenly spaced candidate thresholds tried for
            each continuous feature.
        type_feature: per-feature flags set by ``train`` (0 discrete,
            1 continuous).
        w: per-sample weights given to ``train``.
        tree: dict describing the fitted stump, or ``None`` before ``train``.
    """

    def __init__(self, max_categories=10, n_thresholds=50):
        self.max_categories = max_categories
        self.n_thresholds = n_thresholds
        self.type_feature = None
        self.w = None
        self.tree = None

    def __Gini(self, y, sample_weight):
        """Return the weighted Gini impurity ``1 - sum_k p_k ** 2``.

        :param y: labels of the samples in the node
        :param sample_weight: weights of the same samples
        :return: impurity; 0.0 when the node carries no weight
        """
        total = np.sum(sample_weight)
        if total <= 0:
            # An empty or zero-weight node contributes nothing to the split
            # score; returning early also avoids a 0/0 division.
            return 0.0
        class_weight = np.array([np.sum(sample_weight[y == k]) for k in np.unique(y)])
        return 1 - np.sum((class_weight / total) ** 2)

    def __typeFeature(self, X):
        """Flag every column of ``X`` as discrete (0) or continuous (1)."""
        self.type_feature = [
            0 if len(np.unique(X[:, f_idx])) < self.max_categories else 1
            for f_idx in range(X.shape[1])
        ]
        return self.type_feature

    def __binSplitData(self, X, y, f_idx, f_val):
        """Split the data in two on feature ``f_idx`` at value ``f_val``.

        Discrete features send ``== f_val`` to the left and ``!= f_val`` to
        the right; continuous features send ``<= f_val`` to the left and
        ``> f_val`` to the right.

        :param X: feature matrix (must be row-aligned with ``self.w``)
        :param y: labels
        :param f_idx: column index of the splitting feature
        :param f_val: category (discrete) or threshold (continuous)
        :return: X_left, X_right, y_left, y_right, weight_left, weight_right
        """
        att = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            left_mask = att == f_val
            right_mask = att != f_val
        else:
            # Two separate comparisons on purpose: a NaN entry fails both and
            # therefore lands in neither side.
            left_mask = att <= f_val
            right_mask = att > f_val
        return (X[left_mask], X[right_mask],
                y[left_mask], y[right_mask],
                self.w[left_mask], self.w[right_mask])

    def __candidates(self, X, f_idx):
        """Return the split values to try for feature ``f_idx``."""
        column = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            values = np.unique(column)
            # With exactly two categories both values describe the same
            # partition, so only the second one is evaluated.
            return values[1:] if len(values) == 2 else values
        # Thresholds span the observed range of the feature, so they stay
        # meaningful for features whose range is small (e.g. values in [0, 1]).
        return np.linspace(np.nanmin(column), np.nanmax(column), num=self.n_thresholds)

    def __bestSplit(self, X, y):
        """Find the split with the lowest weight-scaled Gini impurity.

        Splits that leave one side empty are used only when no split
        separates the data at all. Among equally good candidates the one
        evaluated last wins.

        :param X: feature matrix
        :param y: labels
        :return: best_f_idx, best_f_val
        :raises ValueError: if no candidate split can be evaluated
        """
        n_sample, n_feature = X.shape
        if n_sample == 0 or n_feature == 0:
            raise ValueError('cannot split an empty data set')
        best_score, best_split = np.inf, None
        fallback_score, fallback_split = np.inf, None
        for f_idx in range(n_feature):
            for f_val in self.__candidates(X, f_idx):
                _, _, y_left, y_right, w_left, w_right = self.__binSplitData(X, y, f_idx, f_val)
                score = (np.sum(w_left) * self.__Gini(y_left, w_left)
                         + np.sum(w_right) * self.__Gini(y_right, w_right))
                if len(y_left) and len(y_right):
                    if score <= best_score:
                        best_score, best_split = score, (f_idx, f_val)
                elif score <= fallback_score:
                    fallback_score, fallback_split = score, (f_idx, f_val)
        if best_split is None:
            best_split = fallback_split
        if best_split is None:
            raise ValueError('no valid split found; check X and sample_weight for NaN')
        return best_split

    def __CART(self, X, y):
        """Build the one-level tree.

        :param X: feature matrix
        :param y: labels
        :return: dict with keys 'cut_f', 'cut_val', 'left_tree', 'right_tree',
            'left_weight', 'right_weight'; the '*_tree' entries hold the
            training labels that fell into each leaf
        """
        best_f_idx, best_f_val = self.__bestSplit(X, y)
        _, _, y_left, y_right, weight_left, weight_right = self.__binSplitData(
            X, y, best_f_idx, best_f_val)
        return {
            'cut_f': best_f_idx,
            'cut_val': best_f_val,
            'left_tree': y_left,
            'right_tree': y_right,
            'left_weight': weight_left,
            'right_weight': weight_right,
        }

    def __leafLabels(self, tree):
        """Return (left_label, right_label), the majority label of each leaf."""
        left, right = tree['left_tree'], tree['right_tree']
        label_left = Counter(left).most_common(1)[0][0] if len(left) else None
        label_right = Counter(right).most_common(1)[0][0] if len(right) else None
        # An empty leaf has no majority; borrow the label of the other leaf
        # instead of failing.
        if label_left is None:
            label_left = label_right
        if label_right is None:
            label_right = label_left
        return label_left, label_right

    def train(self, X, y, sample_weight):
        """Fit the stump on weighted samples.

        :param X: 2-D feature matrix, one row per sample
        :param y: labels, one per row of ``X``
        :param sample_weight: non-negative weights, one per row of ``X``
        :return: the fitted tree dict (also stored in ``self.tree``)
        :raises ValueError: if the inputs have inconsistent shapes
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.w = np.asarray(sample_weight, dtype=float)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array')
        if not (len(X) == len(y) == len(self.w)):
            raise ValueError('X, y and sample_weight must have the same length')
        self.type_feature = self.__typeFeature(X)
        self.tree = self.__CART(X, y)
        return self.tree

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        :param X_test: 2-D feature matrix
        :return: 1-D array of predicted labels
        :raises RuntimeError: if called before ``train``
        """
        if self.tree is None:
            raise RuntimeError('train() must be called before predict()')
        X_test = np.asarray(X_test)
        if X_test.size == 0:
            return np.array([])
        cut_f_idx, cut_val = self.tree['cut_f'], self.tree['cut_val']
        # The leaf labels are the same for every row, so compute them once.
        label_left, label_right = self.__leafLabels(self.tree)
        column = X_test[:, cut_f_idx]
        if self.type_feature[cut_f_idx] == 0:
            goes_left = column == cut_val
        else:
            goes_left = column <= cut_val
        return np.where(goes_left, label_left, label_right)
class Adaboost():
    """Discrete AdaBoost for binary labels encoded as -1 / +1.

    Each round fits a fresh weak learner on the current sample weights,
    computes its weighted error and vote ``alpha``, then re-weights the
    samples so that misclassified ones count more in the next round.

    The weak learner class must be constructible without arguments and offer
    ``train(X, y, sample_weight)`` and ``predict(X)``.

    Attributes:
        estimators: number of boosting rounds.
        w: current sample weights (``None`` before ``fit``).
        alphas: vote weight of every fitted weak learner.
        stumps: the fitted weak learners, in training order.
        weakLearner: the weak learner class instantiated each round.
    """

    def __init__(self,estimators: int = 10, classifier = weakLearner):
        self.estimators = estimators
        self.w = None
        self.alphas = []
        self.stumps = []
        self.weakLearner = classifier

    def fit(self, X, y):
        """Train the ensemble.

        :param X: 2-D feature matrix, one row per sample
        :param y: labels in {-1, +1}, one per row of ``X``
        :return: self
        :raises ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_sample = len(X)
        if n_sample == 0:
            raise ValueError('cannot fit on an empty data set')
        if len(y) != n_sample:
            raise ValueError('X and y must have the same length')
        eps = 1e-6
        self.w = np.full(n_sample, 1 / n_sample)
        # Start from a clean slate so that calling fit() again does not stack
        # new learners on top of the old ones.
        self.alphas = []
        self.stumps = []
        for _ in range(self.estimators):
            G_m = self.weakLearner()
            G_m.train(X, y, self.w)
            # Predictions are taken in the original sample order so they line
            # up element-wise with both y and self.w.
            G = np.asarray(G_m.predict(X))
            error = np.sum(self.w * (G != y))
            # Keep the error strictly inside (0, 1) so the logarithm is finite.
            error = np.clip(error, eps, 1 - eps)
            alpha = 1/2 * np.log((1-error)/error)
            self.w = self.w * np.exp(- y * G * alpha)
            # Dividing by the sum (the normaliser Z_m) keeps the weights a
            # probability distribution for the next round.
            self.w = self.w / np.sum(self.w)
            self.stumps.append(G_m)
            self.alphas.append(alpha)
        return self

    def predict(self, X_test):
        """Return the sign of the alpha-weighted vote of all weak learners.

        :param X_test: 2-D feature matrix
        :return: array with one value per row of ``X_test``: -1 or +1, or 0
            when the weighted vote is exactly tied
        :raises RuntimeError: if called before ``fit``
        """
        if self.w is None:
            raise RuntimeError('fit() must be called before predict()')
        X_test = np.asarray(X_test)
        y_ = np.zeros(len(X_test))
        for alpha, stump in zip(self.alphas, self.stumps):
            y_ = y_ + alpha * stump.predict(X_test)
        return np.sign(y_)
分别用鸢尾花数据和乳腺癌数据测试了算法;鸢尾花数据过于简单,预测精度太高,区分度不够,所以这里使用乳腺癌数据。
# numpy and Counter are used by the classes in this file, so they are imported
# at module level: the classes then also work when this file is imported
# rather than run as a script.
from collections import Counter

import numpy as np


def main():
    """Train AdaBoost on the scikit-learn breast-cancer data and print test accuracy."""
    # scikit-learn is needed only for this demo, so it is imported locally.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    # datasets.load_iris() was also tried, but it is too easy to be informative.
    data = datasets.load_breast_cancer()
    # Re-encode the 0/1 targets as -1/+1, the label convention the
    # exponential weight update and the sign() vote rely on.
    X = data.data
    Y = np.where(data.target > 0, 1, -1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = Adaboost()
    tree_clf.fit(X_train, Y_train)
    Y_pred = tree_clf.predict(X_test)
    print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))


if __name__ == '__main__':
    main()
完整代码也可以参考 GitHub:https://github.com/tingting417/Adaboost-based-on-CART/blob/master/Adaboost.ipynb