AdaBoost 原理及代码

7 篇文章 0 订阅
4 篇文章 0 订阅

分类代码

class AdaBoost:
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Each weak classifier compares one feature column against a threshold and
    predicts +1 / -1. Labels are expected to be encoded as -1 and +1.

    Attributes set by ``fit``:
        clf_sets: list of ``(axis, threshold, direction)`` tuples, one per
            weak classifier. ``direction`` is ``'positive'`` (predict +1 when
            the feature is above the threshold) or ``'negetive'`` (predict -1
            when the feature is above the threshold; spelling kept for
            backward compatibility).
        alpha: list of weak-classifier coefficients, parallel to ``clf_sets``.
        weights: NumPy array with the current per-sample weights.
    """

    # Lower bound applied to the weighted error before computing alpha, so a
    # perfect stump (error == 0) yields a large finite coefficient instead of
    # a division by zero.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        """Store the hyper-parameters.

        Args:
            n_estimators: maximum number of weak classifiers to train.
            learning_rate: step size used when scanning candidate thresholds
                between the minimum and maximum of a feature column.
        """
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Reset the training state for a new data set.

        Args:
            datasets: 2-D array of shape (n_samples, n_features).
            labels: sequence of n_samples labels in {-1, +1}.
        """
        self.X = datasets
        self.Y = np.asarray(labels)
        self.M, self.N = datasets.shape

        # Trained weak classifiers: (axis, threshold, direction).
        self.clf_sets = []

        # Every sample starts with the same weight.
        self.weights = np.full(self.M, 1.0 / self.M)

        # Coefficient alpha of each weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump for one feature column.

        Candidate thresholds are ``min + learning_rate * i``; candidates that
        coincide with an existing feature value are skipped (as in the
        original implementation).

        Args:
            features: 1-D sequence with one feature value per sample.
            labels: 1-D sequence of labels in {-1, +1}.
            weights: 1-D sequence of current sample weights.

        Returns:
            ``(best_v, direct, error, compare_array)``: the threshold, the
            direction of the best stump, its weighted error and its
            predictions on ``features``. When no candidate threshold exists
            the result is ``(0.0, None, inf, None)``.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        features_min = features.min()
        features_max = features.max()
        n_step = (features_max - features_min
                  + self.learning_rate) // self.learning_rate

        error = float('inf')
        best_v = 0.0
        direct, compare_array = None, None
        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            if np.any(features == v):
                continue

            # 'positive' predicts +1 above the threshold, 'negetive' is the
            # mirrored rule.
            positive = np.where(features > v, 1, -1)
            negative = -positive
            error_positive = weights[positive != labels].sum()
            error_negative = weights[negative != labels].sum()

            if error_positive < error_negative:
                candidate_error = error_positive
                candidate_array = positive
                candidate_direct = 'positive'
            else:
                candidate_error = error_negative
                candidate_array = negative
                candidate_direct = 'negetive'

            # The direction must be recorded together with the best
            # threshold; otherwise the direction of the last candidate
            # scanned would be returned.
            if candidate_error < error:
                error = candidate_error
                compare_array = candidate_array
                best_v = v
                direct = candidate_direct
        return best_v, direct, error, compare_array

    def _alpha(self, error):
        """Return the coefficient 0.5 * ln((1 - error) / error).

        ``error`` is clipped to ``[_EPS, 1 - _EPS]`` so that a perfect (or
        perfectly wrong) stump produces a finite value.
        """
        error = min(max(error, self._EPS), 1 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor of the weight update.

        Args:
            weights: current sample weights.
            a: coefficient alpha of the new weak classifier.
            clf: predictions of the new weak classifier on the training set.
        """
        weights = np.asarray(weights, dtype=float)
        return np.sum(weights * np.exp(-a * self.Y * np.asarray(clf)))

    def _w(self, a, clf, Z):
        """Update ``self.weights`` after adding a weak classifier.

        Misclassified samples gain weight, correctly classified ones lose
        weight; dividing by ``Z`` keeps the weights summing to one.
        """
        current = np.asarray(self.weights, dtype=float)
        self.weights = current * np.exp(-a * self.Y * np.asarray(clf)) / Z

    def G(self, x, v, direct):
        """Evaluate one stump on a single feature value ``x``."""
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Train up to ``n_estimators`` weak classifiers on ``(X, y)``.

        Training stops early once a stump classifies every training sample
        correctly, because further rounds would select the same stump again.

        Raises:
            ValueError: if no feature column offers a candidate threshold
                for the configured ``learning_rate``.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            best_clf_error = float('inf')
            best_v = final_direct = clf_result = axis = None

            # Pick the feature column whose best stump has the lowest
            # weighted error.
            for j in range(self.N):
                feature = self.X[:, j]
                v, direct, error, compare_array = self._G(
                    feature, self.Y, self.weights)

                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j
                if best_clf_error == 0:
                    break

            if clf_result is None:
                raise ValueError(
                    'no candidate threshold found; '
                    'try a smaller learning_rate')

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            if best_clf_error == 0:
                # Perfect stump: the sample weights would not change.
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Predict the label (+1 or -1) of a single sample.

        Args:
            feature: 1-D feature vector of one sample.
        """
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        # sign of the weighted vote; a zero vote maps to -1
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly."""
        right_count = sum(
            1 for feature, label in zip(X_test, y_test)
            if self.predict(feature) == label
        )
        return right_count / len(X_test)

运行例子:
获取数据的方法

def create_data():
    """Build a two-feature, two-class data set from the iris data.

    Takes the first 100 rows, keeps the first two feature columns and the
    label column, and rewrites label 0 as -1.

    Returns:
        A ``(features, labels)`` pair: the first two columns and the last
        column of the selected array.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    # First 100 rows; first column, second column and the label column.
    samples = np.array(frame.iloc[:100, [0, 1, -1]])

    # Rewrite every label equal to 0 as -1 in a single masked assignment.
    zero_label_rows = samples[:, -1] == 0
    samples[zero_label_rows, -1] = -1

    features, targets = samples[:, :2], samples[:, -1]
    return features, targets

运行

# Load the data and hold out a third of it for evaluation.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33)

# Train 10 weak classifiers with a threshold step of 0.2.
clf = AdaBoost(n_estimators=10, learning_rate=0.2)
clf.fit(X_train, y_train)

# Accuracy on the held-out samples.
clf.score(X_test, y_test)

在这里插入图片描述
当弱分类器数目为100时,重复100次随机划分与训练,并统计平均准确率:

# Train and evaluate on repeated random train/test splits, then report the
# mean test accuracy.
n_trials = 100

# The data set is identical for every trial, so load it once.
X, y = create_data()

result = []
for _ in range(n_trials):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    clf = AdaBoost(n_estimators=100, learning_rate=0.2)
    clf.fit(X_train, y_train)
    result.append(clf.score(X_test, y_test))

# Scores are fractions in [0, 1]; compute the mean explicitly and scale it to
# a percentage so the output stays correct for any number of trials.
print('average score:{:.3f}%'.format(100 * sum(result) / len(result)))

在这里插入图片描述
在这里插入图片描述
运行例子
在这里插入图片描述

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值