对二分类问题,4维特征数据的bagging(logistic回归的bagging)

这次的任务是在用logistic回归单一分类器的基础上,用bagging进行训练,看看有没有提升。

首先对bagging做一些介绍:

bagging主要是对样本进行重复放回的采样,对每一次重采样都训练得到一个模型,最后取平均参数(或者进行投票)产生最后的分类器。其实从重采样就能感受到bagging实际上是在为减少variance做努力。每一次采样训练出的子模型既不可能完全互相独立,也不可能完全相同,因此最后集成模型的情形介于两者之间,其variance介于 $\frac{Var(X_i)}{n}$(完全独立时)与 $Var(X_i)$(完全相同时)之间,显然降低了variance。对于bias的降低则主要是boosting的功效,下一次博客再写。

下面给出代码

from numpy import *
import numpy as np
import math
import random
import array
import matplotlib.pyplot as plt
# Helper functions to load and prepare the data

# Boostrap to select subset
# Boostrap to select subset
def Boostrap(data, k):
    """Draw a bootstrap resample: k rows chosen from `data` uniformly at
    random, WITH replacement (the resampling step of bagging).

    Args:
        data: non-empty sequence of rows to resample from.
        k: number of rows to draw.

    Returns:
        list of k rows (references into `data`, not copies).
    """
    # random.choices samples with replacement in one call, replacing the
    # original hand-rolled randint-index loop; raises IndexError on empty
    # data, just as the original did.
    return random.choices(data, k=k)



def loadDataSet(fileName):
    """Load whitespace-separated samples where the last column is the label.

    Each non-blank line is parsed as: feature values (floats), then an
    integer class label in the last column. The leading columns become one
    feature row.

    Args:
        fileName: path to the text file.

    Returns:
        (xArr, yArr): list of feature rows (lists of floats) and the
        parallel list of int labels.
    """
    xArr = []
    yArr = []
    # `with` guarantees the handle is closed (the original leaked it), and
    # iterating the handle streams lines instead of readlines()'ing them all.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split()  # split on any run of whitespace/tabs
            if not curLine:
                continue  # skip blank lines: a trailing newline crashed the original on curLine[-1]
            xArr.append([float(v) for v in curLine[:-1]])
            yArr.append(int(curLine[-1]))
    return xArr, yArr

# loadDataSet2
# loadDataSet2: like loadDataSet but keeps ALL columns (features AND label)
# in one row of floats — used to feed whole rows into Boostrap.
def loadDataSet2(fileName):
    """Load every whitespace-separated column of each line as a float.

    Args:
        fileName: path to the text file.

    Returns:
        list of rows, each a list of floats covering all columns
        (the label column included, as a float).
    """
    xArr = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split()  # split on any run of whitespace/tabs
            if not curLine:
                continue  # tolerate blank lines instead of appending empty rows
            xArr.append([float(v) for v in curLine])
    return xArr
# def loadDataSet(fileName):
#     data_x=[];data_y=[]
#     # fr=open('machinelearninginaction/Ch05/testSet.txt')
#     for line in open(fileName).readlines():
#         lineArr=line.strip().split()
#         data_x.append([1.0,float(lineArr[0]),float(lineArr[1])])#特征数据集,添加1是构造常数项x0
#         data_y.append(int(lineArr[-1]))#分类数据集
#     return data_x,data_y

def sigmoid(X):
    """Logistic function 1 / (1 + e^(-X)); elementwise on numpy arrays/matrices."""
    neg_exp = exp(-X)
    return 1.0 / (1.0 + neg_exp)

def gradAscent(data_x, data_y):
    """Fit logistic-regression weights by full-batch gradient ascent.

    Args:
        data_x: (m, n) feature rows (nested list or numpy matrix).
        data_y: length-m sequence of 0/1 labels.

    Returns:
        numpy matrix of shape (n, 1) holding the learned weights.
    """
    design = mat(data_x)                 # (m, n) design matrix
    labels = mat(data_y).transpose()     # (m, 1) column of labels
    n_features = shape(design)[1]
    weights = ones((n_features, 1))      # all-ones initialization, shape (n, 1)
    step = 0.001                         # learning-rate / step size
    decay = math.exp(-8)                 # tiny per-iteration weight decay (regularization)
    for _ in range(700):                 # fixed iteration budget
        predicted = sigmoid(design * weights)        # (m, 1) probabilities
        residual = labels - predicted                # y - h, the ascent direction
        # Shrink the old weights slightly, then step along the gradient.
        weights = (1 - decay) * weights + step * design.transpose() * residual
    return weights

def Judgefunction(test_y):
    """Threshold an (m, 1) column of probabilities at 0.5 into 0/1 labels.

    Args:
        test_y: numpy matrix of shape (m, 1) with predicted probabilities.

    Returns:
        list of ints: 1 where the probability is >= 0.5, else 0.
    """
    row_count = test_y.shape[0]
    # Same cut as the original: values below 0.5 map to class 0.
    return [0 if test_y[k, 0] < 0.5 else 1 for k in range(row_count)]

if __name__== "__main__":
    # --- Baseline: one logistic classifier trained on the full training set ---
    # NOTE(review): paths are hard-coded to the author's machine — parameterize
    # (argparse / sys.argv) before reusing.
    # Single classifier output
    data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    # print(mat(data_x).shape[0])
    Weights = gradAscent(data_x,data_y)
    # print(data_x)
    # print(data_y)
    result=[]
    print('The single Weights is :')
    print(Weights)

    # test model
    test_x, real_y= loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
    # list * matrix works because numpy.matrix implements __rmul__,
    # converting test_x to a matrix implicitly.
    test_y = sigmoid(test_x*Weights)
    # Spot-check the first two predicted probabilities (debug leftovers,
    # never printed).
    real_test_y=test_y[0,0]
    real_test_y2=test_y[1,0]
    result=Judgefunction(test_y)

    # Result
    # print('test_y is:',test_y)



    # --- Bagging: train `times` classifiers on bootstrap resamples, vote ---

    # bagging classifier

    # loadDataSet2 keeps the label column inside each row so Boostrap can
    # resample whole (x, y) rows together.
    data = loadDataSet2('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    times = input('Please input the bagging times:')
    result_mat=[]  # one row of 0/1 predictions per bagging round
    for i in range(int(times)):
        # Bootstrap resample of 400 rows (with replacement).
        Sample = Boostrap(data,400)
        # NOTE(review): prints the total `times`, not the round index `i`.
        print('In times',times,'Sample is:')
        print(Sample)
        #extract the x and y in this loop
        # assumes exactly 4 feature columns + 1 label column — TODO confirm
        sample_x = np.mat(Sample)[:,0:4]
        sample_y0 = np.mat(Sample)[:,-1]
        # print(sample_x)
        # Flatten the (m, 1) label column back into a plain list for gradAscent.
        sample_y1=sample_y0.transpose()
        sample_y2 = sample_y1.tolist()
        sample_y = sample_y2[0]
        # print(sample_y)
        weights = gradAscent(sample_x,sample_y)
        print('The weight is:\n',weights)

        #using this weights to predict the test data and record
        # NOTE(review): the test file is re-read every round; it could be
        # loaded once before the loop.
        sample_test_x,sample_real_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
        sample_test_y = sigmoid(sample_test_x*weights)
        test_result = Judgefunction(sample_test_y)
        # real_result = Judgefunction(mat(sample_real_y))
        # print(sample_test_y)
        print('This time result is:\n',test_result)
        # print('while the real result is:\n',real_y)
        result_mat.append(test_result)

    print('The result matrix is :\n',result_mat)
    # Majority vote per test example: column i of result_mat holds every
    # round's prediction for example i; take the most frequent value.
    final_result = []
    # print(mat(result_mat).shape[1])
    # real_list1 = mat(result_mat)[:,1].transpose().tolist()
    # print(real_list1[0])
    # print(max(real_list1[0],key=real_list1[0].count))
    # real_list = real_list1[0]
    for i in range (int(mat(result_mat).shape[1])):
        real_list1 = mat(result_mat)[:,i].transpose().tolist()
        real_list = real_list1[0]
        # max(..., key=count) is the mode of the column, i.e. the vote winner.
        final_result.append(max(real_list1[0],key=real_list1[0].count))

    print('The single logstic regression test_result is:\n ', result)
    print('After voting, the result is:\n',final_result)
    # print('real_y is: ', real_y)
    print('While the real result is :\n',real_y)

    #error rate calculation
    # frag1 counts single-classifier mistakes; frag2 counts bagging mistakes.
    frag1=0
    frag2=0
    for i in range(len(real_y)):
        if result[i]!=real_y[i]:
            frag1 +=1
        if final_result[i]!=real_y[i]:
            frag2 +=1
    single_error_rate = frag1/len(real_y)
    final_error_rate = frag2/len(real_y)
    print('Single logistic error rate is : \n',single_error_rate)
    print('After bagging, the error rate is :\n',final_error_rate)




这里使用的是投票法的bagging,分别用10次,50次和100次迭代去观察:

发现三次bagging之后,错误率并没有减少。但是当我改变了SGD里的超参数(步长)发现:

其实应该很好解释这种情况,也就是当SGD并没有找到局部最优时,bagging能在此基础上帮忙减少错误率。可能在算法调优时能通过bagging的效果来发现原算法的优劣?

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值