集成学习之Bagging
① Bagging又叫自助聚集,是一种根据均匀概率分布从数据中重复抽样(有放回)的技术。
② 每个抽样生成的自助样本集上,训练一个基分类器;对训练过的分类器进行投票,将测试样本指派到得票最高的类中。
③ 每个自助样本集都和原数据一样大
④ 有放回抽样,一些样本可能在同一训练集中出现多次,一些可能被忽略
过程:
输入:训练集 D={(x1, y1), (x2, y2), (x3, y3), …(xm, ym)};
基学习算法(网络模型 net);
训练轮数T.
过程:
1:for t = 1, 2, 3,…T do
2: ht = net(D,Dbs)
3:end for
输出:H(x) = arg max_y Σ_{t=1}^{T} I(h_t(x) = y),即在 T 个基学习器的预测中取得票最多的类别。
对于分类,输出使用简单投票法,若两个类收到同样票数,就随机选择一个;
对于回归,使用简单平均法,对于不同模型结果进行平均。
Bagging 主要用来减小 variance(方差):对多个在不同自助样本上训练的基学习器取平均(或投票),可以抑制单个模型的不稳定性,但对 bias(偏差)的改善有限。
代码示例:
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 12 15:47:13 2020
@author: TBW
"""
import torch
import torch.nn as nn
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
import RESauto_network_0806
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import pickle
import torch.utils.data as Data
from math import sqrt
# import math
# import multiprocessing
# from multiprocessing import Manager
# import os
# import shutil
# from baggingrnet.model.resnet import resAutoencoder
#from baggingrnet.util.pmetrics import r2K,r2KAuto,rmse,r2np,rmse2np
# from baggingrnet.util.pmetrics import r2K,r2KAuto,r2np,rmse2np
import matplotlib.pyplot as plt
class Bagging:
    """Bagging-style ensemble driver for residual autoencoder regressors
    (``RESauto_network_0806.Resautoblock``).

    Typical call order:
        bag = Bagging(out_dir)
        bag.getInputSample(sample)      # fits scalers, builds self.Xn / self.yn
        bag.addtask("m1"); bag.addtask("m2"); ...
        bag.subTrain(perm, 0, len(bag.tasks))

    NOTE(review): each task draws its training subset via
    ``train_test_split`` (sampling WITHOUT replacement, 90% of the data),
    not a true bootstrap sample with replacement as classic bagging
    prescribes -- confirm whether this is intentional.
    """
    def __init__(self, baggingpath):
        """Initialize an empty task registry.

        baggingpath : str or None
            Output directory for predictions, weights and result files;
            falls back to '/tmp' when None.
        """
        self.tasks={}
        self.baggingpath=baggingpath if baggingpath is not None else '/tmp'
    def addtask(self,name,noutput=1,sampling_fea=False,nepoch=20,nodes=None,
                minibatch=1280,isresidual=True,islog=True):
        """Register one base-learner task under a unique name.

        Requires ``getInputSample`` to have been called first (reads
        ``self.Xn``).  Duplicate names are rejected with a message.

        Parameters
        ----------
        name : str          unique task key.
        noutput : int       number of network outputs (stored; not read here).
        sampling_fea : bool feature-subsampling flag (currently unused).
        nepoch : int        training epochs for this task.
        nodes : list[int] or None
                            hidden-layer widths; defaults to
                            [128, 96, 64, 32, 16] when None.
        minibatch : int     DataLoader batch size.
        isresidual, islog : bool
                            stored flags (not read by ``subTrain``).
        """
        # Avoid the shared-mutable-default pitfall: build the default node
        # list per call instead of once at function-definition time, so one
        # task mutating its 'nodes' entry cannot affect later tasks.
        if nodes is None:
            nodes = [128, 96, 64, 32, 16]
        if name in self.tasks.keys():
            print("Task:"+name + " already in tasks! please change the model name!")
            return
        nsz=self.Xn.shape[0]
        nfea=self.Xn.shape[1]
        # Per-task random 90/10 split.  NOTE(review): this samples without
        # replacement, unlike the bootstrap sampling described for bagging.
        trainIndex, testIndex = train_test_split(range(nsz), stratify=None,test_size=0.1)
        # All features are used; feature subsampling is not applied even
        # when sampling_fea is True.
        feaIndex=np.array([i for i in range(nfea)])
        aTask={'name':name,'nepoch':nepoch,'noutput':noutput,'nodes':nodes,'trainIndex':trainIndex,
               'isresidual':isresidual,'testIndex':testIndex,'feaIndex':feaIndex,'minibatch':minibatch,
               'islog':islog}
        self.tasks[name]=aTask
    def getInputSample(self, sample):
        """Build the standardized design matrix and target from raw data.

        sample : 2-D array.  Column layout is assumed to be
        [lat, lon, features 2..7, (column 8 skipped), 9..12, target at 13]
        -- TODO confirm this layout against the caller.

        Adds lat^2, lon^2 and lat*lon interaction columns (15 features
        total, matching the network input size used in ``subTrain``),
        fits StandardScalers, and stores:
          self.gindex  original row indices,
          self.scX / self.scy  fitted scalers,
          self.Xn / self.yn    standardized X and y.
        """
        self.gindex=np.array([i for i in range(sample.shape[0])])
        data = sample[:, [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]]
        lat2 = sample[:, 0] ** 2
        lon2 = sample[:, 1] ** 2
        latlon = sample[:, 0] * sample[:, 1]
        x = np.column_stack((data, lat2, lon2, latlon))
        y = sample[:, 13]
        y = y.reshape((y.shape[0], 1))
        self.scX = preprocessing.StandardScaler().fit(x)
        self.scy = preprocessing.StandardScaler().fit(y)
        self.Xn = self.scX.transform(x)
        self.yn = self.scy.transform(y)
    def subTrain(self, perm, istart, iend):
        """Train the tasks klist[istart:iend] (task names sorted).

        perm : list
            Accumulator; one single-row DataFrame of test metrics is
            appended per task, then all entries are concatenated into
            <baggingpath>/allresult.csv.

        Side effects: writes per-task prediction CSVs, a cumulative
        model-weight pickle, allresult.csv and tasks.pkl under
        ``self.baggingpath``.  Requires CUDA (models/tensors are moved
        with ``.cuda()``).
        """
        klist = list(self.tasks.keys())
        klist.sort()
        model_weights = []
        for i in range(istart, iend):
            key=klist[i]
            print(i, key)
            aTask=self.tasks[key]
            name=aTask['name']
            nodes=aTask['nodes']
            trainIndex = aTask['trainIndex']
            testIndex = aTask['testIndex']
            batch=aTask['minibatch']
            nepoch=aTask['nepoch']
            # 15 inputs = 12 raw columns + lat^2 + lon^2 + lat*lon
            # (see getInputSample); 0.02 is presumably a dropout/leak rate
            # of the project network -- confirm in RESauto_network_0806.
            net = RESauto_network_0806.Resautoblock(15, nodes, 0.02).cuda()
            opt_adam = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)
            # LR decays x0.1 at epochs 30/50/70/90.
            scheduler = torch.optim.lr_scheduler.MultiStepLR(opt_adam, milestones=[30, 50, 70, 90], gamma=0.1, last_epoch=-1)
            loss_func = nn.MSELoss()
            x_train = self.Xn[trainIndex,:]
            y_train = self.yn[trainIndex,:]
            x_train, y_train = torch.tensor(x_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32)
            traindataset = Data.TensorDataset(x_train, y_train)
            # NOTE(review): shuffle=False means batches are visited in the
            # same fixed order every epoch -- confirm this is intentional.
            trainloader = Data.DataLoader(traindataset, batch_size=batch, shuffle=False)
            r2_2 = []
            rmse_2 = []
            loss_toal = []
            for epoch in range(nepoch):
                r2_1 = []
                rmse_1 = []
                for step, (b_x, b_y) in enumerate(trainloader):
                    b_x, b_y = Variable(b_x, requires_grad=True).cuda(), Variable(b_y, requires_grad=True).cuda()
                    pre = net(b_x)
                    loss = loss_func(pre, b_y)
                    opt_adam.zero_grad()
                    loss.backward()
                    loss_toal.append(loss.item())
                    opt_adam.step()
                    # Per-batch metrics computed on the training batch itself.
                    r2_1.append(r2_score(b_y.detach().cpu().numpy(), pre.detach().cpu().numpy()))
                    rmse_1.append(sqrt(mean_squared_error(b_y.detach().cpu().numpy(), pre.detach().cpu().numpy())))
                scheduler.step()
                r2_2.append(np.mean(r2_1))
                rmse_2.append(np.mean(rmse_1))
            # Independent test on this task's held-out 10% split; predictions
            # are inverse-transformed back to the original target scale.
            y_test_pred = net(torch.tensor(self.Xn[testIndex,:], dtype=torch.float32).cuda())
            y_test_pred = y_test_pred.detach().cpu().numpy()
            obs = self.scy.inverse_transform(self.yn[testIndex,:])
            pre = self.scy.inverse_transform(y_test_pred)
            testDf=pd.DataFrame({'obs':obs.reshape(obs.shape[0]),'pre':pre.reshape(pre.shape[0])},
                                index=self.gindex[testIndex])
            # NOTE(review): backslash separators below are Windows-only while
            # allresult.csv/tasks.pkl use '/' -- unify if this must run on
            # Linux (os.path.join / pathlib).
            tPath = self.baggingpath + '\\dif_model_pre\\preds_' + name + '.csv'
            testDf.to_csv(tPath,index_label="index")
            r2 = r2_score(obs, pre)
            rmse = sqrt(mean_squared_error(obs, pre))
            ares=pd.DataFrame({'name':name,'testr2':r2,'testrmse':rmse},index=[0])
            perm.append(ares)
            print("indepdendent test:r2-", r2, "rmse:", rmse)
            model_dict={'name':name,'model_weight':net.state_dict(),'loss':loss_toal}
            model_weights.append(model_dict)
            # Re-save the cumulative weight list after every task so a crash
            # loses at most the task currently in progress.
            torch.save(model_weights, self.baggingpath + "\\model_pkl\\model_beijing_pm25_1km.pkl")
        allres=pd.concat(perm, axis=0)
        tfl=self.baggingpath+'/allresult.csv'
        allres.to_csv(tfl,index_label='index')
        tfl = self.baggingpath + "/tasks.pkl"
        with open(tfl, "wb") as handle:
            pickle.dump(self.tasks, handle, pickle.HIGHEST_PROTOCOL)