2021高通人工智能创新大赛垃圾分类赛题总结
一、关于最终比赛结果
非常遗憾这次比赛在垃圾分类赛题上只取得了第十名的成绩,只有前五名能够晋级决赛,所以目前来看也是暂时告一段落了。准确率上差距属实有些大,第一名86%左右而我们只取得了62%的准确率有些拉跨;下面会详细分析下问题所在。
二、最后时间段的调参与失误
- 主要任务关注点:数据增强针对类不均衡、算法模型调整(efficientnetb0-b1/ resnet50)、主要超参调整 batchsize、epoch、优化器的选择optimizer+学习率衰减策略、损失函数;
1.类不均衡的调整方案:
这里采用了暴力过采样进行处理,具体实现代码如下,通过对少样本类别进行重复采样最后实现各个类别的样本与样本最多数量类别相当:
import os
import random
import ast
import numpy as np
import glob
from PIL import Image
from torch.utils.data import Dataset

# Fix the RNG seed so the oversampling below is reproducible across runs.
random.seed(1)

# Load the class-name -> label-id mapping (e.g. {"bottle": 0, ...}).
# ast.literal_eval only accepts Python literals, unlike eval() which would
# execute arbitrary code found in the file.
# NOTE(review): assumes class.txt contains a dict literal — confirm its format.
with open('/project/train/src_repo/class.txt', 'r') as f:
    trash_name = ast.literal_eval(f.read())
class TrashDataset(Dataset):
    """Garbage-classification dataset with naive oversampling.

    Minority classes are oversampled (their file names repeated) so every
    class contributes roughly as many samples as the largest class.
    """

    def __init__(self, data_dir, transform, classes):
        # data_info holds (image_path, label) pairs; DataLoader reads samples by index.
        self.data_info = self.get_img_info(classes, data_dir)
        self.transform = transform
        self.labelname = trash_name  # module-level class-name -> id mapping
        self.classes = classes
        self.data_dir = data_dir

    def __getitem__(self, index):
        path_img, label = self.data_info[index]
        img = Image.open(path_img).convert('RGB')  # uint8 RGB, 0~255
        if self.transform is not None:
            img = self.transform(img)  # augmentation / ToTensor happen here
        return img, label

    def __len__(self):
        return len(self.data_info)

    @staticmethod
    def get_img_info(classes, data_dir):
        """Scan data_dir/<class>/*.jpg and build an oversampled (path, label) list."""
        # Count .jpg files per class to find the largest class size.
        counts = [len(glob.glob('{}/{}/*.jpg'.format(data_dir, c))) for c in classes]
        max_num = int(np.max(np.array(counts)))
        data_info = list()
        for root, dirs, _ in os.walk(data_dir):
            # Iterate per-class subdirectories.
            for sub_dir in dirs:
                img_names = os.listdir(os.path.join(root, sub_dir))
                img_names = list(filter(lambda x: x.endswith('.jpg'), img_names))
                if not img_names:
                    # BUG FIX: an empty class directory used to raise
                    # ZeroDivisionError in the oversampling ratio below.
                    continue
                if len(img_names) < max_num:
                    # Repeat each file name floor(max_num/n) times so this
                    # class approaches the size of the largest class.
                    n = max_num / len(img_names)
                    img_names = [val for val in img_names for _ in range(int(n))]
                for img_name in img_names:
                    path_img = os.path.join(root, sub_dir, img_name)
                    label = trash_name[sub_dir]
                    data_info.append((path_img, int(label)))
        if len(data_info) == 0:
            raise Exception('\n data dir: {} is empty!'.format(data_dir))
        return data_info
主要是在原来的dataloader上面加上了对少数样本的判断和标签倍增
2.数据增强
import random
import math
import torch
from PIL import Image, ImageOps, ImageFilter
from torchvision import transforms
class Resize(object):
    """Pad an image to the target aspect ratio (black borders), then resize."""

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img):
        target_ratio = self.size[0] / self.size[1]
        w, h = img.size
        if w / h < target_ratio:
            # Too narrow: widen the canvas; crop() with coordinates beyond
            # the image bounds pads the new area with black.
            new_w = int(h * target_ratio)
            pad = (new_w - w) // 2
            img = img.crop((-pad, 0, w + pad, h))
        else:
            # Too short: grow the canvas vertically the same way.
            new_h = int(w / target_ratio)
            pad = (new_h - h) // 2
            img = img.crop((0, -pad, w, h + pad))
        return img.resize(self.size, self.interpolation)
class RandomRotate(object):
    """With probability p, rotate the image by a uniform angle in [-degree, degree]."""

    def __init__(self, degree, p=0.5):
        self.degree = degree
        self.p = p

    def __call__(self, img):
        if random.random() >= self.p:
            return img
        angle = random.uniform(-1 * self.degree, self.degree)
        return img.rotate(angle, Image.BILINEAR)
class RandomGaussianBlur(object):
    """With probability p, apply a Gaussian blur with a random radius in [0, 1)."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img):
        if random.random() >= self.p:
            return img
        return img.filter(ImageFilter.GaussianBlur(radius=random.random()))
def get_train_transform(mean, std, size):
    """Training pipeline: fixed 224x224 resize, random crop to `size`, flip, normalize."""
    return transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.RandomCrop(size),
        transforms.RandomHorizontalFlip(),
        # RandomRotate(15, 0.3),
        # RandomGaussianBlur(),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
def get_test_transform(mean, std, size):
    """Evaluation pipeline: deterministic resize + center crop + normalize."""
    steps = [
        transforms.Resize([224, 224]),
        transforms.CenterCrop(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    return transforms.Compose(steps)
def get_transforms(input_size, test_size, backbone=None):
    """Return {'train': ..., 'val': ...} transforms appropriate for `backbone`."""
    if backbone is not None and backbone in ['pnasnet5large', 'nasnetamobile']:
        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
    else:
        # ImageNet statistics (default for torchvision-pretrained backbones).
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    return {
        'train': get_train_transform(mean, std, input_size),
        'val': get_test_transform(mean, std, test_size),
    }
基于分析对图像进行简单的数据增强操作,包括图像的等比填充缩放裁剪、水平翻转、高斯噪声等。其中第一项(等比填充缩放)对结果影响较大。这里是先将原始图像以最大边为基准做等比缩放,不足的地方填充0,缩放后的边长是最终输入边长的256/224倍,然后再进行裁剪,这里输入模型的尺寸为288*288。下图是对比图:如果不进行等比缩放,最终的结果是最右边的图片,输出就极易被误识别为筷子。
3.模型设计与训练
模型结构 | baseline准确率 |
---|---|
resnet50 | 62.216% |
resnext50_32*4d | 64.361% |
efficientnet b0 | 60.112% |
首先对原始的数据进行分组,8:2的比例分为训练集和测试集,基于此做验证。基于resnet50网络,现在训练5个epoch就能收敛到最高分,训练时间大概1个小时左右,其网络结构如下:
4.模型参数设置
这一块比较考验炼丹技术,首次参赛显得有些无从下手于是乎疯狂参考其他开源项目和代码,主要从batchsize、epoch、优化器的选择optimizer+学习率衰减策略、损失函数进行了尝试,较为容易获得最优解的参数如下:
- batchsize:128
- epoch:resnet50为5 ,efficientnet为40
接下来就是优化函数了,这里主要尝试了传统的SGD、对新手比较友好的Adam以及网上开源的自定义optimizer,后期才意识到带有动量(momentum)的SGD才是不容易陷入局部最优的方案;
self_optimizer.py 开源代码参考
import errno
import os
import sys
import time
import math
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Variable
import torch
import shutil
# import adabound
# from utils.radam import RAdam, AdamW
import torchvision.transforms as transforms
import math
import torch
from torch.optim.optimizer import Optimizer, required
class RAdam(Optimizer):
    """Rectified Adam (https://arxiv.org/abs/1908.03265).

    Rectifies the variance of the adaptive learning rate in the early steps;
    while the variance estimate is not yet tractable (N_sma < 5) it falls
    back to a bias-corrected momentum-only update.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Cache of (step, N_sma, step_size) keyed by step % 10 so the
        # rectification term is not recomputed for every parameter.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimization step; returns closure() result if given."""
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')
                p_data_fp32 = p.data.float()
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                # FIX: use the keyword (value=/alpha=) overloads; the old
                # positional-scalar forms are deprecated and rejected by
                # current torch releases.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                state['step'] += 1
                buffered = self.buffer[int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
                                N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size
                if group['weight_decay'] != 0:
                    # Decoupled-style weight decay applied directly to the weights.
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)
        return loss
class PlainRAdam(Optimizer):
    """RAdam without the per-step (step, N_sma, step_size) cache.

    Identical math to RAdam; recomputes the rectification term every step.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(PlainRAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(PlainRAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimization step; returns closure() result if given."""
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')
                p_data_fp32 = p.data.float()
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                # FIX: keyword (value=/alpha=) overloads; positional-scalar
                # forms are deprecated/removed in current torch.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                state['step'] += 1
                beta2_t = beta2 ** state['step']
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                # more conservative since it's an approximated value
                if N_sma >= 5:
                    step_size = group['lr'] * math.sqrt(
                        (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
                            N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    step_size = group['lr'] / (1 - beta1 ** state['step'])
                    p_data_fp32.add_(exp_avg, alpha=-step_size)
                p.data.copy_(p_data_fp32)
        return loss
class AdamW(Optimizer):
    """Adam with decoupled weight decay and an optional linear LR warmup.

    With warmup > 0 the effective LR ramps linearly from ~0 to lr over the
    first `warmup` steps.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, warmup=warmup)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimization step; returns closure() result if given."""
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                p_data_fp32 = p.data.float()
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                # FIX: keyword (value=/alpha=) overloads; positional-scalar
                # forms are deprecated/removed in current torch.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                if group['warmup'] > state['step']:
                    # Linear warmup: ramp the LR up over the first `warmup` steps.
                    scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup']
                else:
                    scheduled_lr = group['lr']
                step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
                if group['weight_decay'] != 0:
                    # Decoupled weight decay (applied to weights, not gradients).
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * scheduled_lr)
                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                p.data.copy_(p_data_fp32)
        return loss
__all__ = ['get_mean_and_std', 'init_params', 'mkdir_p', 'AverageMeter', 'get_optimizer', 'save_checkpoint']
def get_mean_and_std(dataset):
    """Compute per-channel mean and std over a dataset of (image, target) pairs.

    Iterates one image at a time and averages the per-image channel
    statistics, so the std is an approximation of the true global std.
    """
    # num_workers=0: worker processes buy nothing at batch_size=1 and keep
    # this utility deterministic/usable in restricted environments.
    # (Also removed the redundant `dataloader = trainloader =` double alias.)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=0)
    mean = torch.zeros(3)
    std = torch.zeros(3)
    print('==> Computing mean and std..')
    for inputs, targets in dataloader:
        for i in range(3):
            mean[i] += inputs[:, i, :, :].mean()
            std[i] += inputs[:, i, :, :].std()
    mean.div_(len(dataset))
    std.div_(len(dataset))
    return mean, std
def init_params(net):
    """Initialize conv/BN/linear layers in-place (Kaiming conv, unit BN, small-normal linear)."""
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            # Use the trailing-underscore in-place initializers; the
            # un-suffixed variants are deprecated.
            init.kaiming_normal_(m.weight, mode='fan_out')
            # BUG FIX: `if m.bias:` raises "Boolean value of Tensor with more
            # than one element is ambiguous"; compare against None instead.
            if m.bias is not None:
                init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant_(m.weight, 1)
            init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal_(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant_(m.bias, 0)
def mkdir_p(path):
    """Create `path` (and parents) if missing; no-op if it already is a directory.

    Raises OSError when `path` exists but is not a directory — the same
    outcome as the original EEXIST-checking implementation, expressed with
    the modern `exist_ok` idiom.
    """
    os.makedirs(path, exist_ok=True)
class AverageMeter(object):
    """Tracks the latest value plus a running sum, count and average.

    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
def get_optimizer(model, args):
    """Build the optimizer named by args.optimizer, boosting the LR of head layers.

    Parameters whose name contains fc/class/last_linear/ca/sa get
    args.lr * args.lr_fc_times; every other parameter gets args.lr.
    Raises NotImplementedError for an unknown args.optimizer.
    """
    head_markers = ('fc', 'class', 'last_linear', 'ca', 'sa')
    parameters = []
    for name, param in model.named_parameters():
        boosted = any(marker in name for marker in head_markers)
        group_lr = args.lr * args.lr_fc_times if boosted else args.lr
        parameters.append({'params': param, 'lr': group_lr})
    if args.optimizer == 'sgd':
        return torch.optim.SGD(parameters, args.lr,
                               momentum=args.momentum, nesterov=args.nesterov,
                               weight_decay=args.weight_decay)
    if args.optimizer == 'rmsprop':
        return torch.optim.RMSprop(parameters, args.lr,
                                   alpha=args.alpha,
                                   weight_decay=args.weight_decay)
    if args.optimizer == 'adam':
        return torch.optim.Adam(parameters, args.lr,
                                betas=(args.beta1, args.beta2),
                                weight_decay=args.weight_decay)
    if args.optimizer == 'AdaBound':
        # NOTE(review): `import adabound` is commented out at the top of the
        # file, so selecting this branch raises NameError — confirm intent.
        return adabound.AdaBound(parameters, lr=args.lr, final_lr=args.final_lr)
    if args.optimizer == 'radam':
        return RAdam(parameters, lr=args.lr, betas=(args.beta1, args.beta2),
                     weight_decay=args.weight_decay)
    raise NotImplementedError
def save_checkpoint(state, is_best, single=True, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    """Persist training state; snapshot the best model after epoch 5.

    Writes <fold>checkpoint.pth.tar (full state) and <fold>model_cur.pth
    (weights only). When `is_best` and state['epoch'] >= 5, also writes
    <fold>model_<epoch>_<train_acc%>_<acc%>.pth.
    NOTE: `filename` is accepted but unused, matching the original interface.
    """
    fold = '' if single else str(state['fold']) + '_'
    full_path = os.path.join(checkpoint, fold + 'checkpoint.pth.tar')
    weights_path = os.path.join(checkpoint, fold + 'model_cur.pth')
    torch.save(state, full_path)
    torch.save(state['state_dict'], weights_path)
    if is_best and state['epoch'] >= 5:
        train_pct = int(round(state['train_acc'] * 100, 0))
        val_pct = int(round(state['acc'] * 100, 0))
        best_name = 'model_' + str(state['epoch']) + '_' + str(train_pct) + '_' + str(val_pct) + '.pth'
        torch.save(state['state_dict'], os.path.join(checkpoint, fold + best_name))
def save_checkpoint2(state, is_best, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    """Fold-prefixed checkpoint writer; copies current files to *_best on improvement."""
    prefix = str(state['fold']) + '_'
    ckpt_path = os.path.join(checkpoint, prefix + filename)
    weights_path = os.path.join(checkpoint, prefix + 'model_cur.pth')
    torch.save(state, ckpt_path)
    torch.save(state['state_dict'], weights_path)
    if not is_best:
        return
    shutil.copyfile(ckpt_path, os.path.join(checkpoint, prefix + 'model_best.pth.tar'))
    shutil.copyfile(weights_path, os.path.join(checkpoint, prefix + 'model_best.pth'))
这里主要通过get_optimizer导入优化器。具体修改内容还有待挖掘,使用之后收敛速度着实比原始的SGD要快,但是貌似过拟合也相当严重,在测试集上表现不好
- 传统的SGD
先来看SGD。SGD没有动量的概念,也就是说:
代入步骤3,可以看到下降梯度就是最简单的
SGD最大的缺点是下降速度慢,而且可能会在沟壑的两边持续震荡,停留在一个局部最优点,因此单独使用朴素SGD的效果并不理想。
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)
- Adam
谈到这里,Adam和Nadam的出现就很自然而然了——它们是前述方法的集大成者。我们看到,SGD-M在SGD基础上增加了一阶动量,AdaGrad和AdaDelta在SGD基础上增加了二阶动量。把一阶动量和二阶动量都用起来,就是Adam了——Adaptive + Momentum。
SGD的一阶动量:
加上AdaDelta的二阶动量:
优化算法里最常见的两个超参数 $\beta_1, \beta_2$ 就都在这里了,前者控制一阶动量,后者控制二阶动量。
torch.optim.Adam(params,lr=0.001,betas=(0.9, 0.999),eps=1e-08,weight_decay=0,amsgrad=False)
事实证明这个方法还是容易掉到过拟合的坑里,很少的epoch就在训练集上收敛了,但是在测试集就停滞不前
- 带动量的SGD,这里引入了衰减权值的正则化项来防止过拟合,之前不晓得血亏,但是最后一版还是没搞清楚是不是还有加入权值衰减策略
optimizer = torch.optim.SGD(net.parameters(), lr=0.1,momentum=0.9,
weight_decay=1e-4)
Learning Rate
学习率决定了权值更新的速度,设置得太大会使结果超过最优值,太小会使下降速度过慢。仅靠人为干预调整参数需要不断修改学习率,因此后面3种参数都是基于自适应的思路提出的解决方案。后面3中参数分别为:Weight Decay 权值衰减,Momentum 动量和Learning Rate Decay 学习率衰减。
Weight decay
在实际应用中,为了避免网络的过拟合,必须对价值函数(Cost function)加入一些正则项,在SGD中加入
上面这个公式基本思想就是减小不重要的参数对最后结果的影响,网络中有用的权重则不会收到Weight decay影响。
在机器学习或者模式识别中,会出现overfitting,而当网络逐渐overfitting时网络权值逐渐变大,因此,为了避免出现overfitting,会给误差函数添加一个惩罚项,常用的惩罚项是所有权重的平方乘以一个衰减常量之和。其用来惩罚大的权值。
Momentum
动量来源于牛顿定律,基本思想是为了找到最优加入“惯性”的影响,当误差曲面中存在平坦区域,SGD就可以更快的学习。
Learning Rate Decay
该方法是为了提高SGD寻优能力,具体就是每次迭代的时候减少学习率的大小。
**weight decay(权值衰减)**的使用既不是为了提高收敛精确度也不是为了提高收敛速度,其最终目的是防止过拟合。在损失函数中,weight decay是放在正则项(regularization)前面的一个系数,正则项一般指示模型的复杂度,所以weight decay的作用是调节模型复杂度对损失函数的影响,若weight decay很大,则复杂的模型损失函数的值也就大。
momentum是梯度下降法中一种常用的加速技术。对于一般的SGD,其表达式为
沿负梯度方向下降。而带momentum项的SGD则写成如下形式:
其中的系数即momentum系数。通俗地理解上面的式子:如果上一次的momentum(即 $v$)与这一次的负梯度方向相同,那这次下降的幅度就会加大,从而达到加速收敛的效果。
normalization(batch normalization)。
batch normalization是指在神经网络中激活函数的前面,将 $wx+b$ 按照特征进行normalization,这样做的好处有三点:
- 损失函数
最终尝试的损失函数如下三种:
criterion = nn.CrossEntropyLoss()
criterion = LabelSmoothSoftmaxCE()
criterion = FocalLoss()
针对focalloss和labelsmooth标签平滑给出具体代码和参考文献
focalloss.py
参考文献:目标检测领域的-- https://arxiv.org/abs/1708.02002
- 总述
Focal loss主要是为了解决one-stage目标检测中正负样本比例严重失衡的问题。该损失函数降低了大量简单负样本在训练中所占的权重,也可理解为一种困难样本挖掘。
- 损失函数形式
Focal loss是在交叉熵损失函数基础上进行的修改,首先回顾二分类交叉熵损失:
是经过激活函数的输出,所以在0-1之间。可见普通的交叉熵对于正样本而言,输出概率越大损失越小。对于负样本而言,输出概率越小则损失越小。此时的损失函数在大量简单样本的迭代过程中比较缓慢且可能无法优化至最优。那么Focal loss是怎么改进的呢?
首先在原有的基础上加了一个因子,其中gamma>0使得减少易分类样本的损失。使得更关注于困难的、错分的样本。
例如gamma为2,对于正类样本而言,预测结果为0.95肯定是简单样本,所以(1-0.95)的gamma次方就会很小,这时损失函数值就变得更小。而预测概率为0.3的样本其损失相对很大。对于负类样本而言同样,预测0.1的结果应当远比预测0.7的样本损失值要小得多。对于预测概率为0.5时,损失只减少了0.25倍,所以更加关注于这种难以区分的样本。这样减少了简单样本的影响,大量预测概率很小的样本叠加起来后的效应才可能比较有效。
此外,加入平衡因子alpha,用来平衡正负样本本身的比例不均:文中alpha取0.25,即正样本要比负样本占比小,这是因为负例易分。
只添加alpha虽然可以平衡正负样本的重要性,但是无法解决简单与困难样本的问题。
gamma调节简单样本权重降低的速率,当gamma为0时即为交叉熵损失函数,当gamma增加时,调整因子的影响也在增加。实验发现gamma为2是最优。
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class FocalLoss(nn.Module):
    """Multi-class focal loss (https://arxiv.org/abs/1708.02002).

    loss_i = -alpha[y_i] * (1 - p_i)^gamma * log(p_i), where p_i is the
    softmax probability of the target class. Down-weights easy examples so
    training focuses on hard / misclassified ones.
    """

    def __init__(self, class_num=146, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        # Per-class weights; uniform by default. MODERNIZED: the deprecated
        # autograd.Variable wrapper (a no-op since torch 0.4) is dropped.
        if alpha is None:
            self.alpha = torch.ones(class_num, 1)
        else:
            self.alpha = alpha
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        """inputs: (N, C) raw logits; targets: (N,) integer class ids."""
        N = inputs.size(0)
        C = inputs.size(1)
        # FIX: pass dim=1 explicitly — the implicit-dim form is deprecated.
        P = F.softmax(inputs, dim=1)
        # One-hot mask selecting each row's target-class probability.
        class_mask = inputs.new_zeros(N, C)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids, 1.)
        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.view(-1)]
        probs = (P * class_mask).sum(1).view(-1, 1)
        log_p = probs.log()
        # Focal modulation: (1 - p)^gamma shrinks the loss of easy examples.
        batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss
标签平滑labelsmooth
原理:
标签平滑后的分布就相当于往真实分布中加入了噪声,避免模型对于正确标签过于自信,使得预测正负样本的输出值差别不那么大,从而避免过拟合,提高模型的泛化能力。
参考文献:https://papers.nips.cc/paper/2019/file/f1748d6b0fd9d439f71450117eba2725-Paper.pdf
label_smooth.py
#!/usr/bin/python
# -*- encoding: utf-8 -*-
import torch
import torch.nn as nn
class LabelSmoothSoftmaxCE(nn.Module):
    """Softmax cross-entropy with label smoothing and an ignore index.

    The target class receives probability lb_pos, every other class lb_neg;
    positions whose label equals lb_ignore contribute nothing to the loss.
    NOTE(review): the indexing below suggests segmentation-style labels with
    extra spatial dims are supported as well as plain (N,) labels — confirm
    against callers.
    """
    def __init__(self,
                 lb_pos=0.9,
                 lb_neg=0.005,
                 reduction='mean',
                 lb_ignore=255,
                 ):
        super(LabelSmoothSoftmaxCE, self).__init__()
        self.lb_pos = lb_pos
        self.lb_neg = lb_neg
        self.reduction = reduction
        self.lb_ignore = lb_ignore
        # Log-softmax over dim 1 (the class dimension).
        self.log_softmax = nn.LogSoftmax(1)

    def forward(self, logits, label):
        logs = self.log_softmax(logits)
        # Mask of ignored positions; n_valid counts the rest for the mean.
        ignore = label.data.cpu() == self.lb_ignore
        n_valid = (ignore == 0).sum()
        # Temporarily relabel ignored entries as class 0 so scatter_ stays in range.
        label = label.clone()
        label[ignore] = 0
        # One-hot targets, then smooth: lb_pos on the target, lb_neg elsewhere.
        lb_one_hot = logits.data.clone().zero_().scatter_(1, label.unsqueeze(1), 1)
        label = self.lb_pos * lb_one_hot + self.lb_neg * (1-lb_one_hot)
        # Zero out the smoothed rows at ignored positions: build an advanced
        # index from the nonzero coordinates, inserting a full class-dim range.
        ignore = ignore.nonzero()
        _, M = ignore.size()
        a, *b = ignore.chunk(M, dim=1)
        label[[a, torch.arange(label.size(1)), *b]] = 0
        if self.reduction == 'mean':
            # Sum over classes, then average over non-ignored positions only.
            loss = -torch.sum(torch.sum(logs*label, dim=1)) / n_valid
        elif self.reduction == 'none':
            loss = -torch.sum(logs*label, dim=1)
        return loss
if __name__ == '__main__':
    # Smoke test: run the smoothed CE over the output of a tiny conv net on
    # random data. Requires a CUDA device (.cuda() calls below).
    torch.manual_seed(15)
    criteria = LabelSmoothSoftmaxCE(lb_pos=0.9, lb_neg=5e-3)
    net1 = nn.Sequential(
        nn.Conv2d(3, 3, kernel_size=3, stride=2, padding=1),
    )
    net1.cuda()
    net1.train()
    net2 = nn.Sequential(
        nn.Conv2d(3, 3, kernel_size=3, stride=2, padding=1),
    )
    net2.cuda()
    net2.train()
    with torch.no_grad():
        inten = torch.randn(2, 3, 5, 5).cuda()
        lbs = torch.randint(0, 3, [2, 5, 5]).cuda()
        # Mark two positions with the ignore index (255) to exercise that path.
        lbs[1, 3, 4] = 255
        lbs[1, 2, 3] = 255
        print(lbs)
    import torch.nn.functional as F
    logits1 = net1(inten)
    logits1 = F.interpolate(logits1, inten.size()[2:], mode='bilinear')
    logits2 = net2(inten)
    logits2 = F.interpolate(logits2, inten.size()[2:], mode='bilinear')
    # loss1 = criteria1(logits1, lbs)
    loss = criteria(logits1, lbs)
    # print(loss.detach().cpu())
    loss.backward()
    # NOTE(review): everything below appears to be a corrupted duplicate
    # paste of the demo above — it starts mid-expression with `n.Sequential(`
    # (undefined name `n`), so it raises NameError at runtime. It should
    # almost certainly be deleted; kept verbatim here for review.
    n.Sequential(
        nn.Conv2d(3, 3, kernel_size=3, stride=2, padding=1),
    )
    net1.cuda()
    net1.train()
    net2 = nn.Sequential(
        nn.Conv2d(3, 3, kernel_size=3, stride=2, padding=1),
    )
    net2.cuda()
    net2.train()
    with torch.no_grad():
        inten = torch.randn(2, 3, 5, 5).cuda()
        lbs = torch.randint(0, 3, [2, 5, 5]).cuda()
        lbs[1, 3, 4] = 255
        lbs[1, 2, 3] = 255
        print(lbs)
    import torch.nn.functional as F
    logits1 = net1(inten)
    logits1 = F.interpolate(logits1, inten.size()[2:], mode='bilinear')
    logits2 = net2(inten)
    logits2 = F.interpolate(logits2, inten.size()[2:], mode='bilinear')
    # loss1 = criteria1(logits1, lbs)
    loss = criteria(logits1, lbs)
    # print(loss.detach().cpu())
    loss.backward()
三、summary
最后没有获得happy ending,因为最后几天的调参结果并没有任何进步。我觉得一方面可能是数据集切分时有些问题,另一方面需要再检查一下class.txt;比赛的最后一波测试因为时间到了没能提交成功,也很心塞,也不知道是否已经通过权值衰减的方式避免了过拟合。总而言之,调参是一个漫长而艰巨的过程,还需要多花时间去练习,积累论文经验和一些tricks,这样才能做到之后的有的放矢。
最后附上训练代码和测试代码:
训练代码:训练代码仓库src_repo
测试代码:测试代码仓库 ev_sdk