EELU Activation Function
Paper: Elastic exponential linear units for convolutional neural networks
Year: 2020
Introduction
This activation function, the Elastic Exponential Linear Unit (EELU), combines the advantages of existing activation functions in a generalized form. EELU has an elastic slope in the positive region and preserves negative signals through a small non-zero gradient. It can inject a wider range of noise and neuron deactivation during the training phase, which leads to better model generalization. The elastic slope of EELU is modulated by a Gaussian distribution with a random standard deviation. The EELU formula is:
$$
f(x) = \begin{cases} k \times x, & x > 0 \\ \alpha \left(e^{\beta x} - 1\right), & x \le 0 \end{cases}
$$
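For example (with purely illustrative values, not taken from the paper), setting $k = 1$, $\alpha = 1$, and $\beta = 1$ reduces the negative branch to the standard ELU, and
$$
f(1) = 1 \times 1 = 1, \qquad f(-1) = 1 \cdot (e^{-1} - 1) \approx -0.632 .
$$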
The coefficient $k$ is sampled from a Gaussian distribution with a fixed mean and a random standard deviation, and the sampled value is clipped to the range 0 to 2:
$$
k = \max\left(0, \min(s, 2)\right), \quad s \sim N(1, \sigma)
$$
$$
\sigma \sim U(0, \epsilon), \quad \epsilon \in (0, 1]
$$
Here $\epsilon$ is a hyperparameter, the maximum standard deviation of the Gaussian distribution, with values in the range $0.1 \sim 1.0$. The smaller the maximum standard deviation $\epsilon$, the smaller the elastic range. If the value of $k$ sampled from the Gaussian distribution is greater than 1, the function amplifies its input; likewise, if the sampled value is less than 1, the output is attenuated.
When a deep neural network is trained on a large number of samples or with data augmentation, a small $\epsilon$, e.g. $0.1 \sim 0.2$, is used to reduce the effect of neuron noise. When the network is trained on a smaller number of samples, a larger $\epsilon$, e.g. $0.7 \sim 1.0$, is used; injecting noise of a magnitude comparable to the signal during training helps improve model generalization.
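As a quick illustration of how these pieces fit together, the following is a minimal, self-contained sketch of the forward computation (device-agnostic, with my own function name; it is not the paper's reference code, which appears further below): sample $\sigma$, sample and clip $k$, then apply the piecewise formula.
import torch

def eelu_forward_sketch(x, alpha=0.25, beta=1.0, eps=1.0):
    # sigma ~ U(0, eps): one draw per forward pass
    sigma = float(torch.empty(1).uniform_(0, eps))
    # s ~ N(1, sigma), clipped to [0, 2]: one k per element of x
    k = torch.empty_like(x).normal_(mean=1.0, std=sigma).clamp_(0, 2)
    # k * x for x > 0, alpha * (e^{beta * x} - 1) for x <= 0
    return torch.where(x > 0, k * x, alpha * torch.expm1(beta * x))

x = torch.randn(5)
print(eelu_forward_sketch(x))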
The derivative of this function with respect to $x$ is:
$$
f'(x) = \begin{cases} k, & x > 0 \\ \alpha \beta e^{\beta x}, & x \le 0 \end{cases}
$$
The graph of the function is shown in the figure below.
The advantage of EELU is that it can represent a variety of output characteristics under random noise. Because the random noise is independent of the input, it gives the neural network sensitivity to a variety of inputs, which is to some extent similar to data augmentation.
The derivative of EELU with respect to $x$ is:
$$
\frac{\partial f(x)}{\partial x} = \begin{cases} k, & x > 0 \\ \alpha \beta e^{\beta x}, & x \le 0 \end{cases}
$$
The derivative of EELU with respect to $\alpha$ is:
$$
\frac{\partial f(x)}{\partial \alpha} = \begin{cases} 0, & x > 0 \\ e^{\beta x} - 1, & x \le 0 \end{cases}
$$
The derivative of EELU with respect to $\beta$ is:
$$
\frac{\partial f(x)}{\partial \beta} = \begin{cases} 0, & x > 0 \\ \alpha x e^{\beta x}, & x \le 0 \end{cases}
$$
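These $\alpha$ and $\beta$ gradients are what the backward pass in the code below computes. As a quick numerical sanity check (my own sketch, not from the paper), they can be compared against central finite differences on the negative branch:
import torch

alpha, beta, x = 0.25, 1.0, torch.tensor(-0.8)

def neg_branch(a, b):
    # negative branch of EELU: a * (e^{b * x} - 1)
    return a * torch.expm1(b * x)

h = 1e-4
# analytic gradients from the formulas above
d_alpha = torch.expm1(beta * x)           # e^{beta x} - 1
d_beta = alpha * x * torch.exp(beta * x)  # alpha * x * e^{beta x}
# central finite-difference approximations
fd_alpha = (neg_branch(alpha + h, beta) - neg_branch(alpha - h, beta)) / (2 * h)
fd_beta = (neg_branch(alpha, beta + h) - neg_branch(alpha, beta - h)) / (2 * h)
print(float(d_alpha), float(fd_alpha))  # both approximately -0.551
print(float(d_beta), float(fd_beta))    # both approximately -0.090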
The full implementation is as follows:
# Inherit from Function
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F


class c_EELU(torch.autograd.Function):
    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, input, pa, pb, odd, eps, training):
        neg = (input < 0)
        pa = pa.clamp(min=0)
        pb = pb.clamp(min=0)
        if training:  # training mode
            if odd:
                # plain branch: k fixed to 1, so the positive side is just x
                k = torch.cuda.FloatTensor([1.])
                # if x <= 0: a*(exp(b*x)-1), else: x
                output = torch.where(neg, pa * (torch.expm1(pb * input)), input)
                ctx.save_for_backward(input, pa, pb, k)
                ctx.odd = odd
            else:
                # epsilon in (0, 1]  --->  sigma ~ U(0, epsilon)
                sigma = np.random.uniform(0, eps)
                # s ~ N(1, sigma), then clip s to [0, 2]
                k = torch.cuda.FloatTensor(input.shape).normal_(mean=1, std=sigma).clamp(0, 2)
                # k = torch.cuda.FloatTensor(input.shape).uniform_(1-eps, 1+eps)
                ctx.save_for_backward(input, pa, pb, k)
                ctx.odd = odd
                # if x <= 0: a*(exp(b*x)-1), else: k*x
                output = torch.where(neg, pa * (torch.expm1(pb * input)), input * k)
        else:  # inference mode: no elastic slope, nothing saved for backward
            # k = torch.cuda.FloatTensor(input.shape).uniform_(1, 1)
            # if x <= 0: a*(exp(b*x)-1), else: x
            output = torch.where(neg, pa * (torch.expm1(pb * input)), input)
        # replace any NaN in the output with 0
        output = torch.where(torch.isnan(output), torch.cuda.FloatTensor([0.]), output)
        return output

    # This function has only one output, so it receives only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # Unpack saved tensors; trailing Nones in the return statement cover
        # the non-tensor inputs (odd, eps, training).
        input, pa, pb, k = ctx.saved_tensors
        odd = ctx.odd
        neg = (input < 0)
        if odd:
            # plain branch: alpha/beta receive gradients, slope on x > 0 is 1
            if pa.size(0) == 1:
                grad_pa = torch.sum(
                    torch.where(neg, grad_output * (torch.expm1(pb * input)), torch.cuda.FloatTensor([0.]))).view(-1)
                grad_pb = torch.sum(
                    torch.where(neg, grad_output * pa * torch.exp(pb * input) * input, torch.cuda.FloatTensor([0.]))).view(-1)
            else:
                grad_pa = torch.sum(
                    torch.where(neg, grad_output * (torch.expm1(pb * input)), torch.cuda.FloatTensor([0.])),
                    dim=(0, 2, 3)).view(-1, 1, 1)
                grad_pb = torch.sum(
                    torch.where(neg, grad_output * pa * torch.exp(pb * input) * input, torch.cuda.FloatTensor([0.])),
                    dim=(0, 2, 3)).view(-1, 1, 1)
            grad_pa = torch.where(torch.isnan(grad_pa), torch.cuda.FloatTensor([0.]), grad_pa)
            grad_pb = torch.where(torch.isnan(grad_pb), torch.cuda.FloatTensor([0.]), grad_pb)
            grad_input = torch.where(neg, grad_output * pa * torch.exp(pb * input) * pb, grad_output)
        else:
            # elastic branch: alpha/beta get zero gradients, slope on x > 0 is k
            if pa.size(0) == 1:
                grad_pa = torch.cuda.FloatTensor(1).fill_(0)
                grad_pb = torch.cuda.FloatTensor(1).fill_(0)
            else:
                grad_pa = torch.cuda.FloatTensor(pa.size(0), 1, 1).fill_(0)
                grad_pb = torch.cuda.FloatTensor(pb.size(0), 1, 1).fill_(0)
            grad_input = torch.where(neg, grad_output * pa * torch.exp(pb * input) * pb, k * grad_output)
        grad_input = torch.where(torch.isnan(grad_input), torch.cuda.FloatTensor([0.]), grad_input)
        return grad_input, grad_pa, grad_pb, None, None, None
class EELU(torch.nn.Module):
    """
    EELU activation module based on the autograd Function defined above.
    """
    def __init__(self, num_parameters=1, pa_init=0.25, pb_init=1, eps=1.0):
        super(EELU, self).__init__()
        self.num_parameters = num_parameters
        self.eps = eps
        self.odd = True
        self.pa_init = pa_init
        self.pb_init = pb_init
        if self.num_parameters == 1:
            # single shared (alpha, beta) pair
            self.pa = nn.Parameter(torch.cuda.FloatTensor(num_parameters).fill_(pa_init))
            self.pb = nn.Parameter(torch.cuda.FloatTensor(num_parameters).fill_(pb_init))
        else:
            # one (alpha, beta) pair per channel
            self.pa = nn.Parameter(torch.cuda.FloatTensor(num_parameters, 1, 1).fill_(pa_init))
            self.pb = nn.Parameter(torch.cuda.FloatTensor(num_parameters, 1, 1).fill_(pb_init))
        # torch.nn.Module.__init__(self)
        # self.register_parameter('k', None)

    def forward(self, input):
        # alternate between the plain (k = 1) and elastic branches on successive calls
        self.odd = not self.odd
        return c_EELU.apply(input, self.pa, self.pb, self.odd, self.eps, self.training)

    def __repr__(self):
        return self.__class__.__name__ + '(' + 'num_parameters=' + str(self.num_parameters * 2) + ', eps=' + str(
            self.eps) + ', pa = ' + str(self.pa_init) + ', pb = ' + str(self.pb_init) + ')'
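Continuing from the code above, here is a minimal usage sketch (my own example; it assumes a CUDA device, since the implementation hard-codes torch.cuda.FloatTensor):
# drop EELU into a small convolutional block and run one forward/backward pass
conv = nn.Conv2d(3, 8, kernel_size=3, padding=1).cuda()
act = EELU(num_parameters=8, eps=0.5)  # one (alpha, beta) pair per channel

x = torch.randn(4, 3, 32, 32, device='cuda')
y = act(conv(x))           # elastic slope applied in training mode
loss = y.pow(2).mean()
loss.backward()            # gradients flow to the conv weights and to pa/pb
print(act)                 # repr shows num_parameters, eps and the pa/pb inits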