前面讲解了 PP-LiteSeg 的论文部分,但其中有些细节不太明确:比如 UAFM 结构中的 attention 模块有 spatial 和 channel 两种注意力,但注意力权重具体是如何参与特征融合的、细节又是如何处理的,论文里并没有说清楚。为此,下面来看一下代码部分的实现。
UAFM代码部分
融合代码部分
附录
UAFM部分
class UAFM(nn.Layer):
    """
    Base Unified Attention Fusion Module: merges a low level and a high level feature.

    The low level feature is passed through ``conv_x`` (x_ch -> y_ch channels),
    the high level feature is resized to the spatial size of the low level one,
    and the two are combined by ``fuse`` (plain addition in this base class)
    before the output conv.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__()

        # Padding is half the kernel size for the conv applied to x.
        half_kernel = ksize // 2
        self.conv_x = layers.ConvBNReLU(
            x_ch,
            y_ch,
            kernel_size=ksize,
            padding=half_kernel,
            bias_attr=False)
        # Applied after fusion: y_ch -> out_ch channels.
        self.conv_out = layers.ConvBNReLU(
            y_ch,
            out_ch,
            kernel_size=3,
            padding=1,
            bias_attr=False)
        self.resize_mode = resize_mode

    def check(self, x, y):
        """Assert both inputs are 4-D and x is not smaller than y in height or width."""
        assert x.ndim == 4 and y.ndim == 4
        low_h, low_w = x.shape[2:]
        high_h, high_w = y.shape[2:]
        assert low_h >= high_h and low_w >= high_w

    def prepare(self, x, y):
        """Return the processed (x, y) pair; y is resized against the processed x."""
        x = self.prepare_x(x, y)
        return x, self.prepare_y(x, y)

    def prepare_x(self, x, y):
        """Run the low level feature through ``conv_x``; ``y`` is not used."""
        return self.conv_x(x)

    def prepare_y(self, x, y):
        """Resize the high level feature to the height and width of ``x``."""
        target_hw = paddle.shape(x)[2:]
        return F.interpolate(y, target_hw, mode=self.resize_mode)

    def fuse(self, x, y):
        """Add the two prepared features element-wise and apply the output conv."""
        return self.conv_out(x + y)

    def forward(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        self.check(x, y)
        low, high = self.prepare(x, y)
        return self.fuse(low, high)
x_ch 是低层特征 F_low 的通道数,y_ch 是高层特征 F_high 的通道数;forward 的输入 x、y 分别对应 F_low 和 F_high。
输入x、y这两个分支后,首先经过self.check(x,y);
# Dims 2 and 3 of each 4-D input are read as its height and width.
x_h, x_w = x.shape[2:]
y_h, y_w = y.shape[2:]
我们知道张量的维度顺序是 NCHW,所以 shape[2:] 取到的就是 feature 的高和宽;随后 check 会断言低层特征 x 的高、宽都不小于高层特征 y 的高、宽。
然后是x, y = self.prepare(x, y)
def prepare(self, x, y):
    # x is processed first; y is then resized against the processed x.
    x = self.prepare_x(x, y)
    return x, self.prepare_y(x, y)


def prepare_x(self, x, y):
    # Only x is used: it is passed through self.conv_x.
    return self.conv_x(x)


# This corresponds to the F_up operation applied to F_high.
def prepare_y(self, x, y):
    # Resize y to the size of x along dims 2 and 3.
    target_hw = paddle.shape(x)[2:]
    return F.interpolate(y, target_hw, mode=self.resize_mode)
spatial 和channel的attention模块
首先看下channel的attention模块:
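采用的是平均池化和最大池化(在 H、W 维度上做全局池化;UAFM_ChAtten_S 只用平均池化),把 x、y 的池化结果在通道维拼接后送入两层 1×1 卷积,再经过 sigmoid 得到注意力权重 atten,最后按 out = x * atten + y * (1 - atten) 对两个分支加权融合。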
class UAFM_ChAtten(UAFM):
    """
    The UAFM with channel attention, which uses mean and max values.

    A sigmoid weight is predicted from pooled statistics of both inputs and
    used to blend them as ``x * atten + y * (1 - atten)``.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Two 1x1 convs: 4 * y_ch -> y_ch // 2 -> y_ch.
        # NOTE(review): the 4 * y_ch input width presumably matches avg + max
        # statistics for each of the two y_ch-channel inputs; confirm against
        # helper.avg_max_reduce_hw.
        reduce_conv = layers.ConvBNAct(
            4 * y_ch,
            y_ch // 2,
            kernel_size=1,
            bias_attr=False,
            act_type="leakyrelu")
        expand_conv = layers.ConvBN(
            y_ch // 2, y_ch, kernel_size=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(reduce_conv, expand_conv)

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        # self.training is forwarded so the helper can choose its pooling path.
        pooled = helper.avg_max_reduce_hw([x, y], self.training)
        atten = F.sigmoid(self.conv_xy_atten(pooled))
        # atten weights x, its complement weights y.
        blended = x * atten + y * (1 - atten)
        return self.conv_out(blended)
class UAFM_ChAtten_S(UAFM):
    """
    The UAFM with channel attention, which uses mean values.

    Same blending rule as the mean-and-max variant
    (``x * atten + y * (1 - atten)``), but the weight is predicted from
    average-pooled statistics only.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Two 1x1 convs: 2 * y_ch -> y_ch // 2 -> y_ch.
        # NOTE(review): the 2 * y_ch input width presumably matches one
        # average statistic per channel for each of the two inputs; confirm
        # against helper.avg_reduce_hw.
        reduce_conv = layers.ConvBNAct(
            2 * y_ch,
            y_ch // 2,
            kernel_size=1,
            bias_attr=False,
            act_type="leakyrelu")
        expand_conv = layers.ConvBN(
            y_ch // 2, y_ch, kernel_size=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(reduce_conv, expand_conv)

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        pooled = helper.avg_reduce_hw([x, y])
        atten = F.sigmoid(self.conv_xy_atten(pooled))
        # atten weights x, its complement weights y.
        blended = x * atten + y * (1 - atten)
        return self.conv_out(blended)
def avg_max_reduce_hw_helper(x, is_training, use_concat=True):
    """
    Pool a single tensor down to output size 1 with both average and max.

    Args:
        x (Tensor): A single tensor; lists and tuples are rejected by assertion.
        is_training (bool): Selects which max-pooling call is used.
        use_concat (bool, optional): If True, return the two results
            concatenated on axis 1; otherwise return them as a list
            ``[avg_pool, max_pool]``. Default: True.
    """
    assert not isinstance(x, (list, tuple))

    avg_pool = F.adaptive_avg_pool2d(x, 1)
    # TODO(pjc): when axis=[2, 3], the paddle.max api has bug for training.
    max_pool = (F.adaptive_max_pool2d(x, 1) if is_training else
                paddle.max(x, axis=[2, 3], keepdim=True))

    pooled_pair = [avg_pool, max_pool]
    if not use_concat:
        return pooled_pair
    return paddle.concat(pooled_pair, axis=1)
def avg_max_reduce_hw(x, is_training):
    """
    Reduce hw by avg and max.

    Args:
        x (Tensor|list|tuple): A single tensor, or a sequence of tensors.
        is_training (bool): Forwarded to ``avg_max_reduce_hw_helper``, which
            selects its max-pooling call from it.

    Returns:
        Tensor: For a single tensor or a one-element sequence, the helper's
        default result, cat([avg_pool, max_pool]) on axis 1. For longer
        sequences, cat([avg_pool_0, avg_pool_1, ..., max_pool_0,
        max_pool_1, ...]) on axis 1.
    """
    if not isinstance(x, (list, tuple)):
        return avg_max_reduce_hw_helper(x, is_training)
    if len(x) == 1:
        return avg_max_reduce_hw_helper(x[0], is_training)

    # Collect averages and maxima separately so that every average precedes
    # every maximum in the concatenated result.
    avg_parts = []
    max_parts = []
    for xi in x:
        # Named max_pool rather than max so the builtin is not shadowed.
        avg_pool, max_pool = avg_max_reduce_hw_helper(xi, is_training, False)
        avg_parts.append(avg_pool)
        max_parts.append(max_pool)
    return paddle.concat(avg_parts + max_parts, axis=1)
def avg_reduce_hw(x):
    """
    Reduce hw by avg.

    Args:
        x (Tensor|list|tuple): A single tensor, or a sequence of tensors.

    Returns:
        Tensor: The adaptive average pool (output size 1) of a single tensor
        or one-element sequence; otherwise
        cat([avg_pool_0, avg_pool_1, ...]) on axis 1.
    """
    if not isinstance(x, (list, tuple)):
        return F.adaptive_avg_pool2d(x, 1)
    if len(x) == 1:
        return F.adaptive_avg_pool2d(x[0], 1)

    pooled = [F.adaptive_avg_pool2d(xi, 1) for xi in x]
    return paddle.concat(pooled, axis=1)
class UAFM_SpAtten(UAFM):
    """
    The UAFM with spatial attention, which uses mean and max values.

    A single-channel sigmoid weight map is predicted from channel-reduced
    statistics of both inputs and used to blend them as
    ``x * atten + y * (1 - atten)``.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Two 3x3 convs: 4 -> 2 -> 1 channels.
        # NOTE(review): the 4 input channels presumably are the mean and max
        # maps of each of the two inputs; confirm against
        # helper.avg_max_reduce_channel.
        squeeze_conv = layers.ConvBNReLU(
            4, 2, kernel_size=3, padding=1, bias_attr=False)
        weight_conv = layers.ConvBN(
            2, 1, kernel_size=3, padding=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(squeeze_conv, weight_conv)

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        stats = helper.avg_max_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(stats))
        # atten weights x, its complement weights y.
        blended = x * atten + y * (1 - atten)
        return self.conv_out(blended)
spatial的attention
class UAFM_SpAtten(UAFM):
    """
    The UAFM with spatial attention, which uses mean and max values.

    A single-channel sigmoid weight map is predicted from channel-reduced
    statistics of both inputs and used to blend them as
    ``x * atten + y * (1 - atten)``.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Two 3x3 convs: 4 -> 2 -> 1 channels.
        # NOTE(review): the 4 input channels presumably are the mean and max
        # maps of each of the two inputs; confirm against
        # helper.avg_max_reduce_channel.
        squeeze_conv = layers.ConvBNReLU(
            4, 2, kernel_size=3, padding=1, bias_attr=False)
        weight_conv = layers.ConvBN(
            2, 1, kernel_size=3, padding=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(squeeze_conv, weight_conv)

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        stats = helper.avg_max_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(stats))
        # atten weights x, its complement weights y.
        blended = x * atten + y * (1 - atten)
        return self.conv_out(blended)
class UAFM_SpAtten_S(UAFM):
    """
    The UAFM with spatial attention, which uses mean values.

    Same blending rule as the mean-and-max variant
    (``x * atten + y * (1 - atten)``), but the weight map is predicted from
    channel-mean statistics only.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The interpolation mode used when resizing the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Two 3x3 convs: 2 -> 2 -> 1 channels.
        # NOTE(review): the 2 input channels presumably are the channel-mean
        # maps of the two inputs; confirm against helper.avg_reduce_channel.
        mix_conv = layers.ConvBNReLU(
            2, 2, kernel_size=3, padding=1, bias_attr=False)
        weight_conv = layers.ConvBN(
            2, 1, kernel_size=3, padding=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(mix_conv, weight_conv)

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        stats = helper.avg_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(stats))
        # atten weights x, its complement weights y.
        blended = x * atten + y * (1 - atten)
        return self.conv_out(blended)
def avg_reduce_channel(x):
    """
    Reduce channel by avg.

    Args:
        x (Tensor|list|tuple): A single tensor, or a sequence of tensors.

    Returns:
        Tensor: The mean over axis 1 (kept as a size-1 axis) of a single
        tensor or one-element sequence; otherwise
        cat([avg_ch_0, avg_ch_1, ...]) on axis 1.
    """
    if not isinstance(x, (list, tuple)):
        return paddle.mean(x, axis=1, keepdim=True)
    if len(x) == 1:
        return paddle.mean(x[0], axis=1, keepdim=True)

    channel_means = [paddle.mean(xi, axis=1, keepdim=True) for xi in x]
    return paddle.concat(channel_means, axis=1)