[SwapAutoEncoder 源码解析] Baselayer & Encoder
文章目录
一、StyleGAN2_layer 部分
EqualConv2d
EqualConv2d与nn.conv2d有细微的差别,增加了对卷积核权重的调整和学习率的自定义
class EqualConv2d(nn.Module):
    """2-D convolution whose kernel is rescaled on every forward pass.

    The kernel is sampled from a standard normal and stored unscaled. At call
    time it is multiplied by ``lr_mul / sqrt(in_channel * kernel_size ** 2)``
    before being handed to ``F.conv2d``.

    Args:
        in_channel: Number of input channels.
        out_channel: Number of output channels.
        kernel_size: Side length of the square kernel.
        stride: Stride forwarded to ``F.conv2d``.
        padding: Padding forwarded to ``F.conv2d``.
        bias: When truthy, a zero-initialised learnable bias is created;
            otherwise ``self.bias`` is ``None``.
        lr_mul: Extra multiplier folded into the run-time weight scale.
    """

    def __init__(
        self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True, lr_mul=1.0,
    ):
        super().__init__()

        weight_shape = (out_channel, in_channel, kernel_size, kernel_size)
        self.weight = nn.Parameter(torch.randn(*weight_shape))

        # fan-in of one output unit: every input channel times the kernel area
        fan_in = in_channel * kernel_size ** 2
        self.scale = 1 / math.sqrt(fan_in) * lr_mul

        self.stride = stride
        self.padding = padding
        self.bias = nn.Parameter(torch.zeros(out_channel)) if bias else None

    def forward(self, input):
        """Convolve ``input`` with the rescaled kernel."""
        scaled_weight = self.weight * self.scale
        return F.conv2d(
            input,
            scaled_weight,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
        )

    def __repr__(self):
        out_ch, in_ch, k_size = self.weight.shape[:3]
        return (
            f'{self.__class__.__name__}({in_ch}, {out_ch},'
            f' {k_size}, stride={self.stride}, padding={self.padding})'
        )
- lr_mul
参数更新的过程中,针对卷积核参数自定义设置参数更新的速度,反向传播求导的过程中会增加一个倍数
- self.scale
参考 https://blog.csdn.net/weixin_41943311/article/details/105657139
为了保证在训练过程中尺度的度量标准一致,避免不同大小的卷积核产生不同的影响,需要对卷积的计算结果做归一化,反映在参数上就是权重乘以 1 / sqrt(in_channel × kernel_size²),即除以每个输出通道下卷积核参数个数的平方根
upfirdn2d
def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
    """Upsample, FIR-filter and downsample ``input`` with ``kernel``.

    Dispatches on the module-level ``use_custom_kernel`` flag: the
    ``UpFirDn2d`` autograd function when it is set, ``upfirdn2d_native``
    otherwise. The same ``up`` / ``down`` factor and the same
    ``(pad[0], pad[1])`` pair are used for both spatial axes.
    """
    pad_before, pad_after = pad[0], pad[1]

    if not use_custom_kernel:
        return upfirdn2d_native(
            input, kernel, up, up, down, down, pad_before, pad_after, pad_before, pad_after
        )

    return UpFirDn2d.apply(
        input, kernel, (up, up), (down, down), (pad_before, pad_after, pad_before, pad_after)
    )
用默认的kernel,upfirdn2d_native 函数
def upfirdn2d_native(
    input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
):
    """Pure-PyTorch upsample -> pad/crop -> FIR filter -> downsample.

    Every ``(batch, channel)`` plane is processed independently with the same
    2-D kernel.

    Args:
        input: Tensor of shape ``(batch, channel, height, width)``. It does
            not need to be contiguous.
        kernel: 2-D tensor of shape ``(kernel_h, kernel_w)``.
        up_x, up_y: Upsampling factors; ``factor - 1`` zeros are inserted
            after every sample along width / height.
        down_x, down_y: Step used to decimate the filtered result.
        pad_x0, pad_x1: Zero padding before / after along width. A negative
            value crops that many samples instead.
        pad_y0, pad_y1: Same as above, along height.

    Returns:
        Tensor of shape ``(batch, channel, out_h, out_w)``.
    """
    bs, ch, in_h, in_w = input.shape
    kernel_h, kernel_w = kernel.shape

    # reshape (not view) so that non-contiguous inputs, e.g. a channel slice,
    # are accepted instead of raising a RuntimeError.
    out = input.reshape(bs * ch, in_h, 1, in_w, 1)

    # Zero insertion: grow the two singleton axes to the upsampling factor,
    # then fold them into height / width.
    if up_x > 1 or up_y > 1:
        out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1])
    out = out.reshape(bs * ch, 1, in_h * up_y, in_w * up_x)

    # Positive pads add zeros ...
    if pad_x0 > 0 or pad_x1 > 0 or pad_y0 > 0 or pad_y1 > 0:
        out = F.pad(
            out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]
        )
    # ... and negative pads crop.
    out = out[
        :,
        :,
        max(-pad_y0, 0) : out.shape[2] - max(-pad_y1, 0),
        max(-pad_x0, 0) : out.shape[3] - max(-pad_x1, 0),
    ]

    # F.conv2d computes a cross-correlation; flipping the kernel turns it
    # into a true convolution.
    w = torch.flip(kernel, [0, 1]).reshape(1, 1, kernel_h, kernel_w)
    out = F.conv2d(out, w)

    out = out[:, :, ::down_y, ::down_x]
    return out.reshape(bs, ch, out.shape[2], out.shape[3])
1. F.pad() https://blog.csdn.net/binbinczsohu/article/details/106359426 对每个维度的开头和结尾进行填充
2. 第一次填充:利用填充操作,扩充feature map的大小,为上采样做准备,在宽和高上做倍数的增加
out = input.view(-1, in_h, 1, in_w, 1, minor)
if up_x > 1 or up_y > 1:
out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
out = out.view(-1, in_h * up_y, in_w * up_x, minor)
3. 第二次填充:单纯在宽和高上填充,不做倍数变换
4. 卷积操作:根据给定的卷积核进行卷积操作,近似blur
5. 下采样:按照下采样倍数,间隔取出元素作为最后的结果
至此 近似完成Blur操作
Blur
def make_kernel(k):
    """Build a normalised 2-D float32 kernel from ``k``.

    A 1-D ``k`` is expanded to 2-D by taking its outer product with itself;
    a 2-D ``k`` is used as given. The result is divided by its sum.
    """
    kernel = torch.tensor(k, dtype=torch.float32)

    if kernel.dim() == 1:
        # separable kernel: row vector times column vector
        kernel = kernel.unsqueeze(0) * kernel.unsqueeze(1)

    return kernel / kernel.sum()
根据输入构造核
# If antialiasing is used, create a very lightweight Gaussian kernel.
blur_kernel = [1, 2, 1] if self.opt.use_antialias else [1]
class Blur(nn.Module):
    """Filter a feature map with a fixed FIR kernel via ``upfirdn2d``.

    Args:
        kernel: 1-D or 2-D kernel specification, normalised by ``make_kernel``.
        pad: ``(before, after)`` padding applied to both spatial axes.
        upsample_factor: When greater than 1, the kernel is multiplied by
            ``upsample_factor ** 2``.
        reflection_pad: When truthy, padding is done up front with
            ``nn.ReflectionPad2d`` and ``upfirdn2d`` is then called with a
            pad of ``(0, 0)``.
    """

    def __init__(self, kernel, pad, upsample_factor=1, reflection_pad=False):
        super().__init__()

        fir = make_kernel(kernel)
        if upsample_factor > 1:
            fir = fir * (upsample_factor ** 2)
        # A buffer, not a parameter: it follows the module across devices
        # but is not trained.
        self.register_buffer('kernel', fir)

        self.reflection = reflection_pad
        if reflection_pad:
            self.reflection_pad = nn.ReflectionPad2d((pad[0], pad[1], pad[0], pad[1]))
            # Padding already happens above, so upfirdn2d must add none.
            self.pad = (0, 0)
        else:
            self.pad = pad

    def forward(self, input):
        """Optionally reflection-pad ``input``, then filter it."""
        padded = self.reflection_pad(input) if self.reflection else input
        return upfirdn2d(padded, self.kernel, pad=self.pad)
Blur的作用是用blur kernel对特征图做滤波(模糊),配合上下采样使用;forward中调用upfirdn2d时up、down均取默认值1,因此Blur本身不改变采样率,真正的下采样由后面stride=2的卷积完成
nn.ReflectionPad2d https://zhuanlan.zhihu.com/p/351958361
先进行reflectionpad,
再进行upfirdn2d
解释为什么self.pad归零?
如果已经填充后,在upfirdn2d中不再进行填充,否则仍需填充
ConvLayer
class ConvLayer(nn.Sequential):
    """Optional blur / reflection pad, an ``EqualConv2d``, optional activation.

    Sub-modules are registered under the names ``"Blur"`` or ``"RefPad"``,
    ``"Conv"`` and ``"Act"``.

    Args:
        in_channel: Input channels of the convolution.
        out_channel: Output channels of the convolution.
        kernel_size: Kernel size of the convolution.
        downsample: When truthy, prepend a ``Blur`` and use stride 2.
        blur_kernel: Kernel handed to ``Blur`` (only used when downsampling).
        bias: Whether a bias is used. With ``activate`` it lives in the
            ``FusedLeakyReLU``; without ``activate`` it lives in the conv.
        activate: Whether to append an activation module.
        pad: Explicit padding amount; ``None`` derives one from the kernel
            sizes.
        reflection_pad: Use reflection padding rather than the conv's own
            padding.
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        kernel_size,
        downsample=False,
        blur_kernel=[1, 3, 3, 1],
        bias=True,
        activate=True,
        pad=None,
        reflection_pad=False,
    ):
        stages = []

        if downsample:
            factor = 2
            if pad is None:
                pad = (len(blur_kernel) - factor) + (kernel_size - 1)
            # Split the total padding; the leading side gets the odd sample.
            blur_pad = ((pad + 1) // 2, pad // 2)
            stages.append(
                ("Blur", Blur(blur_kernel, pad=blur_pad, reflection_pad=reflection_pad))
            )
            stride = 2
            conv_padding = 0
        else:
            stride = 1
            conv_padding = pad if pad is not None else kernel_size // 2
            if reflection_pad:
                stages.append(("RefPad", nn.ReflectionPad2d(conv_padding)))
                conv_padding = 0

        # Plain int attribute, assigned before nn.Module.__init__ runs.
        self.padding = conv_padding

        conv = EqualConv2d(
            in_channel,
            out_channel,
            kernel_size,
            padding=conv_padding,
            stride=stride,
            # With an activation, the bias is owned by FusedLeakyReLU.
            bias=bias and not activate,
        )
        stages.append(("Conv", conv))

        if activate:
            act = FusedLeakyReLU(out_channel) if bias else ScaledLeakyReLU(0.2)
            stages.append(("Act", act))

        super().__init__(OrderedDict(stages))

    def forward(self, x):
        """Run the registered stages in order."""
        return super().forward(x)
继承自nn.Sequential
带有特殊参数
downsample
:bool 卷积过程中是否需要下采样(缩小feature map的大小)
blur_kernel
: list 在下采样的过程中使用的核
activate
:bool 是否在卷积之后添加激活函数
reflection_pad
:bool 是否使用nn.ReflectionPad2d()
- 使用下采样:stride=2,加入Blur层
- 不使用下采样:stride=1,reflection_pad为True时加入RefPad层
- 卷积层:EqualConv2d
- 激活函数:bias → FusedLeakyReLU;not bias → ScaledLeakyReLU(0.2)
FusedleakyRelu
class FusedLeakyReLU(nn.Module):
    """Module wrapper around ``fused_leaky_relu`` with a learnable bias.

    Args:
        channel: Length of the zero-initialised bias vector.
        negative_slope: Slope passed through to ``fused_leaky_relu``.
        scale: Output gain passed through to ``fused_leaky_relu``.
    """

    def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5):
        super().__init__()
        self.negative_slope, self.scale = negative_slope, scale
        self.bias = nn.Parameter(torch.zeros(channel))

    def forward(self, input):
        """Apply bias, leaky ReLU and gain to ``input``."""
        return fused_leaky_relu(
            input, self.bias, negative_slope=self.negative_slope, scale=self.scale
        )
def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
    """Add ``bias`` along dim 1, apply leaky ReLU, then multiply by ``scale``.

    Uses ``FusedLeakyReLUFunction`` when the module-level
    ``use_custom_kernel`` flag is set, and plain PyTorch ops otherwise.
    """
    if use_custom_kernel:
        return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)

    # Reshape the bias to (1, C, 1, ..., 1) so it broadcasts over every
    # dimension except dim 1.
    trailing = input.dim() - 2
    per_channel = bias.view(1, -1, *([1] * trailing))
    activated = F.leaky_relu(input + per_channel, negative_slope)
    return activated * scale
在算leaky relu之前手动加了一个bias
ScaledLeakyReLU
class ScaledLeakyReLU(nn.Module):
    """Leaky ReLU whose output is multiplied by ``sqrt(2)``.

    Args:
        negative_slope: Slope used for negative inputs.
    """

    def __init__(self, negative_slope=0.2):
        super().__init__()
        self.negative_slope = negative_slope

    def forward(self, input):
        """Return ``sqrt(2) * leaky_relu(input)``."""
        return math.sqrt(2) * F.leaky_relu(input, negative_slope=self.negative_slope)
leaky relu https://zhuanlan.zhihu.com/p/172254089
ResBlock
class ResBlock(nn.Module):
    """Residual block: two 3x3 ``ConvLayer``s plus a 1x1 skip ``ConvLayer``.

    Args:
        in_channel: Channels of the input; also the width of the first conv.
        out_channel: Channels produced by the second conv and by the skip.
        blur_kernel: Blur kernel forwarded to the second conv and the skip.
        reflection_pad: Forwarded to the two main convs (not to the skip).
        pad: Forwarded to the two main convs (not to the skip).
        downsample: Forwarded to the second conv and the skip.
    """

    def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1], reflection_pad=False, pad=None, downsample=True):
        super().__init__()

        padding_opts = dict(reflection_pad=reflection_pad, pad=pad)
        resample_opts = dict(downsample=downsample, blur_kernel=blur_kernel)

        self.conv1 = ConvLayer(in_channel, in_channel, 3, **padding_opts)
        self.conv2 = ConvLayer(in_channel, out_channel, 3, **resample_opts, **padding_opts)
        # Linear 1x1 projection so the shortcut matches the main branch.
        self.skip = ConvLayer(
            in_channel, out_channel, 1, activate=False, bias=False, **resample_opts
        )

    def forward(self, input):
        """Return ``(conv2(conv1(input)) + skip(input)) / sqrt(2)``."""
        residual = self.conv2(self.conv1(input))
        shortcut = self.skip(input)
        return (residual + shortcut) / math.sqrt(2)
两个卷积层以及skip layer 使用ConvLayer
在skip和conv结果相加后,方差加倍,为了消除方差变化的影响需要除以根号2,在一般的resnet中通过batchnormalization来消除这种影响(StyleGAN2论文中有解释)
目的是为了避免texture code中编码了位置信息,使用reflectpad
EqualLinear
class EqualLinear(nn.Module):
    """Linear layer with a run-time weight scale and optional activation.

    The weight is drawn from a standard normal divided by ``lr_mul`` and is
    multiplied by ``lr_mul / sqrt(in_dim)`` on every forward pass; the bias
    is multiplied by ``lr_mul``. Inputs with more than two dimensions are
    processed as a 1x1 convolution over dim 1.

    Args:
        in_dim: Number of input features (channels).
        out_dim: Number of output features (channels).
        bias: When truthy, create a learnable bias; otherwise ``self.bias``
            is ``None`` and no bias is applied.
        bias_init: Initial value of every bias entry.
        lr_mul: Learning-rate multiplier (see above).
        activation: When truthy, a leaky ReLU with gain follows the
            projection.
    """

    def __init__(
        self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None
    ):
        super().__init__()

        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))

        if bias:
            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
        else:
            self.bias = None

        self.activation = activation
        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
        self.lr_mul = lr_mul

    def _project(self, input, bias):
        """Apply the scaled weight as a linear map or a 1x1 convolution."""
        weight = self.weight * self.scale
        if input.dim() > 2:
            return F.conv2d(input, weight[:, :, None, None], bias=bias)
        return F.linear(input, weight, bias=bias)

    def forward(self, input):
        """Project ``input`` and apply the activation if one is configured."""
        # Guard against bias=False: self.bias is None and cannot be scaled.
        bias = None if self.bias is None else self.bias * self.lr_mul

        if not self.activation:
            return self._project(input, bias)

        out = self._project(input, None)
        if bias is None:
            # No bias to fuse: same slope and gain as fused_leaky_relu's
            # defaults.
            return F.leaky_relu(out, 0.2) * (2 ** 0.5)
        return fused_leaky_relu(out, bias)

    def __repr__(self):
        return (
            f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})'
        )
和nn.Linear()区别
- 处理高维数据、二维数据:高维数据使用1×1卷积,不改变feature map大小,只改变通道的数量
- 设置是否使用激活函数
- 设置参数更新的速度lr_mul
- 对权重进行尺度的缩放
二、StyleGAN2ResnetEncoder 部分
FromRGB layer
# Register a 1x1 ConvLayer named "FromRGB" mapping 3 input channels to self.nc(0) channels.
self.add_module("FromRGB", ConvLayer(3, self.nc(0), 1))
对输入图像的第一次卷积,增加通道的数量,不改变大小
本质:EqualConv2D
- in_channel:3
- out_channel:32
- padding:0
- stride:1
- kernel size:1*1
- input:B*3*H*W
- output:B*32*H*W
DownToSpatialCode
# Spatial-code trunk: start from an empty Sequential and append one ResBlock
# per downsampling step (opt.netE_num_downsampling_sp steps in total).
self.DownToSpatialCode = nn.Sequential()
for i in range(self.opt.netE_num_downsampling_sp):
# Block i maps self.nc(i) -> self.nc(i + 1) channels and is registered as
# "ResBlockDownBy<2**i>". ResBlock is built with reflection_pad=True and
# keeps its default downsample=True.
# NOTE(review): blur_kernel is defined outside this excerpt -- confirm its value.
self.DownToSpatialCode.add_module(
"ResBlockDownBy%d" % (2 ** i),
ResBlock(self.nc(i), self.nc(i + 1), blur_kernel,
reflection_pad=True)
)
四组下采样的残差块进行下采样
本质
nn.Sequential([ResBlock1, ResBlock2, ResBlock3, ResBlock4])
每个resblock都是带下采样的,feature map的大小减半,通道数翻倍
input:B*32*H*W
output:B*512*H/16*W/16
ToSpatialCode
# Channel count at index netE_num_downsampling_sp of the self.nc schedule.
nchannels = self.nc(self.opt.netE_num_downsampling_sp)
# Head named "ToSpatialCode": two 1x1 ConvLayers. The first keeps nchannels
# and uses bias + activation; the second projects to opt.spatial_code_ch with
# a bias but no activation.
self.add_module(
"ToSpatialCode",
nn.Sequential(
ConvLayer(nchannels, nchannels, 1, activate=True, bias=True),
ConvLayer(nchannels, self.opt.spatial_code_ch, kernel_size=1,
activate=False, bias=True)
)
)
获得structure code
两个卷积核大小为1*1的卷积层
改变通道的大小为空间特征最后的数目,不改变feature map的大小
本质
nn.Sequential([Conv1, Conv2])
input:B*512*H/16*W/16
output:B*8*H/16*W/16
DownToGlobalCode
# Global-code trunk: one downsampling ConvLayer per step
# (opt.netE_num_downsampling_gl steps), appended to an empty Sequential.
self.DownToGlobalCode = nn.Sequential()
for i in range(self.opt.netE_num_downsampling_gl):
# Continue the channel schedule where the spatial trunk stopped.
idx_from_beginning = self.opt.netE_num_downsampling_sp + i
# 3x3 ConvLayer with downsample=True, blur_kernel=[1] and pad=0, registered
# as "ConvLayerDownBy<2**idx_from_beginning>"; it maps
# self.nc(idx_from_beginning) -> self.nc(idx_from_beginning + 1) channels.
self.DownToGlobalCode.add_module(
"ConvLayerDownBy%d" % (2 ** idx_from_beginning),
ConvLayer(self.nc(idx_from_beginning),
self.nc(idx_from_beginning + 1), kernel_size=3,
blur_kernel=[1], downsample=True, pad=0)
)
本质
nn.Sequential([Conv1, Conv2])
input:B*512*H/16*W/16
output:B*2048*H/64*W/64
使用两个卷积核大小为3*3的卷积层,通道翻倍,feature map大小减半
因为填充和卷积运算的原因,此处不能严格整除64
ToGlobalCode
# Channel count at the end of the schedule: spatial plus global downsampling steps.
nchannels = self.nc(self.opt.netE_num_downsampling_sp + self.opt.netE_num_downsampling_gl)
# Head named "ToGlobalCode": a single EqualLinear (default bias, no
# activation) mapping nchannels -> opt.global_code_ch.
self.add_module(
"ToGlobalCode",
nn.Sequential(
EqualLinear(nchannels, self.opt.global_code_ch)
)
)
获得texture_code
本质
EqualLinear
input:B*in_dim
output:B*out_dim
相当于一个全连接层 dense layer
normalize
def normalize(v):
    """Scale ``v`` to unit L2 norm along dim 1.

    Args:
        v: A tensor with at least two dimensions, or a list / tuple of such
            tensors (nesting is allowed).

    Returns:
        A tensor of the same shape as ``v``, or a container of the same kind
        (list or tuple) holding the normalised items.
    """
    # isinstance also accepts list / tuple subclasses, unlike type(v) == list.
    if isinstance(v, list):
        return [normalize(item) for item in v]
    if isinstance(v, tuple):
        return tuple(normalize(item) for item in v)

    squared_norm = torch.sum(v ** 2, dim=1, keepdim=True)
    # The epsilon keeps rsqrt finite for all-zero vectors.
    return v * torch.rsqrt(squared_norm + 1e-8)
StyleGAN2ResnetEncoder总体流程
- FromRGB
- DownToSpatialCode
- ToSpatialCode -> 获得空间编码信息(structure code->sp)
- DownToGlobalCode -> 获得全局编码信息(texture code->gl)
- 在texture code后两个维度上取均值
input:B*2048*H/64*W/64
output:B*2048
- ToGlobalCode 全连接层
- sp和gl分别通过normalize获得最后的编码结果(除以对应的L2范数)