The annotated code is as follows:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


# 1. drop_path function (Stochastic Depth): a regularization method that randomly
#    skips the residual branch for whole samples during training.
# drop_prob in [0, 1): the probability that a given sample in the batch is dropped
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    # If nothing is dropped, or we are not in training mode, return the input unchanged
    if drop_prob == 0. or not training:
        return x
    # keep_prob: the probability that a given sample in the batch is kept
    keep_prob = 1 - drop_prob
    # shape: one entry per sample plus broadcast dimensions, e.g. for a 4-D input
    # [n, c, h, w] the mask shape is [n, 1, 1, 1]
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    # Draw a per-sample mask from a Bernoulli distribution with success probability
    # keep_prob. E.g. with n = 3 samples, if positions 0 and 2 are kept and position 1
    # is dropped, the mask is [[[1]], [[0]], [[1]]]: kept samples get 1, dropped get 0.
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    # If keep_prob > 0 and scaling is enabled, divide the mask by keep_prob so the
    # expected value of the output equals the input; with keep_prob = 0.5 the mask
    # above becomes [[[2]], [[0]], [[2]]]
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    # x * random_tensor: dropped samples are zeroed out entirely (their activations
    # and gradients become 0), while kept samples are scaled up by 1 / keep_prob
    return x * random_tensor


class DropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    def __init__(self, drop_prob=None, scale_by_keep=True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)


# --------------------------------------#
# 2. GELU activation, implemented with
#    the tanh-based approximation formula
# --------------------------------------#
class GELU(nn.Module):
    def __init__(self):
        super(GELU, self).__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
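To make the per-sample masking concrete, here is a minimal sketch of calling drop_path directly; the batch size and drop probability are illustrative assumptions, and which samples survive varies from run to run:

    # Minimal sketch (illustrative values): per-sample masking and 1/keep_prob rescaling
    x = torch.ones(4, 1, 2, 2)                      # batch of 4 all-ones feature maps
    y = drop_path(x, drop_prob=0.5, training=True)
    # Each sample is either zeroed out entirely or kept and scaled by 1/keep_prob = 2,
    # so the expectation of y equals x; e.g. tensor([2., 0., 2., 2.])
    print(y[:, 0, 0, 0])

The tanh approximation above can likewise be sanity-checked against PyTorch's exact GELU: torch.allclose(GELU()(x), F.gelu(x), atol=1e-3) should hold, since the deviation of the tanh formula from the exact erf-based definition is on the order of 1e-3 at worst.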
# ---------------------------------------------------------------------------------#
# LayerNorm supports two data formats: channels_last (default) or channels_first.
# channels_last corresponds to inputs of shape (batch_size, height, width, channels);
# channels_first corresponds to inputs of shape (batch_size, channels, height, width).
# ---------------------------------------------------------------------------------#
# 3. LayerNorm normalizes over the feature (channel) dimension of each individual
#    sample, whereas BatchNorm normalizes each channel across all samples in the batch.
class LayerNorm(nn.Module):
    # normalized_shape: size of the channel dimension being normalized
    # (the default of 7 matches the demo in __main__ below)
    def __init__(self, normalized_shape=7, eps=1e-6, data_format="channels_last"):
        super().__init__()
        # Learnable per-channel scale applied after normalization
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        # Learnable per-channel shift (bias) applied after normalization
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        # eps: small positive constant added to the denominator to avoid division by zero
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        # channels_last: the channel dimension is last, so F.layer_norm applies directly
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            # channels_first: apply the LayerNorm formula by hand over dim 1 (channels)
            # per-sample mean over the channel dimension
            u = x.mean(1, keepdim=True)
            # per-sample variance: the mean of (x - u) squared
            s = (x - u).pow(2).mean(1, keepdim=True)
            # normalize: (x - u) / sqrt(var + eps)
            x = (x - u) / torch.sqrt(s + self.eps)
            # apply the learnable scale, then add the shift (bias)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x


# --------------------------------------------------------------------------------------------------------------#
# 4. The ConvNeXt Block has two equivalent implementations:
# (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
# (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
# Variant (2) is used here because it is slightly faster in PyTorch.
# --------------------------------------------------------------------------------------------------------------#
class Block(nn.Module):
    def __init__(self, dim=7, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        # --------------------------#
        # 7x7 depthwise convolution
        # --------------------------#
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # --------------------------#
        # Linear layer in place of a 1x1 convolution
        # --------------------------#
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = GELU()
        # --------------------------#
        # Linear layer in place of a 1x1 convolution
        # --------------------------#
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # --------------------------#
        # Layer-scale coefficient
        # --------------------------#
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones(dim),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        # --------------------------#
        # Drop-path regularization
        # --------------------------#
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        # --------------------------#
        # 7x7 depthwise convolution
        # --------------------------#
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        # --------------------------#
        # Linear layer in place of a 1x1 convolution
        # --------------------------#
        x = self.pwconv1(x)
        x = self.act(x)
        # --------------------------#
        # Linear layer in place of a 1x1 convolution
        # --------------------------#
        x = self.pwconv2(x)
        # --------------------------#
        # Layer-scale coefficient
        # --------------------------#
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        # --------------------------#
        # Drop-path regularization
        # --------------------------#
        x = input + self.drop_path(x)
        return x


if __name__ == "__main__":
    input = torch.randn(4, 7, 7, 7)  # (N, C, H, W) with C = dim = 7
    dim = 7
    print(input.shape)
    ss = Block()
    s1 = ss(input)
    print(s1.shape)
    print(s1)
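To check that the hand-written channels_first branch of LayerNorm matches PyTorch's built-in layer norm, you can compare it against the channels_last path on permuted data. A minimal sketch (the shapes and tolerance are illustrative assumptions):

    x = torch.randn(2, 7, 5, 5)  # (N, C, H, W) with C = 7
    ln_first = LayerNorm(7, data_format="channels_first")
    ln_last = LayerNorm(7, data_format="channels_last")
    a = ln_first(x)  # manual normalization over dim 1
    b = ln_last(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)  # F.layer_norm over the last dim
    print(torch.allclose(a, b, atol=1e-5))  # expected: True

Similarly, constructing Block(drop_path=0.5) and toggling .train() / .eval() shows that the stochastic-depth branch only randomizes outputs in training mode; in eval mode the block is deterministic.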