1. Patch Embedding
(lower part of Figure 1)
import torch.nn as nn


class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding."""

    def __init__(self, img_size=224, patch_size=16, in_c=3, embed_dim=768, norm_layer=None):
        super().__init__()
        img_size = (img_size, img_size)
        patch_size = (patch_size, patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        # A conv with kernel_size = stride = patch_size cuts the image into
        # non-overlapping patches and projects each patch to embed_dim.
        self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        # [B, C, H, W] -> [B, embed_dim, H/P, W/P] -> [B, embed_dim, num_patches] -> [B, num_patches, embed_dim]
        x = self.proj(x).flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x
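To make the tensor shapes concrete, here is a minimal usage sketch of the PatchEmbed defined above (the img_size, patch_size and embed_dim values are the ViT-Base defaults assumed in the code): a 224×224 RGB image is split into 14×14 = 196 patches, each projected to a 768-dimensional token.

import torch

# Sketch: run PatchEmbed (defined above) on a dummy batch.
patch_embed = PatchEmbed(img_size=224, patch_size=16, in_c=3, embed_dim=768)
imgs = torch.randn(2, 3, 224, 224)   # [B, C, H, W]
tokens = patch_embed(imgs)
print(tokens.shape)                  # torch.Size([2, 196, 768]), 196 = (224 // 16) ** 2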
2. Multi-Head Attention
(the Encoder part of Figure 1; Figure 2 shows the multi-head attention block in detail)
class Attention(nn.Module):
    def __init__(self,
                 dim,                 # token embedding dimension
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.):
        super(Attention, self).__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        # A single linear layer produces Q, K and V for all heads at once.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_ratio)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop_ratio)

    def forward(self, x):
        # x: [B, N, C], where N = num_patches + 1 (class token)
        B, N, C = x.shape
        # [B, N, 3C] -> [B, N, 3, num_heads, head_dim] -> [3, B, num_heads, N, head_dim]
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        # Scaled dot-product attention: [B, num_heads, N, N]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        # Weighted sum of values, heads merged back: [B, N, C]
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
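A quick shape sanity check, as a sketch using the Attention class above (dim=768, num_heads=12 and the sequence length 197 = 196 patches + 1 class token are the ViT-Base values assumed here): attention keeps the [B, N, C] shape, and each of the 12 heads operates on a 768 / 12 = 64-dimensional slice.

import torch

# Sketch: multi-head attention over a ViT-Base-sized token sequence (assumed sizes).
attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 197, 768)    # 197 = 196 patch tokens + 1 class token
out = attn(tokens)
print(out.shape)                     # torch.Size([2, 197, 768])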
3. MLP Module
(the MLP at the top of Figure 2; Figure 3 details the structure of this MLP block)
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # Linear -> GELU -> Dropout -> Linear -> Dropout
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
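In the ViT encoder block the hidden dimension of this MLP is 4× the embedding dimension (mlp_ratio = 4 is assumed here), so for ViT-Base each token is expanded from 768 to 3072 and projected back. A minimal sketch using the Mlp class above:

import torch

# Sketch: the MLP block with the usual 4x expansion (assumed mlp_ratio = 4).
mlp = Mlp(in_features=768, hidden_features=768 * 4, drop=0.)
tokens = torch.randn(2, 197, 768)
out = mlp(tokens)
print(out.shape)                     # torch.Size([2, 197, 768])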