import paddle
import paddle.nn as nn
import numpy as np
import copy
class Identity(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x):
return x
class MLP(nn.Layer):
def __init__(self, embed_dim, mlp_ratio, dropout=0.):
super().__init__()
self.fc1 = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
self.fc2 = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
return x
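# Quick shape check (not part of the original code, just an illustration): fc1 widens
# the feature dimension by mlp_ratio and fc2 projects it back, so input and output
# shapes are identical. Uncomment to run:
# mlp = MLP(embed_dim=768, mlp_ratio=4.0)
# y = mlp(paddle.randn([4, 198, 768]))
# print(y.shape)  # [4, 198, 768]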
# [4, 3, 224, 224] -> [4, 196 + 2 (extra tokens), 768 (embed_dim)]
class PatchEmbedding(nn.Layer):
def __init__(self, img_size=224, in_channels=3, patch_size=16, embed_dim=768, dropout=0.):
super().__init__()
        n_patches = (img_size // patch_size) ** 2  # 14 * 14 = 196 patches for the default sizes
self.patch_embedding = nn.Conv2D(in_channels=in_channels,
out_channels=embed_dim,
kernel_size=patch_size,
stride=patch_size,
bias_attr=False)
self.dropout = nn.Dropout(dropout)
        # learnable class token, initialised to zeros
        self.cls_token = paddle.create_parameter(shape=[1, 1, embed_dim],
                                                 dtype="float32",
                                                 default_initializer=nn.initializer.Constant(0.))
        # learnable distillation token, initialised with a truncated normal
        self.distill_token = paddle.create_parameter(shape=[1, 1, embed_dim],
                                                     dtype="float32",
                                                     default_initializer=nn.initializer.TruncatedNormal(std=.02))
        # learnable position embeddings for the n_patches + 2 tokens
        self.position_embeddings = paddle.create_parameter(shape=[1, n_patches + 2, embed_dim],
                                                           dtype="float32",
                                                           default_initializer=nn.initializer.TruncatedNormal(std=0.02))
    def forward(self, x):
        # x: [N, C, H, W]
        cls_tokens = self.cls_token.expand([x.shape[0], -1, -1])  # broadcast over the batch: [N, 1, embed_dim]
        distill_tokens = self.distill_token.expand([x.shape[0], -1, -1])  # [N, 1, embed_dim]
        x = self.patch_embedding(x)  # [N, embed_dim, H/patch, W/patch], e.g. [4, 768, 14, 14]
        x = x.flatten(2)  # flatten the spatial dims: [N, embed_dim, n_patches], e.g. [4, 768, 196]
        x = x.transpose([0, 2, 1])  # [N, n_patches, embed_dim], e.g. [4, 196, 768]
        x = paddle.concat([cls_tokens, distill_tokens, x], axis=1)  # [N, n_patches + 2, embed_dim]
        out = x + self.position_embeddings
        out = self.dropout(out)
        return out
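# A small sanity check of the shapes claimed above (illustrative only, using the defaults):
# a [4, 3, 224, 224] batch becomes 14 * 14 = 196 patch embeddings plus the class and
# distillation tokens. Uncomment to run:
# patch_embed = PatchEmbedding()
# tokens = patch_embed(paddle.randn([4, 3, 224, 224]))
# print(tokens.shape)  # [4, 198, 768]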
class Attention(nn.Layer):
    def __init__(self, embed_dim, num_heads, qkv_bias=True, dropout=0., attention_dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = int(embed_dim / num_heads)  # int() because "/" returns a float
        self.all_head_dim = self.head_dim * self.num_heads
        self.scales = self.head_dim ** -0.5  # scale by 1/sqrt(head_dim): scores are computed per head, so head_dim (not all_head_dim) is correct
        self.qkv = nn.Linear(embed_dim, self.all_head_dim * 3,
                             bias_attr=None if qkv_bias else False)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention_dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(axis=-1)
def transpose_multihead(self, x):
# x: [N, num_patches, all_head_dim] -> [N, n_heads, num_patches, head_dim]
new_shape = x.shape[:-1] + [self.num_heads, self.head_dim]
# print("\nshape到底长啥样:", new_shape)#[4, 198, 4, 192]
x = x.reshape(new_shape)
x = x.transpose([0, 2, 1, 3])
return x
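    # Walking through transpose_multihead (illustration only), with embed_dim=768 and the
    # num_heads=4 default used further below:
    #   [4, 198, 768] --reshape--> [4, 198, 4, 192] --transpose--> [4, 4, 198, 192]
    # i.e. each of the 4 heads sees all 198 tokens through its own 192-dim feature slice.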
def forward(self, x):
# [N,num_patches,dim]
qkv = self.qkv(x).chunk(3, axis=-1)
q, k, v = map(self.transpose_multihead, qkv)
attn = paddle.matmul(q, k, transpose_y=True)
attn = attn * self.scales
attn = self.softmax(attn)
        attn = self.attention_dropout(attn)
        out = paddle.matmul(attn, v)  # [N, num_heads, num_patches, head_dim]
        # merge the heads back together:
        # [N, num_heads, num_patches, head_dim] -> [N, num_patches, num_heads, head_dim] -> [N, num_patches, all_head_dim]
        out = out.transpose([0, 2, 1, 3])
        out = out.reshape(out.shape[:-2] + [self.all_head_dim])
        out = self.proj(out)
        out = self.dropout(out)
        return out
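# End-to-end shape check for Attention (illustrative only): the attention map is
# [N, num_heads, num_patches, num_patches] and the output keeps the input shape.
# Uncomment to run:
# attn_layer = Attention(embed_dim=768, num_heads=4)
# out = attn_layer(paddle.randn([4, 198, 768]))
# print(out.shape)  # [4, 198, 768]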
class EncoderLayer(nn.Layer):
    # pre-norm: LayerNorm before each sub-layer, with a residual connection around each
def __init__(self, embed_dim=768, num_heads=4, qkv_bias=True, mlp_ratio=4.0, dropout=0., attention_dropout=0.):
super().__init__()
self.attn_norm = nn.LayerNorm(embed_dim)
        self.attn = Attention(embed_dim, num_heads, qkv_bias=qkv_bias,
                              dropout=dropout, attention_dropout=attention_dropout)
        self.mlp_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, mlp_ratio, dropout=dropout)
    def forward(self, x):
        # attention sub-layer with residual connection
        h = x
        x = self.attn_norm(x)
        x = self.attn(x)
        x = h + x
        # MLP sub-layer with residual connection
        h = x
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = x + h
        return x
class Encoder(nn.Layer):
def __init__(self, embed_dim, depth):
super().__init__()
layer_list = []
        for i in range(depth):
            # the remaining hyper-parameters fall back to the EncoderLayer defaults
            encoder_layer = EncoderLayer(embed_dim=embed_dim)
            layer_list.append(encoder_layer)
self.layers = nn.LayerList(layer_list)
self.norm = nn.LayerNorm(embed_dim)
def forward(self, x):
for layer in self.layers:
x = layer(x)
x = self.norm(x)
        return x[:, 0], x[:, 1]  # token 0 is the class token, token 1 is the distillation token
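# The encoder therefore returns a pair of [N, embed_dim] feature vectors, one per special
# token (illustrative check, not part of the original script). Uncomment to run:
# encoder = Encoder(embed_dim=768, depth=3)
# cls_feat, distill_feat = encoder(paddle.randn([4, 198, 768]))
# print(cls_feat.shape, distill_feat.shape)  # [4, 768] [4, 768]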
class DeiT(nn.Layer):
def __init__(self,
img_size=224,
patch_size=16,
in_channels=3,
num_classes=1000,
embed_dim=768,
depth=3,
num_heads=8,
mlp_ratio=4,
qkv_bias=True,
dropout=0.,
attention_dropout=0.,
droppath=0.):
super().__init__()
        # [4, 3, 224, 224] -> [4, 196 + 2 (extra tokens), 768 (embed_dim)]
self.patch_embedding = PatchEmbedding(img_size=img_size, in_channels=in_channels, patch_size=patch_size,
embed_dim=embed_dim, dropout=dropout)
        self.encoder = Encoder(embed_dim=embed_dim, depth=depth)
        self.head = nn.Linear(embed_dim, num_classes)  # classification head on the class token
        self.head_distill = nn.Linear(embed_dim, num_classes)  # distillation head on the distillation token
    def forward(self, x):
        x = self.patch_embedding(x)
        x, x_distill = self.encoder(x)
        x = self.head(x)
        x_distill = self.head_distill(x_distill)
        if self.training:
            # during training the two predictions are returned separately so each can be supervised
            return x, x_distill
        else:
            # at inference the two heads are simply averaged
            return (x + x_distill) / 2
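# Behaviour sketch (illustrative only): in train mode the model returns two logit tensors,
# in eval mode it returns their average as a single [N, num_classes] tensor. Uncomment to run:
# model = DeiT(depth=3)
# model.train()
# logits_cls, logits_distill = model(paddle.randn([4, 3, 224, 224]))  # two [4, 1000] tensors
# model.eval()
# logits = model(paddle.randn([4, 3, 224, 224]))  # one [4, 1000] tensor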
def main():
    model = DeiT()
    # print(model)
    paddle.summary(model, (4, 3, 224, 224))
if __name__ == "__main__":
    main()
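# How the two heads are typically trained in DeiT (hard distillation; not implemented in
# this script). A hedged sketch that assumes a pretrained teacher network `teacher` and
# integer ground-truth `labels` are available:
# import paddle.nn.functional as F
# logits_cls, logits_distill = model(images)  # model in train mode returns both heads
# with paddle.no_grad():
#     teacher_labels = paddle.argmax(teacher(images), axis=-1)  # hard targets from the teacher
# loss = 0.5 * F.cross_entropy(logits_cls, labels) + \
#        0.5 * F.cross_entropy(logits_distill, teacher_labels)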