前言
以下内容为小白学习vit内容记录,如理解有误,望帮助指出修正。基于Paddle框架学习,aistudio课程即可学习。此次记录课程中图像与Transformer基础的学习内容。
一、Transformer主体
大致结构如图所示,输入的图片经过PatchEmbedding后划分成image Tokens输入到模型中,经过Encoders后(大致是像CNN一个个卷积模块,但是结构不同),最后Linear、softmax进行分类。这次课程主要介绍了VIT中的PatchEmbedding、MLP以及简单展示Encoder的结构。
二、PatchEmbedding
对于一个输入的图片张量(维度可能[H,W,3]),把它切成一块块拟定大小(win_w,win_h)的特征图,沿着光栅方向(行方向)拼接,每一个方块就是一个image patch,并对每一个特征方块映射得到自己拟定的embed_dim通道大小。自己的解释十分粗糙。
将卷积块在输入图片中滑动,在vit中卷积大小为7*7,其步长和卷积核大小相同(这里就可以将图片按着前面的说法切成一块块拟定大小),out_channels就等于我们的embed_dim,就可以将每一块特征图映射。
这张图似乎更好理解(转载他人博客)
下面为PatchEmbedding的代码
import paddle
import paddle.nn
import numpy as np
class PatchEmbedding(paddle.nn.Layer):
    """Split an image into non-overlapping patches and embed each patch.

    A Conv2D whose kernel_size equals its stride is equivalent to cutting
    the image into patch_size x patch_size tiles and applying one shared
    linear projection (to embed_dim channels) to every tile.
    """

    def __init__(self, input_size, in_channels, patch_size, embed_dim, dropout=0.):
        # NOTE(review): input_size is currently unused; it is kept so the
        # constructor signature stays compatible with existing callers.
        super().__init__()
        self.patch_embedding = paddle.nn.Conv2D(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.dropout = paddle.nn.Dropout(dropout)

    def forward(self, x):
        # x: [batch, in_channels, H, W], e.g. [4, 3, 224, 224]
        patches = self.patch_embedding(x)        # [batch, embed_dim, H/p, W/p]
        patches = patches.flatten(2)             # [batch, embed_dim, num_patches]
        patches = patches.transpose((0, 2, 1))   # [batch, num_patches, embed_dim]
        return self.dropout(patches)
# Sanity check: a batch of four 224x224 RGB images with patch_size=7 yields
# (224/7)^2 = 1024 patches, each embedded into 16 channels.
t = paddle.randn([4, 3, 224, 224])
model = PatchEmbedding(224,3,7,16)
y = model(t)
print(f'out shape {y.shape}')
### out shape [4, 1024, 16]
三、MLP
课程中没详细说明,不过大致就是全连接层的作用(个人理解)
import paddle
import paddle.nn
import numpy as np
class Mlp(paddle.nn.Layer):
    """Transformer feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, embed_dim, mlp_ratio=4.0, dropout=0.):
        super().__init__()
        hidden_dim = int(embed_dim * mlp_ratio)
        self.fc1 = paddle.nn.Linear(embed_dim, hidden_dim)
        self.fc2 = paddle.nn.Linear(hidden_dim, embed_dim)
        # ViT uses GELU rather than ReLU as its activation.
        self.act = paddle.nn.GELU()
        self.dropout = paddle.nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.dropout(self.fc2(hidden))
四、Encoder
参考vit中的vit结构图堆叠。(很像resnet中的卷积块)
这次的课程中未完全讲解Encoder,所以attention部分暂时什么也不做,只做恒等传递,下次的课程中会补上。对于LayerNorm层,文档中的解释是:其实现了层归一化(Layer Normalization)的功能,可以应用于小批量输入数据。(感觉应该和目标检测中常用的BatchNorm很像)
import paddle
import paddle.nn
class Atten(paddle.nn.Layer):
    """Placeholder for self-attention; currently the identity mapping."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Attention is covered in a later lesson; pass the input through unchanged.
        return x
class Encoder(paddle.nn.Layer):
    """Pre-norm Transformer encoder block.

    Structure: LayerNorm -> attention -> residual add,
    then LayerNorm -> MLP -> residual add.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.atten = Atten()
        self.layer_nomer = paddle.nn.LayerNorm(embed_dim)
        self.mlp = Mlp(embed_dim)
        self.mlp_nomer = paddle.nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Attention sub-block with skip connection.
        residual = x
        x = self.atten(self.layer_nomer(x))
        x = residual + x
        # MLP sub-block with skip connection.
        residual = x
        x = self.mlp(self.mlp_nomer(x))
        return residual + x
这几个模块的完成,vit各个模块部分已经完成,按着开头的图片将模块拼接就可以得到VIT的主体。
五、VIT
import paddle
import paddle.nn
import numpy as np
class Atten(paddle.nn.Layer):
    """Attention stub used until the real self-attention lesson; identity for now."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # No-op: return the tokens exactly as received.
        return x
class PatchEmbedding(paddle.nn.Layer):
    """Tile the input image into patches and linearly embed each tile.

    Implemented as a strided Conv2D (kernel_size == stride == patch_size),
    which projects every non-overlapping patch to embed_dim channels.
    """

    def __init__(self, input_size, in_channels, patch_size, embed_dim, dropout=0.):
        # NOTE(review): input_size is not used here; retained for caller compatibility.
        super().__init__()
        self.patch_embedding = paddle.nn.Conv2D(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.dropout = paddle.nn.Dropout(dropout)

    def forward(self, x):
        tokens = self.patch_embedding(x)      # [batch, embed_dim, H/p, W/p]
        tokens = tokens.flatten(2)            # [batch, embed_dim, num_patches]
        tokens = tokens.transpose((0, 2, 1))  # [batch, num_patches, embed_dim]
        return self.dropout(tokens)
class Mlp(paddle.nn.Layer):
    """Position-wise feed-forward network used inside each encoder block."""

    def __init__(self, embed_dim, mlp_ratio=4.0, dropout=0.):
        super().__init__()
        hidden = int(embed_dim * mlp_ratio)
        # Expand to hidden width, then project back to embed_dim.
        self.fc1 = paddle.nn.Linear(embed_dim, hidden)
        self.fc2 = paddle.nn.Linear(hidden, embed_dim)
        self.act = paddle.nn.GELU()  # ViT's activation of choice
        self.dropout = paddle.nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(self.act(self.fc1(x)))
        x = self.dropout(self.fc2(x))
        return x
class Encoder(paddle.nn.Layer):
    """One pre-norm ViT encoder layer: norm/attention and norm/MLP, each with a residual."""

    def __init__(self, embed_dim):
        super().__init__()
        self.atten = Atten()
        self.layer_nomer = paddle.nn.LayerNorm(embed_dim)
        self.mlp = Mlp(embed_dim)
        self.mlp_nomer = paddle.nn.LayerNorm(embed_dim)

    def forward(self, x):
        # First sub-block: normalize, attend, add skip connection.
        skip = x
        x = skip + self.atten(self.layer_nomer(x))
        # Second sub-block: normalize, feed-forward, add skip connection.
        skip = x
        x = skip + self.mlp(self.mlp_nomer(x))
        return x
class Vit(paddle.nn.Layer):
    """Minimal ViT: patch embedding -> stacked encoders -> global average pool -> linear head.

    All arguments default to the values the original hard-coded, so existing
    callers (``Vit()``) are unaffected.

    Args:
        input_size (int): input image height/width (passed through to PatchEmbedding).
        in_channels (int): number of input image channels.
        patch_size (int): side length of each square patch.
        embed_dim (int): embedding width of every token.
        depth (int): number of stacked Encoder layers.
        num_classes (int): output dimension of the classification head.
    """

    def __init__(self, input_size=224, in_channels=3, patch_size=7,
                 embed_dim=16, depth=5, num_classes=10):
        super().__init__()
        self.PatchEmbedding = PatchEmbedding(input_size=input_size,
                                             in_channels=in_channels,
                                             patch_size=patch_size,
                                             embed_dim=embed_dim)
        # BUG FIX: the original built ONE Encoder instance and repeated it
        # five times ([encoder for i in range(5)]), so every "layer" shared a
        # single set of weights. Each depth level must be a distinct module.
        self.Encoder = paddle.nn.Sequential(
            *[Encoder(embed_dim) for _ in range(depth)])
        self.avgpool = paddle.nn.AdaptiveAvgPool1D(1)
        self.head = paddle.nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.PatchEmbedding(x)     # [B, num_patches, embed_dim]
        x = self.Encoder(x)            # [B, num_patches, embed_dim]
        x = x.transpose((0, 2, 1))     # [B, embed_dim, num_patches] for 1D pooling
        x = self.avgpool(x)            # [B, embed_dim, 1]
        x = x.flatten(1)               # [B, embed_dim]
        return self.head(x)            # [B, num_classes]
def main():
    """Smoke-test the model: run a random 4-image batch and print the output shape."""
    batch = paddle.randn((4, 3, 224, 224))
    net = Vit()
    print(net(batch).shape)


if __name__ == '__main__':
    main()
总结
尝试入门深度学习,其中的理解与解释比较牵强,后面会不断学习此课程,学习记录。