About the Transformer (ViT)
Model Architecture
Example: the input image has shape 3x224x224 (channels x height x width).
Embedded Patches
A convolution is applied over the regions (patches) of the image, converting each patch into a multi-dimensional vector (the number of convolution kernels determines the vector dimension).
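Before the code, a quick check of the shapes implied by the 3x224x224 example and a 16x16 patch size (plain arithmetic, nothing beyond the numbers already used below):
img_size, patch_size = 224, 16
n_patches = (img_size // patch_size) ** 2   # (224 / 16)^2 = 14 * 14 = 196 patches
seq_len = n_patches + 1                     # + 1 cls_token = 197 tokens
hidden_size = 768                           # one 768-dim vector per token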
self.patch_embeddings = Conv2d(in_channels=in_channels,         # color channels, 3
                               out_channels=config.hidden_size, # number of kernels = output channels, 768
                               kernel_size=patch_size,          # kernel size, 16
                               stride=patch_size)               # stride, 16
# 197 vectors of 768 dims each: 196 = 14*14 patches, plus one cls_token (1, 1, 768)
self.position_embeddings = nn.Parameter(torch.zeros(1, n_patches + 1, config.hidden_size))
# (1, 1, 768), used to gather information from every token
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))  # initialized to zeros
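The snippet above only shows the parameters. A minimal sketch of the corresponding forward pass (assuming the module is called Embeddings and has a self.dropout layer, which is not shown in the snippet) would wire them together like this:
def forward(self, x):                                # x: (B, 3, 224, 224)
    B = x.shape[0]
    x = self.patch_embeddings(x)                     # (B, 768, 14, 14)
    x = x.flatten(2).transpose(-1, -2)               # (B, 196, 768): one vector per patch
    cls_tokens = self.cls_token.expand(B, -1, -1)    # (B, 1, 768)
    x = torch.cat((cls_tokens, x), dim=1)            # (B, 197, 768)
    embeddings = x + self.position_embeddings        # add positional information
    return self.dropout(embeddings)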
Norm
As in NLP, batch norm is not used: the same feature position across different samples does not carry comparable information, so the Transformer normalizes each token over its feature dimension with layer norm only.
# LayerNorm instead of BatchNorm, same reasoning as in NLP Transformers
self.encoder_norm = LayerNorm(config.hidden_size, eps=1e-6)
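A quick way to see what this LayerNorm does: it normalizes each 768-dim token vector independently (per sample, per position) rather than normalizing across the batch. A small self-contained check:
import torch
from torch.nn import LayerNorm

norm = LayerNorm(768, eps=1e-6)
x = torch.randn(16, 197, 768)          # (batch, tokens, hidden)
y = norm(x)
print(y.mean(dim=-1).abs().max())      # ~0: each token vector has zero mean
print(y.std(dim=-1).mean())            # ~1: and unit variance over its 768 features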
Multi-head Attention
Here 12 heads are used. The input has shape (16, 197, 768); after the multi-head reshape it becomes (16, 12, 197, 64), each head is processed independently, and the results are concatenated back together at the end.
# set up the multi-head configuration
self.num_attention_heads = config.transformer["num_heads"]
self.attention_head_size = int(config.hidden_size / self.num_attention_heads)  # each head handles a 64-dim slice of every token: 768 / 12 = 64
self.all_head_size = self.num_attention_heads * self.attention_head_size
# linear projections for the attention q, k, v
self.query = Linear(config.hidden_size, self.all_head_size)
self.key = Linear(config.hidden_size, self.all_head_size)
self.value = Linear(config.hidden_size, self.all_head_size)
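The forward pass below calls transpose_for_scores, which is not shown in these notes. It reshapes (16, 197, 768) into (16, 12, 197, 64) so all 12 heads can be computed in parallel; a sketch following the same shape convention:
def transpose_for_scores(self, x):
    # (B, 197, 768) -> (B, 197, 12, 64)
    new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
    x = x.view(*new_x_shape)
    # (B, 197, 12, 64) -> (B, 12, 197, 64): one 197x64 matrix per head
    return x.permute(0, 2, 1, 3)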
Multi-head computation
def forward(self, hidden_states):
    mixed_query_layer = self.query(hidden_states)  # Linear(in_features=768, out_features=768, bias=True)
    mixed_key_layer = self.key(hidden_states)
    mixed_value_layer = self.value(hidden_states)

    query_layer = self.transpose_for_scores(mixed_query_layer)
    key_layer = self.transpose_for_scores(mixed_key_layer)
    value_layer = self.transpose_for_scores(mixed_value_layer)

    # compute the attention scores: QK^T / sqrt(head_size)
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    attention_probs = self.softmax(attention_scores)
    weights = attention_probs if self.vis else None
    attention_probs = self.attn_dropout(attention_probs)

    # weight the values with the attention probabilities; the value dimension stays the same
    context_layer = torch.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
    context_layer = context_layer.view(*new_context_layer_shape)  # merge the heads back to (16, 197, 768)
    attention_output = self.out(context_layer)
    attention_output = self.proj_dropout(attention_output)
    return attention_output, weights
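This is standard scaled dot-product attention, softmax(QK^T / sqrt(d_head)) V, computed for all 12 heads at once. A hedged shape check, assuming an Attention class wired up from the pieces above and a suitable config object:
attn = Attention(config, vis=True)             # hypothetical instantiation
x = torch.randn(16, 197, 768)                  # (batch, tokens, hidden)
out, weights = attn(x)
print(out.shape)                               # torch.Size([16, 197, 768])
print(weights.shape)                           # torch.Size([16, 12, 197, 197]): one 197x197 attention map per head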
MLP
The output of the attention block is then transformed by a feed-forward network; the MLP typically has two fully-connected layers.
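The forward below uses fc1, act_fn, fc2 and dropout, which are created in __init__. A minimal sketch of that part, assuming the usual ViT convention of an intermediate size config.transformer["mlp_dim"] (typically 3072 = 4 x 768), GELU activation, and a config.transformer["dropout_rate"] key:
def __init__(self, config):
    super(Mlp, self).__init__()
    self.fc1 = Linear(config.hidden_size, config.transformer["mlp_dim"])   # 768 -> 3072
    self.fc2 = Linear(config.transformer["mlp_dim"], config.hidden_size)   # 3072 -> 768
    self.act_fn = torch.nn.functional.gelu                                 # GELU activation
    self.dropout = Dropout(config.transformer["dropout_rate"])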
def forward(self, x):
    # first fully-connected layer: expand the hidden dimension
    x = self.fc1(x)
    x = self.act_fn(x)
    x = self.dropout(x)
    # second fully-connected layer: project back to hidden_size
    x = self.fc2(x)
    x = self.dropout(x)
    return x
Residual Connections
class Block(nn.Module):
    def __init__(self, config, vis):
        super(Block, self).__init__()
        self.hidden_size = config.hidden_size
        self.attention_norm = LayerNorm(config.hidden_size, eps=1e-6)
        self.ffn_norm = LayerNorm(config.hidden_size, eps=1e-6)
        self.ffn = Mlp(config)
        self.attn = Attention(config, vis)

    def forward(self, x):
        h = x                        # keep the original input for the residual connection
        x = self.attention_norm(x)   # layer norm (pre-norm)
        x, weights = self.attn(x)    # multi-head attention
        x = x + h                    # residual connection around attention

        h = x
        x = self.ffn_norm(x)
        x = self.ffn(x)
        x = x + h                    # second residual connection, around the MLP
        return x, weights
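With pre-norm and two residual connections per Block, a full encoder is just a stack of such blocks followed by the final encoder_norm shown earlier. A hedged sketch of how the blocks could be stacked and used (the Encoder class and num_layers default here are illustrative assumptions, not the reference code):
class Encoder(nn.Module):                      # hypothetical wrapper, for illustration
    def __init__(self, config, vis, num_layers=12):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([Block(config, vis) for _ in range(num_layers)])
        self.encoder_norm = LayerNorm(config.hidden_size, eps=1e-6)

    def forward(self, x):                      # x: (B, 197, 768) from the embedding step
        for block in self.layers:
            x, _ = block(x)
        x = self.encoder_norm(x)
        return x[:, 0]                         # the cls_token output feeds the classification head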