1 modeling.py
1) Imports
# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torchsnooper as ts
import copy
import logging
import math
from os.path import join as pjoin
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.nn import CrossEntropyLoss, Dropout, Softmax, Linear, Conv2d, LayerNorm
from torch.nn.modules.utils import _pair
from scipy import ndimage
import models.configs as configs
logger = logging.getLogger(__name__)
ATTENTION_Q = "MultiHeadDotProductAttention_1/query"
ATTENTION_K = "MultiHeadDotProductAttention_1/key"
ATTENTION_V = "MultiHeadDotProductAttention_1/value"
ATTENTION_OUT = "MultiHeadDotProductAttention_1/out"
FC_0 = "MlpBlock_3/Dense_0"
FC_1 = "MlpBlock_3/Dense_1"
ATTENTION_NORM = "LayerNorm_0"
MLP_NORM = "LayerNorm_2"
2) if __name__ == "__main__"
To make the code in this section easy to inspect directly, a call is added after it that builds the model, creates an input tensor, and produces an output.
if __name__ == "__main__":
    config = CONFIGS['ViT-B_16']          # ViT-B/16 configuration from models/configs.py
    config.split = 'overlap'              # use the overlapping patch split
    net = VisionTransformer(config, num_classes=200)
    x = torch.rand((2, 3, 224, 224)).type(torch.float32)   # dummy batch: 2 RGB images of 224x224
    y = net(x)
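For reference, here is a minimal sketch of the configuration fields this walkthrough relies on. The real values live in models/configs.py; the numbers below are the standard ViT-B/16 settings, and the helper name is invented here purely for illustration.

# Sketch only: standard ViT-B/16 values, not copied from models/configs.py
import ml_collections

def get_b16_config_sketch():
    config = ml_collections.ConfigDict()
    config.hidden_size = 768                        # embedding dimension
    config.transformer = ml_collections.ConfigDict()
    config.transformer.num_heads = 12               # 768 / 12 = 64 dims per head
    config.transformer.num_layers = 12
    config.transformer.mlp_dim = 3072
    config.transformer.attention_dropout_rate = 0.0
    config.transformer.dropout_rate = 0.1
    config.patches = ml_collections.ConfigDict({'size': (16, 16)})
    config.slide_step = 12                          # stride used by the 'overlap' split (assumed default)
    config.classifier = 'token'
    return config

Assuming split = 'overlap' with patch size 16 and slide_step 12 on a 224x224 input, the patch grid is ((224 - 16) // 12 + 1)^2 = 18^2 = 324 patches; adding the class token gives the sequence length of 325 that appears in the shape comments below.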
3) Class Attention
There are multiple attention heads here. Some papers emphasize that multiple heads project the embedding vectors into different subspaces, so each head can attend to a different kind of information, as the small sketch below illustrates. (One does wonder whether a head could learn to attend to something like pose.)
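A toy illustration of that head split (a sketch, not part of modeling.py, assuming hidden_size=768 and 12 heads as in ViT-B/16): each 768-dim token embedding is sliced into 12 chunks of 64 dims, and attention is later computed independently within each chunk.

import torch
# Toy illustration: reshape a batch of token embeddings into per-head chunks,
# assuming hidden_size=768 and num_heads=12.
tokens = torch.rand(2, 325, 768)                           # (batch, seq_len, hidden_size)
heads = tokens.view(2, 325, 12, 64).permute(0, 2, 1, 3)    # (batch, num_heads, seq_len, head_dim)
print(heads.shape)                                         # torch.Size([2, 12, 325, 64])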
class Attention(nn.Module):
    def __init__(self, config):
        super(Attention, self).__init__()
        self.num_attention_heads = config.transformer["num_heads"]
        self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.query = Linear(config.hidden_size, self.all_head_size)
        self.key = Linear(config.hidden_size, self.all_head_size)
        self.value = Linear(config.hidden_size, self.all_head_size)
        self.out = Linear(config.hidden_size, config.hidden_size)
        self.attn_dropout = Dropout(config.transformer["attention_dropout_rate"])
        self.proj_dropout = Dropout(config.transformer["attention_dropout_rate"])
        self.softmax = Softmax(dim=-1)

    def transpose_for_scores(self, x):
        with ts.snoop():
            new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  # (2, 325, 12, 64)
            x = x.view(*new_x_shape)
            return x.permute(0, 2, 1, 3)  # (2, 12, 325, 64)

    def forward(self, hidden_states):
        with ts.snoop():  # torchsnooper.snoop() prints the result of every line as it runs, which helps debugging
            # hidden_states is (2, 325, 768); 325 = 324 + 1 embedding vectors (patch tokens plus the class token).
            # self.query/key/value are separate projections; each output is (2, 325, 768),
            # where 768 = 12 * 64: 12 attention heads, each a 64-dim vector.
            mixed_query_layer = self.query(hidden_states)
            mixed_key_layer = self.key(hidden_states)
            mixed_value_layer = self.value(hidden_states)
            # (2, 12, 325, 64): 12 is the number of heads, 325 the number of embedding vectors.
            # Attention is computed between the vectors within each head, and the 12 head outputs
            # are merged at the end, so transpose_for_scores reshapes accordingly.
            query_layer = self.transpose_for_scores(mixed_query_layer)   # (2, 12, 325, 64)
            key_layer = self.transpose_for_scores(mixed_key_layer)       # (2, 12, 325, 64)
            value_layer = self.transpose_for_scores(mixed_value_layer)   # (2, 12, 325, 64)
            # Similarity q @ k^T; each row of the last two dimensions holds the similarity of
            # every embedding vector to the current one.
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))  # (2, 12, 325, 325)
            attention_scores = attention_scores / math.sqrt(self.attention_head_size)  # q @ k^T / sqrt(d)
            attention_probs = self.softmax(attention_scores)
            weights = attention_probs  # (2, 12, 325, 325)
            attention_probs = self.attn_dropout(attention_probs)
            # Weight the value vectors: softmax(q @ k^T / sqrt(d)) @ v
            context_layer = torch.matmul(attention_probs, value_layer)  # (2, 12, 325, 64)
            # Merge all attention heads
            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # (2, 325, 12, 64)
            new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
            context_layer = context_layer.view(*new_context_layer_shape)  # (2, 325, 768)
            attention_output = self.out(context_layer)  # (2, 325, 768)
            attention_output = self.proj_dropout(attention_output)
            # Outputs: each token after multi-head attention (weights applied, heads merged, and
            # projected back to the embedding dimension), plus the token-to-token similarity
            # (attention) matrices for the 12 heads.
            return attention_output, weights
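Per head, the forward pass above is the textbook scaled dot-product attention, softmax(q k^T / sqrt(d)) v. A quick standalone sketch of that computation on dummy tensors, mirroring the shapes above (dropout omitted; not part of modeling.py):

import math
import torch
import torch.nn.functional as F
# Per-head scaled dot-product attention on dummy tensors.
q = torch.rand(2, 12, 325, 64)
k = torch.rand(2, 12, 325, 64)
v = torch.rand(2, 12, 325, 64)
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(64)  # (2, 12, 325, 325)
probs = F.softmax(scores, dim=-1)                              # each row sums to 1
context = torch.matmul(probs, v)                               # (2, 12, 325, 64)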
Creation and call code:
Block:
self.attn = Attention(config)
------
x, weights = self.attn(x)
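The module can also be exercised on its own. A minimal sketch (the config fields mirror the assumptions above; torchsnooper must be installed because forward() wraps its body in ts.snoop()):

from types import SimpleNamespace
import torch
# Standalone usage sketch, not part of modeling.py.
cfg = SimpleNamespace(
    hidden_size=768,
    transformer={"num_heads": 12, "attention_dropout_rate": 0.0},
)
attn = Attention(cfg)
x = torch.rand(2, 325, 768)                  # (batch, num_tokens, hidden_size)
out, weights = attn(x)
print(out.shape, weights.shape)              # torch.Size([2, 325, 768]) torch.Size([2, 12, 325, 325])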