【transformer】【pytorch】TransFG Code【modeling.py】


1 modeling.py

1) Import packages
# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torchsnooper as ts

import copy
import logging
import math

from os.path import join as pjoin

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from torch.nn import CrossEntropyLoss, Dropout, Softmax, Linear, Conv2d, LayerNorm
from torch.nn.modules.utils import _pair
from scipy import ndimage

import models.configs as configs

logger = logging.getLogger(__name__)

ATTENTION_Q = "MultiHeadDotProductAttention_1/query"
ATTENTION_K = "MultiHeadDotProductAttention_1/key"
ATTENTION_V = "MultiHeadDotProductAttention_1/value"
ATTENTION_OUT = "MultiHeadDotProductAttention_1/out"
FC_0 = "MlpBlock_3/Dense_0"
FC_1 = "MlpBlock_3/Dense_1"
ATTENTION_NORM = "LayerNorm_0"
MLP_NORM = "LayerNorm_2"
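These string constants are the parameter names used in the official ViT checkpoint (an .npz file exported from JAX/Flax). Later in modeling.py they are joined with pjoin to look up the pretrained weights when loading the backbone. A minimal, self-contained sketch of that lookup pattern follows; the weights dict, block index, and zero array are stand-ins for the real checkpoint, not actual values:

import numpy as np
import torch
from os.path import join as pjoin

ATTENTION_Q = "MultiHeadDotProductAttention_1/query"

# Stand-ins for the real loaded checkpoint and encoder-block index (hypothetical values):
n_block, hidden_size = 0, 768
ROOT = "Transformer/encoderblock_%d" % n_block
weights = {pjoin(ROOT, ATTENTION_Q, "kernel"): np.zeros((hidden_size, hidden_size), dtype=np.float32)}

# Build the checkpoint key and convert the JAX kernel to a PyTorch Linear weight (transposed):
q_kernel = weights[pjoin(ROOT, ATTENTION_Q, "kernel")]
q_weight = torch.from_numpy(q_kernel).view(hidden_size, hidden_size).t()
print(q_weight.shape)  # torch.Size([768, 768])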
2) if __name__ == "__main__"
To make the code in this post easy to inspect directly, a call is added after the module code: it builds the model, creates an input tensor, and runs a forward pass.
if __name__ == "__main__":
    config = CONFIGS['ViT-B_16']   # CONFIGS (defined later in modeling.py) maps names like 'ViT-B_16' to config objects from models/configs.py
    config.split = 'overlap'       # overlapping patch split used by TransFG
    net = VisionTransformer(config, num_classes=200)
    x = torch.rand((2, 3, 224, 224)).type(torch.float32)
    y = net(x)
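As a quick check of the output, one can print its shape right after the forward pass; assuming VisionTransformer.forward returns the class logits when no labels are passed, y should have shape (2, 200):

    print(y.shape)  # expected: torch.Size([2, 200]) -- assumption: forward returns logits when labels is None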
3) Class Attention

This module uses multiple attention heads. Some articles stress that multiple heads project the embedding vectors into different subspaces, so each head can attend to a different kind of information. Neat; I wonder whether one of them could learn to attend to pose? (A small shape-only illustration of the head split follows below.)
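Concretely, the "different subspaces" are just slices of the projected vector: transpose_for_scores in the class below views the 768-dim projection as 12 independent 64-dim vectors, one per head. A shape-only toy illustration (the tensors here are random stand-ins):

import torch

tokens = torch.rand(2, 325, 768)           # (batch, tokens, hidden_size), same shapes as in the class below
per_head = tokens.view(2, 325, 12, 64)     # split hidden_size into 12 heads x 64 dims
per_head = per_head.permute(0, 2, 1, 3)    # (batch, heads, tokens, head_dim)
print(per_head[:, 0].shape)                # head 0's 64-dim view of every token: torch.Size([2, 325, 64])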

class Attention(nn.Module):
    def __init__(self, config):
        super(Attention, self).__init__()
        self.num_attention_heads = config.transformer["num_heads"]
        self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = Linear(config.hidden_size, self.all_head_size)
        self.key = Linear(config.hidden_size, self.all_head_size)
        self.value = Linear(config.hidden_size, self.all_head_size)

        self.out = Linear(config.hidden_size, config.hidden_size)
        self.attn_dropout = Dropout(config.transformer["attention_dropout_rate"])
        self.proj_dropout = Dropout(config.transformer["attention_dropout_rate"])

        self.softmax = Softmax(dim=-1)

    def transpose_for_scores(self, x):
        with ts.snoop():
            new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  # (2, 325, 12, 64)
            x = x.view(*new_x_shape)
            return x.permute(0, 2, 1, 3)  # (2, 12, 325, 64)

    def forward(self, hidden_states):
        with ts.snoop():  # torchsnooper.snoop() prints the result of every line, which helps debugging

            # hidden_states is (2, 325, 768); 325 = 324 + 1 token embeddings (patches + CLS token).
            # self.query/key/value are separate projections, each with output (2, 325, 768);
            # 768 = 12 * 64, i.e. 12 attention heads of 64 dimensions each.
            mixed_query_layer = self.query(hidden_states)
            mixed_key_layer = self.key(hidden_states)
            mixed_value_layer = self.value(hidden_states)

            # (2, 12, 325, 64): 12 is the number of heads, 325 the number of token embeddings.
            # Attention is computed between embeddings within each head, and the 12 head outputs
            # are merged afterwards, so transpose_for_scores reshapes the tensors here.
            query_layer = self.transpose_for_scores(mixed_query_layer)  # (2, 12, 325, 64)
            key_layer = self.transpose_for_scores(mixed_key_layer)      # (2, 12, 325, 64)
            value_layer = self.transpose_for_scores(mixed_value_layer)  # (2, 12, 325, 64)

            # Similarity q @ k^T: each row of the last two dimensions holds the similarity
            # of every embedding to the current one.
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))  # (2, 12, 325, 325)
            attention_scores = attention_scores / math.sqrt(self.attention_head_size)  # q @ k^T / sqrt(d)
            attention_probs = self.softmax(attention_scores)
            weights = attention_probs  # (2, 12, 325, 325)
            attention_probs = self.attn_dropout(attention_probs)

            # Weight the value vectors: softmax(q @ k^T / sqrt(d)) @ v
            context_layer = torch.matmul(attention_probs, value_layer)  # (2, 12, 325, 64)
            # Merge all attention heads back into one vector per token
            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # (2, 325, 12, 64)
            new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
            context_layer = context_layer.view(*new_context_layer_shape)  # (2, 325, 768)
            attention_output = self.out(context_layer)  # (2, 325, 768)
            attention_output = self.proj_dropout(attention_output)
            # Returns the multi-head attention output projected back to the embedding dimension,
            # plus the 12 heads' token-to-token similarity weights.
            return attention_output, weights
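For reference, the computation in forward is the standard scaled dot-product attention, applied independently in each head:

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V, \qquad d_k = \frac{\text{hidden\_size}}{\text{num\_heads}} = \frac{768}{12} = 64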

Where it is created and called:

In the Block module:
    self.attn = Attention(config)
    ...
    x, weights = self.attn(x)
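Beyond the call site inside Block, the module can also be exercised on its own. A minimal sanity check, assuming the imports and the Attention class above are in scope (SimpleNamespace here is a hypothetical stand-in for the real ViT-B/16 config object; since the class wraps its body in ts.snoop(), this also prints a line-by-line trace):

import torch
from types import SimpleNamespace

# Hypothetical minimal config mirroring the ViT-B/16 hyperparameters used above
cfg = SimpleNamespace(
    hidden_size=768,
    transformer={"num_heads": 12, "attention_dropout_rate": 0.0},
)

attn = Attention(cfg)
hidden_states = torch.rand(2, 325, 768)   # batch of 2, 325 tokens (324 patches + CLS)
out, weights = attn(hidden_states)
print(out.shape)      # torch.Size([2, 325, 768])
print(weights.shape)  # torch.Size([2, 12, 325, 325])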