Transformer Model Architecture Code

The code presented here only reproduces the individual components of the model; it is not intended for training. If you need the complete, trainable code, see the link in the comment at the top of the code block below.

#Complete runnable code: see https://github.com/hyunwoongko/transformer
import torch
import torch.nn as nn
import math
#Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self,max_len,d_model,device):
        """
        加载位置编码,用于宇input embedding相加
        :param max_len: 序列的最大长度,也就是最大的位置编码
        :param d_model:模型的dimension,论文里是512
        :param device:cpu or cuda
        """
        super(PositionalEncoding,self).__init__()
        self.encoding=torch.zeros(max_len,d_model,device=device)
        self.encoding.requires_grad=False
        #位置编码不需要计算梯度
        pos=torch.arange(0,max_len,device=device)#生成序列,左闭右开
        pos=torch.unsqueeze(pos,1).to(torch.float32)#转换成2维,变成浮点数

        _2i=torch.arange(0,d_model,device=device)
        _2i=torch.unsqueeze(pos,1).to(torch.float32)

        self.encoding[:,0::2]=torch.sin(pos/(10000**(_2i/d_model)))#偶数列赋值
        self.encoding[:,1::2]=torch.cos(pos/(10000**(_2i/d_model)))#奇数列赋值

    def forward(self,x):
        #x holds token indices of shape [batch_size, seq_len]
        batch_size,seq_len=x.size()
        #seq_len is the actual input length, which is at most max_len
        return self.encoding[:seq_len,:]
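
#Quick illustration for PositionalEncoding (hypothetical sizes, not part of the original article):
#the module takes token indices of shape [batch_size, seq_len] and returns the first seq_len rows
#of the precomputed table, i.e. a [seq_len, d_model] tensor that is broadcast-added to the embedding.
pe=PositionalEncoding(max_len=64,d_model=512,device='cpu')
print(pe(torch.randint(0,100,(2,10))).shape)#torch.Size([10, 512])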

#Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    """
    Compute scaled dot-product attention: softmax(QK^T/sqrt(d_k))V
    """
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
        self.softmax=nn.Softmax(dim=-1)

    def forward(self,q,k,v,mask=None,e=1e-12):
        #multi-head attention has already reshaped q/k/v to add a head dimension,
        #so each tensor has shape [batch,head,seq_len,d_k] with d_k=d_model//head
        batch,head,seq_len,d_k=k.size()
        k_t=k.transpose(2,3)
        score=(q@k_t)/math.sqrt(d_k)#@ is matrix multiplication; the score matrix is seq_len*seq_len
        #the decoder additionally passes a mask
        if mask is not None:
            score=score.masked_fill(mask==0,-10000)#positions where the mask is 0 get a very negative score
        score=self.softmax(score)
        v=score@v
        return v,score
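
#Shape illustration for ScaledDotProductAttention (hypothetical sizes, added for clarity):
#with q, k, v of shape [batch, head, seq_len, d_k], the score matrix is [batch, head, seq_len, seq_len]
#and the output keeps the shape of v.
attn=ScaledDotProductAttention()
q_demo=k_demo=v_demo=torch.randn(2,8,10,64)
out_demo,score_demo=attn(q_demo,k_demo,v_demo)
print(out_demo.shape,score_demo.shape)#torch.Size([2, 8, 10, 64]) torch.Size([2, 8, 10, 10])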

#Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model,d_head):
        super(MultiHeadAttention, self).__init__()
        self.head=d_head#d_head is the number of attention heads
        self.w_q=nn.Linear(d_model,d_model)
        self.w_k=nn.Linear(d_model,d_model)
        self.w_v=nn.Linear(d_model,d_model)
        self.attention=ScaledDotProductAttention()
        self.w_cat=nn.Linear(d_model,d_model)

    def forward(self,q,k,v,mask=None):
        q,k,v=self.w_q(q),self.w_k(k),self.w_v(v)
        #split q, k, v into the multi-head form
        q,k,v=self.split(q),self.split(k),self.split(v)
        value,score=self.attention(q,k,v,mask=mask)
        value=self.concat(value)
        out=self.w_cat(value)
        return out
    def split(self,tensor):
        """
        Split the projected tensor into multiple heads
        :param tensor: [batch,seq_len,d_model]
        :return: [batch,head,seq_len,d_tensor], where d_tensor=d_model//head (=d_q/k/v)
        """
        batch_size,seq_len,d_model=tensor.size()
        d_tensor=d_model//self.head
        tensor=tensor.view(batch_size,seq_len,self.head,d_tensor).transpose(1,2)
        return tensor
    def concat(self,tensor):
        """
        Inverse operation of split
        :param tensor: [batch,head,seq_len,d_tensor]
        :return: [batch,seq_len,d_model]
        """
        batch_size,head,seq_len,d_tensor=tensor.size()
        d_model=head*d_tensor
        tensor=tensor.transpose(1,2).contiguous().view(batch_size,seq_len,d_model)#contiguous() makes the memory layout contiguous so view() can be used
        return tensor
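
#Usage illustration for MultiHeadAttention (hypothetical sizes, added for clarity): an input of
#shape [batch, seq_len, d_model] is projected, split into d_head heads of size d_model//d_head,
#attended over, and concatenated back to [batch, seq_len, d_model].
mha=MultiHeadAttention(d_model=512,d_head=8)
x_demo=torch.randn(2,10,512)
print(mha(x_demo,x_demo,x_demo).shape)#torch.Size([2, 10, 512])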

class LayerNorm(nn.Module):
    def __init__(self,d_model,eps=1e-12):
        """
        Normalize each sample over its feature dimension
        (batch norm, by contrast, normalizes over each mini-batch)
        :param d_model: number of features
        :param eps: small constant that avoids division by zero
        """
        super(LayerNorm, self).__init__()
        self.eps=eps
        self.gamma=nn.Parameter(torch.ones(d_model))#nn.Parameter registers the tensor as a model parameter so the optimizer can update it during training
        self.beta=nn.Parameter(torch.zeros(d_model))
    def forward(self,x):
        #x shape [batch_size,seq_length,d_model]
        #compute mean and variance over the feature (hidden-size) dimension of each position
        mean=x.mean(-1,keepdim=True)#-1 is the last dimension; keepdim keeps the reduced dimension
        #mean shape [batch_size,seq_length,1]
        var=x.var(-1,unbiased=True,keepdim=True)#unbiased variance estimate
        out=(x-mean)/torch.sqrt(var+self.eps)
        out=self.gamma*out+self.beta
        return out
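
#Sanity check for the custom LayerNorm (hypothetical sizes, added for clarity): each position is
#normalized over its feature dimension, so the per-position mean is numerically zero afterwards.
ln=LayerNorm(d_model=512)
out_demo=ln(torch.randn(2,10,512))
print(out_demo.shape,out_demo.mean(-1).abs().max()<1e-4)#torch.Size([2, 10, 512]) tensor(True)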

class PointwiseFeedForward(nn.Module):
    def __init__(self,d_model,d_ff=2048,drop_rate=0.1):
        """
        Two fully connected layers with a ReLU activation in between
        :param d_model: input and output dimension
        :param d_ff: inner (hidden) layer dimension
        :param drop_rate: dropout probability
        """
        super(PointwiseFeedForward, self).__init__()
        self.layer1=nn.Linear(d_model,d_ff)
        self.layer2=nn.Linear(d_ff,d_model)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(p=drop_rate)

    def forward(self,x):
        x=self.layer1(x)
        x=self.relu(x)
        x=self.dropout(x)
        x=self.layer2(x)
        return x
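
#Brief shape check for PointwiseFeedForward (hypothetical sizes, added for clarity): the network is
#applied to every position independently, so only the last dimension is expanded and projected back.
ffn=PointwiseFeedForward(d_model=512,d_ff=2048)
print(ffn(torch.randn(2,10,512)).shape)#torch.Size([2, 10, 512])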

#Encoder block
class EncoderLayer(nn.Module):
    def __init__(self,d_model,n_heads,d_ff,drop_rate):
        super(EncoderLayer, self).__init__()
        self.attention=MultiHeadAttention(d_model=d_model,d_head=n_heads)
        self.norm1=LayerNorm(d_model=d_model)
        self.dropout1=nn.Dropout(p=drop_rate)

        self.ff=PointwiseFeedForward(d_model=d_model,d_ff=d_ff,drop_rate=drop_rate)
        self.norm2=LayerNorm(d_model=d_model)
        self.dropout2=nn.Dropout(p=drop_rate)

    def forward(self,x,src_mask):
        _x=x
        x=self.attention(q=x,k=x,v=x,mask=src_mask)
        x=self.dropout1(x)
        x=self.norm1(_x+x)
        _x=x
        x=self.ff(x)
        x=self.dropout2(x)
        x=self.norm2(_x+x)
        return x
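
#Illustrative pass through a single EncoderLayer (hypothetical sizes, added for clarity). The source
#mask used here is one common choice, shown as an assumption: a padding mask of shape
#[batch, 1, 1, src_len] that broadcasts over the heads and query positions, with 0 marking padding.
enc_layer=EncoderLayer(d_model=512,n_heads=8,d_ff=2048,drop_rate=0.1)
src_tokens=torch.tensor([[5,7,9,0,0]])#padding index assumed to be 0
src_mask_demo=(src_tokens!=0).unsqueeze(1).unsqueeze(2)#[1, 1, 1, 5]
print(enc_layer(torch.randn(1,5,512),src_mask_demo).shape)#torch.Size([1, 5, 512])
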
#Encoder
class Encoder(nn.Module):
    def __init__(self,enc_voc_size,max_len,d_ff,n_head,d_model,drop,device,n_layers=6):
        super(Encoder, self).__init__()
        #embedding step (omitted here; forward expects already-embedded inputs of shape [batch,seq_len,d_model])
        #build the stack of encoder layers
        self.layers=nn.ModuleList([EncoderLayer(d_model=d_model,d_ff=d_ff,n_heads=n_head,drop_rate=drop) for _ in range(n_layers)])

    def forward(self,x,src_mask):
        #the input embedding would be applied here
        #pass the input through every encoder layer in turn
        for layer in self.layers:
            x=layer(x,src_mask)

        return x

#decoder layer
class DecoderLayer(nn.Module):
    def __init__(self,d_model,d_ff,n_head,drop_rate):
        super(DecoderLayer, self).__init__()
        self.attention=MultiHeadAttention(d_model=d_model,d_head=n_head)
        self.norm1=LayerNorm(d_model=d_model)
        self.dropout1=nn.Dropout(p=drop_rate)

        self.enc_dec_attention=MultiHeadAttention(d_model=d_model,d_head=n_head)
        self.norm2=LayerNorm(d_model=d_model)
        self.dropout2=nn.Dropout(p=drop_rate)

        self.ff=PointwiseFeedForward(d_model=d_model,d_ff=d_ff,drop_rate=drop_rate)
        self.norm3=LayerNorm(d_model=d_model)
        self.dropout3=nn.Dropout(p=drop_rate)

    def forward(self,dec,enc,tgt_mask,src_mask):
        _x=dec
        x=self.attention(q=dec,k=dec,v=dec,mask=tgt_mask)
        x = self.dropout1(x)
        x=self.norm1(_x+x)

        if enc is not None:
            _x=x
            x=self.enc_dec_attention(q=x,k=enc,v=enc,mask=src_mask)

            x=self.dropout2(x)
            x=self.norm2(_x+x)
        _x=x
        x=self.ff(x)
        x=self.dropout3(x)
        x=self.norm3(_x+x)
        return x
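
#Illustrative look-ahead (causal) mask for the decoder self-attention (a common construction shown
#as an assumption, not taken from the article): position i may only attend to positions <= i, so the
#mask is lower-triangular; combined with a source mask it drives a single DecoderLayer.
tgt_len=5
look_ahead_mask=torch.tril(torch.ones(tgt_len,tgt_len)).bool()
dec_layer=DecoderLayer(d_model=512,d_ff=2048,n_head=8,drop_rate=0.1)
dec_in=torch.randn(1,tgt_len,512)
enc_out=torch.randn(1,7,512)#hypothetical encoder output with src_len=7
src_mask_demo=torch.ones(1,1,1,7).bool()#no source padding in this toy example
print(dec_layer(dec_in,enc_out,look_ahead_mask,src_mask_demo).shape)#torch.Size([1, 5, 512])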

#decoder
class Decoder(nn.Module):
    def __init__(self,dec_voc_size,max_len,d_model,d_ff,n_head,drop_rate,n_layers):
        super(Decoder, self).__init__()
        #embedding step (omitted here; forward expects already-embedded targets)
        #build the stack of decoder layers
        self.layers=nn.ModuleList([DecoderLayer(d_model=d_model,d_ff=d_ff,n_head=n_head,drop_rate=drop_rate) for _ in range(n_layers)])
        self.linear=nn.Linear(d_model,dec_voc_size)

    def forward(self,tgt,src,tgt_mask,src_mask):
        #the tgt embedding would be applied here
        for layer in self.layers:
            tgt=layer(tgt,src,tgt_mask,src_mask)

        output=self.linear(tgt)
        output=torch.softmax(output,dim=-1)#per-token probabilities over the target vocabulary; for training with nn.CrossEntropyLoss, return the raw logits instead
        return output
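
#End-to-end sketch tying the pieces together (a minimal illustration with hypothetical sizes, not the
#original training setup). Because the Encoder/Decoder above only hint at the embedding step in
#comments, token embeddings and positional encodings are added here outside the modules; the padding
#index 0 and the mask shapes are assumptions.
d_model,n_head,d_ff,n_layers,drop=128,8,512,2,0.1
enc_voc,dec_voc,max_len,device=1000,1000,64,'cpu'

src_tok=torch.randint(1,enc_voc,(2,12))#source token ids (no padding in this toy batch)
tgt_tok=torch.randint(1,dec_voc,(2,9))#target token ids

src_emb=nn.Embedding(enc_voc,d_model)(src_tok)+PositionalEncoding(max_len,d_model,device)(src_tok)
tgt_emb=nn.Embedding(dec_voc,d_model)(tgt_tok)+PositionalEncoding(max_len,d_model,device)(tgt_tok)

src_mask=(src_tok!=0).unsqueeze(1).unsqueeze(2)#[batch, 1, 1, src_len]
tgt_mask=(tgt_tok!=0).unsqueeze(1).unsqueeze(2)&torch.tril(torch.ones(tgt_tok.size(1),tgt_tok.size(1))).bool()#padding + look-ahead

encoder=Encoder(enc_voc_size=enc_voc,max_len=max_len,d_ff=d_ff,n_head=n_head,d_model=d_model,drop=drop,device=device,n_layers=n_layers)
decoder=Decoder(dec_voc_size=dec_voc,max_len=max_len,d_model=d_model,d_ff=d_ff,n_head=n_head,drop_rate=drop,n_layers=n_layers)

memory=encoder(src_emb,src_mask)
probs=decoder(tgt_emb,memory,tgt_mask,src_mask)
print(probs.shape)#torch.Size([2, 9, 1000]) - per-token probabilities over the target vocabulary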
