Building the Transformer Model Structure

Detailed video walkthrough on Bilibili: https://www.bilibili.com/video/BV18S4y1E7QA?p=1&vd_source=f43f8e6ac75d926e15b60a75c043633b

This post builds the Transformer module by module with torch; see the Bilibili video for the full details.

import copy
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# Mask tensor: 1 where attention is allowed, 0 for subsequent (future) positions
def subsequent_mask(size):
    atten_shape = (1,size,size)
    mask = np.triu(np.ones(atten_shape),k=1).astype("int8")
    return torch.from_numpy(1 - mask)
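# Quick check (illustrative, not part of the original post): subsequent_mask(4)
# yields a lower-triangular mask, so position i may only attend to positions 0..i:
# [[[1, 0, 0, 0],
#   [1, 1, 0, 0],
#   [1, 1, 1, 0],
#   [1, 1, 1, 1]]]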
# Word embeddings
class Embeddings(nn.Module):
    def __init__(self,d_model,vocab):
        super(Embeddings,self).__init__()
        self.lut = nn.Embedding(vocab,d_model)
        self.d_model = d_model
    def forward(self,x):
        return self.lut(x) * math.sqrt(self.d_model)
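# Note (a common reading of the paper, stated here as an assumption): scaling the
# embeddings by sqrt(d_model) keeps their magnitude comparable to the positional
# encodings that are added in the next module.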

# Positional encoding
class PositionalEncodings(nn.Module):
    def __init__(self,d_model,dropout,max_size=5000):
        super(PositionalEncodings,self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_size,d_model)
        position = torch.arange(0,max_size).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2) * -(math.log(10000.0) / d_model))
        pe[:,0::2] = torch.sin(position * div_term)
        pe[:,1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe",pe)

    def forward(self,x):
        # pe is a registered buffer, so it carries no gradient
        x = x + self.pe[:,:x.size(1)]
        return self.dropout(x)
def clones(model,N):
    return nn.ModuleList([copy.deepcopy(model) for _ in range(N)])

# Scaled dot-product attention
def attention(Q,K,V,mask=None,dropout=None):
    d_k = Q.size(-1)
    scores = torch.matmul(Q,K.transpose(-2,-1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0,-1e9)
    p_atten = F.softmax(scores,dim=-1)
    if dropout is not None:
        p_atten = dropout(p_atten)
    return torch.matmul(p_atten,V),p_atten
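# Shape check (illustrative): with q = k = v = torch.rand(2,4,8),
# attention(q,k,v) returns a context tensor of shape (2,4,8) and attention
# weights of shape (2,4,4) whose rows sum to 1 along the last dimension.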
# Multi-head attention
class MultiHeadAttention(nn.Module):
    def __init__(self,head,embedding_dim,dropout=0.1):
        super(MultiHeadAttention,self).__init__()
        # the embedding dimension must be divisible by the number of heads
        assert embedding_dim % head == 0
        self.head = head
        self.d_k = embedding_dim // head
        self.embedding_dim = embedding_dim
        self.linears = clones(nn.Linear(embedding_dim,embedding_dim),4)
        self.atten = None
        self.dropout = nn.Dropout(p=dropout)
    def forward(self,Q,K,V,mask=None):

        if mask is not None:
            mask = mask.unsqueeze(1)
        batch_size = Q.size(0)
        Q,K,V = \
        [model(x).view(batch_size,-1,self.head,self.d_k).transpose(1,2) for model,x in zip(self.linears,(Q,K,V))]
        x,self.atten = attention(Q,K,V,mask=mask,dropout=self.dropout)
        x = x.transpose(1,2).contiguous().view(batch_size,-1,self.head * self.d_k)
        return self.linears[-1](x)
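# Shape walk-through (illustrative): with embedding_dim=512 and head=8, each of
# Q, K, V of shape (batch,seq_len,512) is projected and reshaped to
# (batch,8,seq_len,64), attention runs independently per head, and the heads are
# concatenated back to (batch,seq_len,512) before the final linear projection.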

# Position-wise feed-forward network
class PositionWiseFeedForward(nn.Module):
    def __init__(self,d_model,d_ff,dropout=0.1):
        super(PositionWiseFeedForward,self).__init__()
        self.L1 = nn.Linear(d_model,d_ff)
        self.L2 = nn.Linear(d_ff,d_model)
        self.dropout = nn.Dropout(p=dropout)
    def forward(self,x):
        return self.L2(self.dropout(F.relu(self.L1(x))))

# Layer normalization
class LayerNorm(nn.Module):
    def __init__(self,feature,eps=1e-6):
        super(LayerNorm,self).__init__()
        self.a1 = nn.Parameter(torch.ones(feature))
        self.a2 = nn.Parameter(torch.zeros(feature))
        self.eps = eps
    def forward(self,x):
        mean = x.mean(-1,keepdim=True)
        std = x.std(-1,keepdim=True)
        return self.a1 * (x - mean) / (std + self.eps) + self.a2
# Sublayer connection: pre-norm residual wrapper around a sublayer
class SublayerConnection(nn.Module):
    def __init__(self,d_dim,dropout=0.1):
        super(SublayerConnection,self).__init__()
        self.norm = LayerNorm(d_dim)
        self.dropout = nn.Dropout(p=dropout)
    def forward(self,x,sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

# Encoder layer: self-attention followed by feed-forward, each in a residual sublayer
class EncoderLayer(nn.Module):
    def __init__(self,size,self_atten,feed_forward,dropout):
        super(EncoderLayer,self).__init__()
        self.size = size
        self.self_atten = self_atten
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size,dropout),2)
    def forward(self,x,mask=None):
        x = self.sublayer[0](x,lambda x:self.self_atten(x,x,x,mask))
        return self.sublayer[1](x,self.feed_forward)
# Encoder: a stack of N encoder layers
class Encoder(nn.Module):
    def __init__(self,layer,N):
        super(Encoder,self).__init__()
        self.layers = clones(layer,N)
        self.norm = LayerNorm(layer.size)
    def forward(self,x,mask):
        for layer in self.layers:
            x = layer(x,mask)
        return self.norm(x)

# Decoder layer: masked self-attention, source (cross) attention, then feed-forward
class DecoderLayer(nn.Module):
    def __init__(self,size,self_atten,src_atten,feed_forward,dropout):
        super(DecoderLayer,self).__init__()
        self.size = size
        self.self_atten = self_atten
        self.src_atten = src_atten
        self.feed_forward = feed_forward
        self.sublayers = clones(SublayerConnection(size,dropout),3)
    def forward(self,x,memory,source_mask,target_mask):
        m = memory
        # masked self-attention over the target sequence
        x = self.sublayers[0](x,lambda x:self.self_atten(x,x,x,target_mask))
        # cross-attention: queries from the decoder, keys/values from the encoder memory
        x = self.sublayers[1](x,lambda x:self.src_atten(x,m,m,source_mask))
        return self.sublayers[2](x,self.feed_forward)
# Decoder: a stack of N decoder layers
class Decoder(nn.Module):
    def __init__(self,layer,N):
        super(Decoder,self).__init__()
        self.layers = clones(layer,N)
        self.norm = LayerNorm(layer.size)
    def forward(self,x,memory,source_mask,target_mask):
        for layer in self.layers:
            x = layer(x,memory,source_mask,target_mask)
        return self.norm(x)

# Output layer: project to the vocabulary and apply log-softmax
class Generator(nn.Module):
    def __init__(self,d_model,vocab_size):
        super(Generator,self).__init__()
        self.proj = nn.Linear(d_model,vocab_size)
    def forward(self,x):
        return F.log_softmax(self.proj(x),dim=-1)


# Overall encoder-decoder structure
class EncoderDecoder(nn.Module):
    def __init__(self,encoder,decoder,source_embed,target_embed,generator):
        super(EncoderDecoder,self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.generator = generator
    def forward(self,source,target,source_mask,target_mask):
        return self.decode(self.encode(source,source_mask),source_mask,target,target_mask)
    def encode(self,source,source_mask):
        return self.encoder(self.source_embed(source),source_mask)
    def decode(self,memory,source_mask,target,target_mask):
        return self.decoder(self.target_embed(target),memory,source_mask,target_mask)

def make_model(source_vocab,target_vocab,N=6,d_model=512,d_ff=2048,head=8,dropout=0.1):
    c = copy.deepcopy
    atten = MultiHeadAttention(head,d_model,dropout)
    ff = PositionWiseFeedForward(d_model,d_ff,dropout)
    position = PositionalEncodings(d_model,dropout)

    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model,c(atten),c(ff),dropout),N),
        Decoder(DecoderLayer(d_model,c(atten),c(atten),c(ff),dropout),N),
        nn.Sequential(Embeddings(d_model,source_vocab),c(position)),
        nn.Sequential(Embeddings(d_model,target_vocab),c(position)),
        Generator(d_model,target_vocab)
    )
    for p in model.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    return model

if __name__ == '__main__':
    source_vocab = target_vocab = 12
    model = make_model(source_vocab,target_vocab)
    print(model)
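
    # Minimal smoke test (illustrative, not from the original post): run a dummy
    # batch through the model and check the output shapes. Token ids are
    # arbitrary values below the toy vocabulary size of 12.
    source = torch.LongTensor([[1,2,3,4,5,6,7,8]])
    target = torch.LongTensor([[1,2,3,4,5,6,7,8]])
    source_mask = torch.ones(1,1,8)      # no padding in the toy source
    target_mask = subsequent_mask(8)     # block attention to future positions
    out = model(source,target,source_mask,target_mask)
    print(out.shape)                     # expected: torch.Size([1, 8, 512])
    prob = model.generator(out)
    print(prob.shape)                    # expected: torch.Size([1, 8, 12])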