Building the Transformer model architecture
Bilibili video walkthrough: https://www.bilibili.com/video/BV18S4y1E7QA?p=1&vd_source=f43f8e6ac75d926e15b60a75c043633b
The Transformer is assembled module by module with torch; see the Bilibili video for the detailed explanation.
import copy
import math
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
# Mask tensor for subsequent positions (so a position cannot attend to the future)
def subsequent_mask(size):
    atten_shape = (1, size, size)
    # np.triu with k=1 marks the strictly-upper triangle, i.e. the "future" positions
    mask = np.triu(np.ones(atten_shape), k=1).astype("int8")
    # 1 where attention is allowed, 0 where it is blocked
    return torch.from_numpy(1 - mask)
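# Quick check (an illustrative addition, not from the original video code): for
# size=4 the mask is a (1, 4, 4) lower-triangular tensor of ones, i.e. each
# position may attend to itself and to earlier positions only.
print(subsequent_mask(4))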
# Token embedding layer
class Embeddings(nn.Module):
def __init__(self,d_model,vocab):
super(Embeddings,self).__init__()
self.lut = nn.Embedding(vocab,d_model)
self.d_model = d_model
    def forward(self, x):
        # Scale the embedding by sqrt(d_model), as in the original Transformer paper
        return self.lut(x) * math.sqrt(self.d_model)
# Sinusoidal positional encoding
class PositionalEncodings(nn.Module):
def __init__(self,d_model,dropout,max_size=5000):
super(PositionalEncodings,self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_size,d_model)
        position = torch.arange(0, max_size).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        # Sine on even feature indices, cosine on odd feature indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer("pe",pe)
def forward(self,x):
x = x + Variable(self.pe[:,:x.size(1)],requires_grad=False)
return self.dropout(x)
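# Shape check for embeddings plus positional encodings (illustrative addition with
# made-up sizes, not from the original video code): a batch of 2 sequences of 4
# token ids becomes a (2, 4, 512) tensor of d_model features.
demo_tokens = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])
demo_embed = Embeddings(d_model=512, vocab=1000)
demo_pe = PositionalEncodings(d_model=512, dropout=0.1)
print(demo_pe(demo_embed(demo_tokens)).shape)  # torch.Size([2, 4, 512])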
# Produce N identical (deep-copied) copies of a module
def clones(model, N):
    return nn.ModuleList([copy.deepcopy(model) for _ in range(N)])
# Scaled dot-product attention
def attention(Q,K,V,mask=None,dropout=None):
d_k = Q.size(-1)
scores = torch.matmul(Q,K.transpose(-2,-1)) / math.sqrt(d_k)
if mask is not None:
scores = scores.masked_fill(mask == 0,-1e9)
p_atten = F.softmax(scores,dim=-1)
if dropout is not None:
p_atten = dropout(p_atten)
return torch.matmul(p_atten,V),p_atten
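# Toy run of scaled dot-product attention (illustrative addition, not from the
# original video code): with identical random Q/K/V of shape (2, 4, 512) and the
# subsequent mask, the output keeps that shape and the weights are (2, 4, 4).
demo_q = demo_k = demo_v = torch.randn(2, 4, 512)
demo_out, demo_p = attention(demo_q, demo_k, demo_v, mask=subsequent_mask(4))
print(demo_out.shape, demo_p.shape)  # torch.Size([2, 4, 512]) torch.Size([2, 4, 4])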
# Multi-head attention
class MultiHeadAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        # The embedding dimension must divide evenly across the heads
        assert embedding_dim % head == 0
self.head = head
self.d_k = embedding_dim // head
self.embedding_dim = embedding_dim
self.linears = clones(nn.Linear(embedding_dim,embedding_dim),4)
self.atten = None
self.dropout = nn.Dropout(p=dropout)
def forward(self,Q,K,V,mask=None):
if mask is not None:
mask = mask.unsqueeze(1)
        batch_size = Q.size(0)
        # Project Q, K, V and split the last dimension into `head` sub-spaces of size d_k
        Q, K, V = \
            [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2) for model, x in zip(self.linears, (Q, K, V))]
        x, self.atten = attention(Q, K, V, mask=mask, dropout=self.dropout)
        # Merge the heads back together before the final output projection
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.d_k)
return self.linears[-1](x)
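# Shape check for multi-head self-attention (illustrative addition, not from the
# original video code): 8 heads over a 512-dim embedding, input and output (2, 4, 512).
demo_mha = MultiHeadAttention(head=8, embedding_dim=512)
demo_x = torch.randn(2, 4, 512)
print(demo_mha(demo_x, demo_x, demo_x, mask=subsequent_mask(4)).shape)  # torch.Size([2, 4, 512])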
# Position-wise feed-forward network
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_dim, d_ff, dropout=0.1):
        super(PositionWiseFeedForward, self).__init__()
        self.L1 = nn.Linear(d_dim, d_ff)
        self.L2 = nn.Linear(d_ff, d_dim)
        self.dropout = nn.Dropout(p=dropout)
def forward(self,x):
return self.L2(self.dropout(F.relu(self.L1(x))))
# Layer normalization
class LayerNorm(nn.Module):
def __init__(self,feature,eps=1e-6):
super(LayerNorm,self).__init__()
self.a1 = nn.Parameter(torch.ones(feature))
self.a2 = nn.Parameter(torch.zeros(feature))
self.eps = eps
def forward(self,x):
mean = x.mean(-1,keepdim=True)
std = x.std(-1,keepdim=True)
return self.a1 * (x - mean) / (std + self.eps) + self.a2
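# Quick check of the layer norm (illustrative addition, not from the original video
# code): each feature vector comes out with roughly zero mean and unit spread.
demo_ln = LayerNorm(512)
demo_y = demo_ln(torch.randn(2, 4, 512))
print(demo_y.mean(-1).abs().max(), demo_y.std(-1).mean())  # close to 0 and 1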
# Sublayer connection: residual add around a pre-normalized, dropout-wrapped sublayer
class SublayerConnection(nn.Module):
    def __init__(self, d_dim, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(d_dim)
        self.dropout = nn.Dropout(p=dropout)
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))
# Encoder layer: self-attention followed by the position-wise feed-forward network
class EncoderLayer(nn.Module):
    def __init__(self, size, self_atten, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.size = size
        self.self_atten = self_atten
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
    def forward(self, x, mask=None):
        x = self.sublayer[0](x, lambda x: self.self_atten(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
# Encoder: a stack of N encoder layers followed by a final layer norm
class Encoder(nn.Module):
def __init__(self,layer,N):
super(Encoder,self).__init__()
self.layers = clones(layer,N)
self.norm = LayerNorm(layer.size)
def forward(self,x,mask):
for layer in self.layers:
x = layer(x,mask)
return self.norm(x)
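# Illustrative encoder-stack check (made-up sizes, not from the original video code):
# two stacked encoder layers over a (2, 4, 512) input leave the shape unchanged.
demo_enc_layer = EncoderLayer(512, MultiHeadAttention(8, 512), PositionWiseFeedForward(512, 2048), 0.1)
demo_encoder = Encoder(demo_enc_layer, 2)
print(demo_encoder(torch.randn(2, 4, 512), None).shape)  # torch.Size([2, 4, 512])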
# Decoder layer: masked self-attention, encoder-decoder attention, then feed-forward
class DecoderLayer(nn.Module):
    def __init__(self, size, self_atten, src_atten, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_atten = self_atten
        self.src_atten = src_atten
        self.feed_forward = feed_forward
        self.sublayers = clones(SublayerConnection(size, dropout), 3)
    def forward(self, x, memory, source_mask, target_mask):
        m = memory
        # Self-attention over the target sequence uses the subsequent-position target mask
        x = self.sublayers[0](x, lambda x: self.self_atten(x, x, x, target_mask))
        # Encoder-decoder attention: queries from the decoder, keys/values from the encoder memory
        x = self.sublayers[1](x, lambda x: self.src_atten(x, m, m, source_mask))
        return self.sublayers[2](x, self.feed_forward)
# Decoder: a stack of N decoder layers followed by a final layer norm
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, memory, source_mask, target_mask):
        for layer in self.layers:
            x = layer(x, memory, source_mask, target_mask)
        return self.norm(x)
# Output layer: project to the vocabulary and apply log-softmax
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        self.project = nn.Linear(d_model, vocab_size)
    def forward(self, x):
        return F.log_softmax(self.project(x), dim=-1)
# Full encoder-decoder architecture
class EncoderDecoder(nn.Module):
def __init__(self,encoder,decoder,source_emd,target_emd,generator):
super(EncoderDecoder,self).__init__()
self.encoder = encoder
self.decoder = decoder
self.source_emd = source_emd
self.target_emd = target_emd
self.generator = generator
def forward(self,source,target,source_mask,target_mask):
return self.decode(self.encode(source,source_mask),source_mask,target,target_mask)
def encode(self,source,source_mask):
return self.encoder(self.source_emd(source),source_mask)
def decode(self,memory,source_mask,target,target_mask):
return self.decoder(self.target_emd(target),memory,source_mask,target_mask)
# Assemble the full Transformer from the modules above
def make_model(source_vocab, target_vocab, N=6, d_model=512, d_ff=2048, head=8, dropout=0.1):
    c = copy.deepcopy
    atten = MultiHeadAttention(head, d_model, dropout)
    ff = PositionWiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncodings(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(atten), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(atten), c(atten), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, source_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, target_vocab), c(position)),
        Generator(d_model, target_vocab)
    )
    # Xavier-initialize every weight matrix
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
if __name__ == '__main__':
source_vocab = target_vocab = 12
model = make_model(source_vocab,target_vocab)
print(model)
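    # Illustrative smoke test (an added example, not from the original video code):
    # push dummy token ids through the assembled model and check the output shapes.
    source = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])
    target = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])
    source_mask = torch.ones(2, 1, 4)
    target_mask = subsequent_mask(4)
    out = model(source, target, source_mask, target_mask)
    print(out.shape)                    # torch.Size([2, 4, 512])
    print(model.generator(out).shape)   # torch.Size([2, 4, 12])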