11 (Finale) Building the forward pass of the Transformer framework, finally done; GPT and BERT are next (Bilibili)
### Only the rough structure; mask and padding handling are still missing
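The snippets below assume the usual PyTorch imports; a minimal header so the code runs as-is:

import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F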
1. Embedding construction
class WordEmbedding(nn.Module):
    """
    Map token ids to d_model-dimensional word vectors before they are fed into the encoder.
    """
    def __init__(self, vocab_size, d_model):
        """
        :param vocab_size: vocabulary size
        :param d_model: word-vector dimension
        """
        super(WordEmbedding, self).__init__()
        self.d_model = d_model
        # The vocabulary has vocab_size tokens; each token is mapped to a d_model-dimensional vector
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # Why multiply by sqrt(d_model)? Scaling the embedding up before the addition makes the
        # positional encoding relatively smaller, so the original meaning carried by the embedding
        # vector is not drowned out when the two are added together.
        return self.embed(x) * math.sqrt(self.d_model)
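A quick shape check; the vocabulary size and batch shape here are made up for illustration:

embed = WordEmbedding(vocab_size=1000, d_model=512)
tokens = torch.randint(0, 1000, (2, 32))   # [batch=2, seq_len=32] token ids
print(embed(tokens).shape)                 # torch.Size([2, 32, 512])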
2. Positional encoding
class Positional_Encoding(nn.Module):
    # positional encoding
    def __init__(self, dim, dropout, max_len=5000):
        super(Positional_Encoding, self).__init__()
        if dim % 2 != 0:
            raise ValueError('dimension should be even')
        '''
        Positional-encoding formulas:
        PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        '''
        pe = torch.zeros(max_len, dim)                    # max_len is the longest supported sequence length
        position = torch.arange(0, max_len).unsqueeze(1)  # positions 0..max_len-1, shape [max_len, 1]
        # rewrite 1/10000^(2i/dim) with exp/log so the whole table is computed in parallel (GPU friendly)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)                              # [1, max_len, dim], batch-first like the rest of the code
        self.register_buffer('pe', pe)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim

    def forward(self, emb):
        # emb: [batch, seq_len, dim]; add the encodings of the first seq_len positions
        emb = emb + self.pe[:, :emb.size(1)]
        emb = self.dropout(emb)
        return emb
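Adding the positional encoding is shape-preserving; a random tensor stands in for the scaled embedding output here:

pos_enc = Positional_Encoding(dim=512, dropout=0.1)
x = torch.rand(2, 32, 512)      # stands in for WordEmbedding output, [batch, seq_len, d_model]
print(pos_enc(x).shape)         # torch.Size([2, 32, 512])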
3. Multi-head attention
def self_attention(Q, K, V, mask=None, dropout=None):
    # scaled dot-product attention: scores = (QK^T) / sqrt(d_k)
    d_k = Q.size(-1)
    # transpose the last two dims of K, then matrix-multiply
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # positions where mask == 0 get a large negative score, so softmax drives them to ~0
        scores = scores.masked_fill(mask == 0, -1e9)
    self_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        self_attn = dropout(self_attn)
    return torch.matmul(self_attn, V), self_attn
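A small sanity check on random tensors (shapes are arbitrary); each row of attention weights should sum to 1:

Q = K = V = torch.rand(2, 8, 32, 64)     # [batch, heads, seq_len, d_k]
out, attn = self_attention(Q, K, V)
print(out.shape, attn.shape)             # [2, 8, 32, 64] and [2, 8, 32, 32]
print(attn.sum(dim=-1)[0, 0, :5])        # ~1.0 everywhere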
class MultiHeadAttention(nn.Module):
    # multi-head attention
    def __init__(self, head, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % head == 0          # d_model must be divisible by the number of heads
        self.d_k = d_model // head          # dimension of each head
        self.head = head                    # number of heads
        self.d_model = d_model              # model (input) dimension
        # in self-attention Q, K, V come from the same source; each gets its own linear projection
        self.linear_Q = nn.Linear(d_model, d_model)    # projects the input to Q
        self.linear_K = nn.Linear(d_model, d_model)    # projects the input to K
        self.linear_V = nn.Linear(d_model, d_model)    # projects the input to V
        self.linear_out = nn.Linear(d_model, d_model)  # projects the concatenated heads back to d_model
        self.dropout = nn.Dropout(dropout)
        self.attn = None

    def forward(self, Q, K, V, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)        # broadcast the same mask over every head
        # batch size
        n_batch = Q.size(0)
        # split the projections into heads:
        # Q.shape [b, 32, 512] -> after the linear layer and view/transpose -> [b, 8, 32, 64]
        # each batch has 32 tokens of dimension 8*64=512; head=8, -1 lets view infer the sequence
        # length, and each head works on a 64-dimensional slice of the embedding
        Q = self.linear_Q(Q).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
        K = self.linear_K(K).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
        V = self.linear_V(V).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
        x, self.attn = self_attention(Q, K, V, mask, self.dropout)
        # concatenate the heads back together
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.head * self.d_k)
        # final linear projection back to d_model
        return self.linear_out(x)
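Running the layer on a batch-first tensor; for self-attention the same tensor is passed as Q, K and V:

mha = MultiHeadAttention(head=8, d_model=512)
x = torch.rand(2, 32, 512)      # [batch, seq_len, d_model]
print(mha(x, x, x).shape)       # torch.Size([2, 32, 512])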
4. Residual connection and layer normalization
class LayerNorm(nn.Module):
    # LayerNorm implementation
    def __init__(self, feature, eps=1e-6):
        super(LayerNorm, self).__init__()
        # two learnable parameters: a_2 scales, b_2 shifts
        self.a_2 = nn.Parameter(torch.ones(feature))
        self.b_2 = nn.Parameter(torch.zeros(feature))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)
        return self.a_2 * (x - mean) / torch.sqrt(var + self.eps) + self.b_2
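This matches PyTorch's built-in nn.LayerNorm (both use the biased variance), which gives a convenient way to check the implementation:

x = torch.rand(2, 32, 512)
custom = LayerNorm(512)
builtin = nn.LayerNorm(512, eps=1e-6)
print(torch.allclose(custom(x), builtin(x), atol=1e-5))   # True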
class SublayerConnection(nn.Module):
    # residual connection + layer normalization
    # In the Transformer every sub-layer is wrapped with a residual connection and a LayerNorm,
    # which keeps training stable.
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        # dropout randomly zeroes activations during training to reduce overfitting
        self.dropout = nn.Dropout(dropout)
        # LayerNorm normalizes the summed output
        self.layer_norm = LayerNorm(size)

    def forward(self, x, sublayer):
        '''
        :param x: input to the sub-layer (e.g. the input to self-attention)
        :param sublayer: the sub-layer itself, a callable such as lambda x: self_attention(x)
        :return: LayerNorm(x + Dropout(sublayer(x)))
        '''
        return self.layer_norm(x + self.dropout(sublayer(x)))
5. Position-wise feed-forward network
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionWiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        # expand to d_ff, apply ReLU, then project back to d_model
        inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x))))
        output = self.dropout_2(self.w_2(inter))
        return output
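The feed-forward block is also shape-preserving; d_ff=2048 is the value from the original paper, assumed here:

ffn = PositionWiseFeedForward(d_model=512, d_ff=2048)
print(ffn(torch.rand(2, 32, 512)).shape)    # torch.Size([2, 32, 512])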
6. Final output
class WordProbGenerator(nn.Module):
    """
    Text generator: maps the decoder output through a final linear + softmax layer
    into (log) probabilities over the vocabulary.
    """
    def __init__(self, d_model, vocab_size):
        """
        :param d_model: word-vector dimension
        :param vocab_size: vocabulary size
        """
        super(WordProbGenerator, self).__init__()
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        return F.log_softmax(self.linear(x), dim=-1)
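The encoder and decoder below rely on a clones helper that is not defined in these notes; the standard version (as in The Annotated Transformer) deep-copies the given layer n times into an nn.ModuleList:

def clones(module, n):
    # n independent copies of the same module (each with its own parameters)
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])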
7. Assembling the Encoder
class EncoderLayer(nn.Module):
    """
    One encoder layer:
    MultiHeadAttention -> Add & Norm -> Feed Forward -> Add & Norm
    """
    def __init__(self, size, attn, feed_forward, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        # clone two residual connections: one for attention, one for the feed-forward block
        self.sublayer_connection = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        x = self.sublayer_connection[0](x, lambda x: self.attn(x, x, x, mask))
        # pass the module itself, not its output: SublayerConnection calls sublayer(x) internally
        return self.sublayer_connection[1](x, self.feed_forward)
class Encoder(nn.Module):
    """
    Encoder: a stack of n EncoderLayer
    """
    def __init__(self, n, encoder_layer):
        super(Encoder, self).__init__()
        self.encoder_layer = clones(encoder_layer, n)

    def forward(self, x, mask):
        for layer in self.encoder_layer:
            x = layer(x, mask)
        return x
8. Assembling the Decoder
class DecoderLayer(nn.Module):
    """
    One decoder layer:
    Masked MultiHeadAttention -> Add & Norm -> MultiHeadAttention -> Add & Norm -> Feed Forward -> Add & Norm
    """
    def __init__(self, size, attn, feed_forward, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        self.sublayer_connection = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, trg_mask):
        # memory is the encoder output
        # masked self-attention over the target sequence
        x = self.sublayer_connection[0](x, lambda x: self.attn(x, x, x, trg_mask))
        # cross-attention: queries from the decoder, keys/values from the encoder output
        x = self.sublayer_connection[1](x, lambda x: self.attn(x, memory, memory, None))
        return self.sublayer_connection[-1](x, self.feed_forward)
class Decoder(nn.Module):
    """
    Decoder: a stack of n DecoderLayer
    """
    def __init__(self, n, decoder_layer):
        super(Decoder, self).__init__()
        self.decoder_layer = clones(decoder_layer, n)

    def forward(self, x, memory, trg_mask):
        for layer in self.decoder_layer:
            x = layer(x, memory, trg_mask)
        return x
9. Combining Encoder and Decoder into the Transformer
class Transformer(nn.Module):
    def __init__(self, vocab, d_model, d_ff, n_heads, n_layers, dropout, device='cuda'):
        super(Transformer, self).__init__()
        self.vocab = vocab
        self.device = device
        attn = MultiHeadAttention(n_heads, d_model, dropout)
        feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # a single embedding table is shared by source and target (same vocabulary)
        self.trg_embed = WordEmbedding(vocab.n_vocabs, d_model)
        self.pos_embed = Positional_Encoding(d_model, dropout)
        c = copy.deepcopy   # each layer gets its own deep copy of the attention / feed-forward modules
        self.encoder = Encoder(n_layers, EncoderLayer(d_model, c(attn), c(feed_forward), dropout=dropout))
        self.decoder = Decoder(n_layers, DecoderLayer(d_model, c(attn), c(feed_forward), dropout=dropout))
        self.generator = WordProbGenerator(d_model, vocab.n_vocabs)

    def forward(self, src, trg, padding_mask, trg_mask):
        # embed + add positional encoding, then encode, decode, and project to vocabulary log-probs
        src_embed = self.pos_embed(self.trg_embed(src))
        trg_embed = self.pos_embed(self.trg_embed(trg))
        encoding_output = self.encoder(src_embed, padding_mask)
        decoding_output = self.decoder(trg_embed, encoding_output, trg_mask)
        pred = self.generator(decoding_output)
        return pred
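A minimal end-to-end smoke test, assuming a tiny vocab object with an n_vocabs attribute (the attribute name follows the code above); the masks are left as None because, as noted at the top, padding and masking are not handled yet:

class DummyVocab:
    n_vocabs = 1000     # hypothetical vocabulary size

model = Transformer(DummyVocab(), d_model=512, d_ff=2048, n_heads=8,
                    n_layers=6, dropout=0.1, device='cpu')
src = torch.randint(0, 1000, (2, 32))   # [batch, src_len]
trg = torch.randint(0, 1000, (2, 30))   # [batch, trg_len]
pred = model(src, trg, padding_mask=None, trg_mask=None)
print(pred.shape)                        # torch.Size([2, 30, 1000]), log-probs over the vocabulary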