Large Model Basics: Implementing a Transformer from Scratch (1) - CSDN Blog
Large Model Basics: Implementing a Transformer from Scratch (2) - CSDN Blog
Large Model Basics: Implementing a Transformer from Scratch (3) - CSDN Blog
一、Preface
In the previous article we implemented a single EncoderBlock of the Encoder module.
In this article we continue with the remaining components of the Transformer.
二、Encoder
Following the architecture diagram, we just need to stack the corresponding sub-modules: n_layers EncoderBlocks, followed by a final LayerNorm.
from torch import nn, Tensor

# the EncoderBlock implemented in the previous article
from llm_base.block.encoder_block import EncoderBlock
from llm_base.layer_norm.normal_layernorm import LayerNorm


class Encoder(nn.Module):
    def __init__(self,
                 d_model: int,
                 n_layers: int,
                 n_heads: int,
                 d_ff: int,
                 dropout: float = 0.1,
                 norm_first: bool = False) -> None:
        '''
        :param d_model: dimension of embeddings
        :param n_layers: number of encoder blocks
        :param n_heads: number of heads
        :param d_ff: dimension of inner feed-forward network
        :param dropout: dropout ratio. Defaults to 0.1.
        :param norm_first: if True, layer norm is done prior to attention and feedforward operations (Pre-Norm).
                Otherwise it's done after (Post-Norm). Defaults to False.
        '''
        super().__init__()
        # stack n_layers encoder blocks
        self.layers = nn.ModuleList(
            [
                EncoderBlock(d_model, n_heads, d_ff, dropout, norm_first) for _ in range(n_layers)
            ]
        )
        self.norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: Tensor, src_mask: Tensor = None, keep_attentions: bool = False) -> Tensor:
        '''
        :param src: (batch_size, seq_length, d_model)
        :param src_mask: (batch_size, 1, seq_length)
        :param keep_attentions: whether to keep attention weights or not. Defaults to False.
        :return: (batch_size, seq_length, d_model)
        '''
        x = src
        # pass through each encoder block in turn
        for layer in self.layers:
            x = layer(x, src_mask, keep_attentions)
        # final layer norm on the output of the last block
        return self.norm(x)
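As a quick sanity check, here is a minimal, hypothetical usage sketch. The hyperparameter values are arbitrary, the input shapes follow the docstrings above, and it assumes the EncoderBlock and LayerNorm modules from the previous articles are importable:

import torch

# arbitrary, illustrative hyperparameters
d_model, n_layers, n_heads, d_ff = 512, 6, 8, 2048
batch_size, seq_len = 2, 10

encoder = Encoder(d_model, n_layers, n_heads, d_ff)

# in the full model, src would be token embeddings + positional encodings
src = torch.rand(batch_size, seq_len, d_model)
# padding mask with every position marked as valid
src_mask = torch.ones(batch_size, 1, seq_len).bool()

out = encoder(src, src_mask)
print(out.shape)  # torch.Size([2, 10, 512])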
三、Decoder
Compared with the Encoder, the Decoder:
- adds a masked multi-head attention (Masked Multi-Head Attention). The mask prevents the decoder from seeing the tokens after the current position in the target sequence, forcing the model to predict the next token using only the tokens seen so far as context.
- has a (second) multi-head attention that takes the encoder's output as an additional input, namely the Key and Value, while the output of the masked multi-head attention serves as the Query (a minimal sketch of this wiring follows the list).
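Before looking at the full DecoderBlock, here is a minimal sketch of that cross-attention wiring in plain PyTorch: a single head with no linear projections, purely for illustration. Inside the block this is handled by the MultiHeadAttention module from the earlier articles.

import math
import torch

d_model, src_len, tgt_len = 512, 12, 10

decoder_hidden = torch.rand(1, tgt_len, d_model)  # output of the masked self-attention
encoder_memory = torch.rand(1, src_len, d_model)  # output of the encoder stack

q = decoder_hidden       # Query comes from the decoder side
k = v = encoder_memory   # Key and Value come from the encoder output

scores = q @ k.transpose(-2, -1) / math.sqrt(d_model)  # (1, tgt_len, src_len)
weights = torch.softmax(scores, dim=-1)
context = weights @ v                                  # (1, tgt_len, d_model)
print(context.shape)  # torch.Size([1, 10, 512])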
3.1 Decoder Block
from torch import nn, Tensor
from typing import *

# modules implemented in the previous articles
from llm_base.attention.MultiHeadAttention1 import MultiHeadAttention
from llm_base.layer_norm.normal_layernorm import LayerNorm
from llm_base.ffn.PositionWiseFeedForward import PositonWiseFeedForward


class DecoderBlock(nn.Module):
    def __init__(self,
                 d_model: int,
                 n_heads: int,
                 d_ff: int,
                 dropout: float,
                 norm_first: bool = False) -> None:
        '''
        :param d_model: dimension of embeddings
        :param n_heads: number of heads
        :param d_ff: dimension of inner feed-forward network
        :param dropout: dropout ratio
        :param norm_first: if True, layer norm is done prior to attention and feedforward operations (Pre-Norm).
                Otherwise it's done after (Post-Norm). Defaults to False.
        '''
        super().__init__()
        self.norm_first = norm_first

        # masked multi-head attention
        self.masked_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = LayerNorm(d_model)

        # cross multi-head attention
        self.cross_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm2 = LayerNorm(d_model)

        # position-wise feed-forward network
        self.ff = PositonWiseFeedForward(d_model, d_ff, dropout)
        self.norm3 = LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    # masked self-attention sub-layer
    def _self_attention_sub_layer(self, x: Tensor, attn_mask: Tensor, keep_attentions: bool) -> Tensor:
        x = self.masked_attention(x, x, x, attn_mask, keep_attentions)
        return self.dropout1(x)

    # cross-attention sub-layer: Query from the decoder, Key/Value from the encoder memory
    def _cross_attention_sub_layer(self, x: Tensor, encoder_mem: Tensor, attn_mask: Tensor, keep_attentions: bool) -> Tensor:
        x = self.cross_attention(x, encoder_mem, encoder_mem, attn_mask, keep_attentions)
        return self.dropout2(x)

    # feed-forward sub-layer
    def _feedforward_sub_layer(self, x: Tensor) -> Tensor:
        x = self.ff(x)
        return self.dropout3(x)

    def forward(self,
                target_tensor: Tensor,
                encoder_mem: Tensor,
                target_mask: Tensor = None,
                memory_mask: Tensor = None,
                keep_attentions: bool = False) -> Tensor:
        '''
        :param target_tensor: (batch_size, tgt_seq_length, d_model) the (target) sequence to the decoder block.
        :param encoder_mem: (batch_size, src_seq_length, d_model) the sequence from the last layer of the encoder.
        :param target_mask: (batch_size, 1, tgt_seq_length, tgt_seq_length) the mask for the tgt sequence.
        :param memory_mask: (batch_size, 1, 1, src_seq_length) the mask for the memory sequence.
        :param keep_attentions: whether to keep attention weights or not. Defaults to False.
        :return: (batch_size, tgt_seq_length, d_model) output of decoder block
        '''
        # the masked attention weights (batch_size, n_heads, tgt_seq_length, tgt_seq_length)
        # are retained inside the attention module when keep_attentions=True
        x = target_tensor
        if self.norm_first:
            # Pre-Norm: normalize first, then apply the sub-layer and the residual connection
            x = x + self._self_attention_sub_layer(self.norm1(x), target_mask, keep_attentions)
            x = x + self._cross_attention_sub_layer(self.norm2(x), encoder_mem, memory_mask, keep_attentions)
            x = x + self._feedforward_sub_layer(self.norm3(x))
        else:
            # Post-Norm: apply the sub-layer and the residual connection first, then normalize
            x = self.norm1(x + self._self_attention_sub_layer(x, target_mask, keep_attentions))
            x = self.norm2(x + self._cross_attention_sub_layer(x, encoder_mem, memory_mask, keep_attentions))
            x = self.norm3(x + self._feedforward_sub_layer(x))
        return x
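A minimal, hypothetical smoke test of the DecoderBlock, assuming the MultiHeadAttention from the earlier articles accepts boolean masks with the shapes given in the docstring; all hyperparameter values are arbitrary:

import torch

d_model, n_heads, d_ff, dropout = 512, 8, 2048, 0.1
batch_size, src_len, tgt_len = 2, 12, 10

block = DecoderBlock(d_model, n_heads, d_ff, dropout)

target_tensor = torch.rand(batch_size, tgt_len, d_model)
encoder_mem = torch.rand(batch_size, src_len, d_model)

# causal mask for the target, padding mask for the memory (all positions valid here)
target_mask = torch.tril(torch.ones(tgt_len, tgt_len)).bool().expand(batch_size, 1, tgt_len, tgt_len)
memory_mask = torch.ones(batch_size, 1, 1, src_len).bool()

out = block(target_tensor, encoder_mem, target_mask, memory_mask)
print(out.shape)  # torch.Size([2, 10, 512])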
3.2 The Mask for Masked Multi-Head Attention
In masked multi-head attention we want the decoder to see only the current and earlier positions of its input, while future positions are masked out.
Take the sequence "<bos>我很高兴认识你" (<bos> I am very happy to meet you) as an example:
import torch
from torch import Tensor


def make_target_mask(target_tensor: Tensor, pad_idx: int = 0) -> Tensor:
    '''
    make mask tensor for target sequences
    :param target_tensor: (batch_size, seq_length) raw sequences with padding
    :param pad_idx: (int, optional) pad index. Defaults to 0.
    :return: (batch_size, 1, seq_length, seq_length)
    '''
    seq_len = target_tensor.size()[-1]

    # padding mask
    # target_mask (batch_size, 1, 1, seq_length)
    # the two middle dimensions are 1 so that broadcasting can expand them
    # to the required sizes in the operations below
    target_mask = (target_tensor != pad_idx).unsqueeze(1).unsqueeze(2)

    # subsequent (look-ahead) mask
    # subseq_mask (seq_length, seq_length), broadcast to (batch_size, 1, seq_length, seq_length)
    subseq_mask = torch.tril(torch.ones((seq_len, seq_len)).bool())

    # combine the padding mask and the subsequent mask
    target_mask = target_mask & subseq_mask
    return target_mask


if __name__ == '__main__':
    seq_len = 10
    batch_size = 1
    target_tensor = torch.randint(1, 1000, (batch_size, seq_len))
    # simulate padding: the second half of the sequence is padding
    target_tensor[:, int(seq_len / 2):] = 0
    print(target_tensor)
    target_mask = make_target_mask(target_tensor)
    print(target_mask)
    print(target_mask.shape)
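As a concrete illustration, suppose the example sentence <bos>我很高兴认识你 is tokenized character by character into the hypothetical ids below and padded to length 10. The resulting mask is lower triangular over the 8 real tokens, and the two padding columns are False in every row:

# hypothetical token ids for "<bos> 我 很 高 兴 认 识 你", padded with 0 to length 10
target_tensor = torch.tensor([[2, 11, 12, 13, 14, 15, 16, 17, 0, 0]])

target_mask = make_target_mask(target_tensor, pad_idx=0)
print(target_mask.shape)  # torch.Size([1, 1, 10, 10])

# row i answers: which positions may position i attend to?
# e.g. row 0 only sees <bos>, row 7 sees positions 0..7,
# and the last two columns (padding) are False everywhere.
print(target_mask[0, 0].int())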
3.3 Implementing the Decoder Module
from torch import nn, Tensor

from llm_base.block.decoder_block import DecoderBlock
from llm_base.layer_norm.normal_layernorm import LayerNorm


class Decoder(nn.Module):
    def __init__(self,
                 d_model: int,
                 n_layers: int,
                 n_heads: int,
                 d_ff: int,
                 dropout: float = 0.1,
                 norm_first: bool = False) -> None:
        '''
        :param d_model: dimension of embeddings
        :param n_layers: number of decoder blocks
        :param n_heads: number of heads
        :param d_ff: dimension of inner feed-forward network
        :param dropout: dropout ratio. Defaults to 0.1.
        :param norm_first: if True, layer norm is done prior to attention and feedforward operations (Pre-Norm).
                Otherwise it's done after (Post-Norm). Defaults to False.
        '''
        super().__init__()
        # stack n_layers decoder blocks
        self.layers = nn.ModuleList(
            [DecoderBlock(d_model, n_heads, d_ff, dropout, norm_first) for _ in range(n_layers)]
        )
        self.norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                target_tensor: Tensor,
                encoder_memory: Tensor,
                target_mask: Tensor = None,
                memory_mask: Tensor = None,
                keep_attentions: bool = False) -> Tensor:
        '''
        :param target_tensor: (batch_size, tgt_seq_length, d_model) the (target) sequence to the decoder.
        :param encoder_memory: (batch_size, src_seq_length, d_model) the sequence from the last layer of the encoder.
        :param target_mask: (batch_size, 1, tgt_seq_length, tgt_seq_length) the mask for the tgt sequence.
        :param memory_mask: (batch_size, 1, 1, src_seq_length) the mask for the memory sequence.
        :param keep_attentions: whether to keep attention weights or not. Defaults to False.
        :return: (batch_size, tgt_seq_length, d_model) output of the decoder stack (before the final projection to logits)
        '''
        x = target_tensor
        # pass through each decoder block in turn
        for layer in self.layers:
            x = layer(x, encoder_memory, target_mask, memory_mask, keep_attentions)
        # final layer norm on the output of the last block
        x = self.norm(x)
        return x
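Finally, a minimal end-to-end sketch that wires the two stacks together. The hyperparameters are arbitrary; in the complete model the inputs to both stacks would come from token embeddings plus positional encodings, and a final linear layer would project the decoder output to vocabulary logits:

import torch

d_model, n_layers, n_heads, d_ff = 512, 6, 8, 2048
batch_size, src_len, tgt_len = 2, 12, 10

encoder = Encoder(d_model, n_layers, n_heads, d_ff)
decoder = Decoder(d_model, n_layers, n_heads, d_ff)

src = torch.rand(batch_size, src_len, d_model)  # embedded source sequence
tgt = torch.rand(batch_size, tgt_len, d_model)  # embedded (shifted-right) target sequence

src_mask = torch.ones(batch_size, 1, src_len).bool()  # no source padding in this toy example
tgt_mask = torch.tril(torch.ones(tgt_len, tgt_len)).bool().expand(batch_size, 1, tgt_len, tgt_len)
memory_mask = torch.ones(batch_size, 1, 1, src_len).bool()

memory = encoder(src, src_mask)                    # (batch_size, src_len, d_model)
out = decoder(tgt, memory, tgt_mask, memory_mask)  # (batch_size, tgt_len, d_model)
print(out.shape)  # torch.Size([2, 10, 512])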