Building a Transformer Architecture from Scratch
(Figure: overall Transformer architecture diagram)
This article walks through:
1) the input layer's word embedding
2) the input layer's positional encoding
3) the encoder layer's multi-head attention
4) the encoder layer's feed-forward network
1) Input layer: word embedding
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class Embeddings(nn.Module):
    """Embedding layer: maps token ids in the input text to dense vectors."""
    def __init__(self, d_model, vocab):
        # d_model: embedding dimension
        # vocab: vocabulary size
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale the embeddings by sqrt(d_model), as in the original Transformer paper
        return self.lut(x) * math.sqrt(self.d_model)
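A quick shape check, as a minimal sketch; the vocabulary size (1000) and d_model (512) below are illustrative only:

# Minimal usage sketch; vocab size and d_model are illustrative
emb = Embeddings(d_model=512, vocab=1000)
tokens = torch.randint(0, 1000, (2, 10))     # (batch=2, seq_len=10) token ids
print(emb(tokens).shape)                     # torch.Size([2, 10, 512])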
2) Input layer: positional encoding
class PositionalEncoding(nn.Module):
    """Positional encoding"""
    def __init__(self, d_model, pad_size=5000):
        # d_model: embedding dimension
        # pad_size: maximum sequence length supported
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.pad_size = pad_size
        pe = torch.zeros(pad_size, d_model)
        for t in range(pad_size):
            for i in range(d_model // 2):
                angle_rate = 1 / (10000 ** (2 * i / d_model))
                pe[t, 2 * i] = math.sin(t * angle_rate)
                pe[t, 2 * i + 1] = math.cos(t * angle_rate)
        # # Vectorized equivalent of the double loop:
        # position = torch.arange(0, pad_size).unsqueeze(1).float()
        # div_term = torch.exp(torch.arange(0, d_model, 2).float()
        #                      * -(math.log(10000.0) / d_model))
        # pe[:, 0::2] = torch.sin(position * div_term)
        # pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension so pe broadcasts over batched inputs
        pe = pe.unsqueeze(0)
        # Register the positional-encoding matrix as a buffer: a buffer is not a
        # model parameter and is never updated by the optimizer, but it is saved
        # in the state dict and reloaded together with the model's parameters
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Positional encodings are fixed; the buffer does not require grad, so no
        # gradient flows through it (the deprecated Variable wrapper is unnecessary)
        x = x + self.pe[:, :x.size(1)]
        return x
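A minimal sketch chaining the two input-layer modules (the sizes are illustrative); note that adding the positional encoding leaves the shape unchanged:

# Sketch: token ids -> scaled embeddings -> embeddings + positional encoding
emb = Embeddings(d_model=512, vocab=1000)
pos = PositionalEncoding(d_model=512)
x = pos(emb(torch.randint(0, 1000, (2, 10))))
print(x.shape)                               # torch.Size([2, 10, 512]), unchanged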
3) Encoder layer: multi-head attention
Three building blocks come first: the scaled dot-product attention function, a module-cloning helper, and the sublayer connection.
def attention(q, k, v, dropout=None, mask=None):
    # Scaled dot-product attention:
    # Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V
    # d_k: dimension of the query/key vectors (the last axis)
    d_k = q.shape[-1]
    score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Push masked positions to a large negative value so softmax zeroes them out
        score = score.masked_fill(mask == 0, -1e6)
    score = F.softmax(score, dim=-1)
    if dropout is not None:
        score = dropout(score)
    return torch.matmul(score, v), score
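A small sanity check, assuming single-head self-attention over a length-4 sequence whose last position is padding (all tensors below are illustrative):

# Sketch: scaled dot-product attention with a padding mask
q = k = v = torch.randn(1, 4, 8)             # (batch, seq_len, d_k)
mask = torch.tensor([[1, 1, 1, 0]])          # 0 marks the padded position
mask = mask.unsqueeze(1)                     # (1, 1, 4): broadcasts over all queries
out, attn = attention(q, k, v, mask=mask)
print(out.shape, attn.shape)                 # (1, 4, 8) (1, 4, 4)
print(attn[0, :, -1])                        # ~0 everywhere: padding gets no weight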
def clones(module, N):
    """
    :param module: the network module to copy
    :param N: number of copies
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
class SublayerConnection(nn.Module):
    """Sublayer connection: applies whatever sublayer (a callable) is passed in.
    In the encoder the sublayer is multi-head attention or the feed-forward network;
    in the decoder it can also be masked multi-head attention.
    Processing order: layer norm -> sublayer (attention / feed-forward) -> dropout -> residual add."""
    def __init__(self, d_k, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # Normalize first, then let the concrete sublayer process the result
        out = sublayer(self.norm(x))
        out = self.dropout(out)
        # Residual connection
        return x + out
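Note the ordering here: LayerNorm is applied before the sublayer (the pre-norm variant), whereas the original paper normalizes after the residual add (post-norm). Both appear in practice; pre-norm tends to train more stably in deep stacks. Either way, the sublayer must preserve the input shape so the residual addition is valid.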
Multi-head attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention"""
    def __init__(self, d_k, head_num, dropout=0.0):
        # d_k: model (embedding) dimension; head_num: number of attention heads
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.head_num = head_num
        assert d_k % head_num == 0
        self.head_dim = d_k // head_num
        self.dropout = nn.Dropout(p=dropout)
        # Deep-copy 4 linear layers: three project Q, K and V, and the last
        # one is the output projection back to the model dimension
        self.linears = clones(nn.Linear(d_k, d_k), 4)
        self.attn = None

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # Insert a head dimension so the same mask broadcasts over all heads
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
        # The three linear layers project the inputs into the latent space,
        # then each result is split into head_num heads of size head_dim
        query, key, value = \
            [model(x).view(batch_size, -1, self.head_num, self.head_dim).transpose(1, 2) for model, x in
             zip(self.linears, (query, key, value))]
        score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
        # Merge the heads back: (batch, head, seq, head_dim) -> (batch, seq, d_k)
        score = score.transpose(1, 2).contiguous().view(batch_size, -1, self.head_dim * self.head_num)
        return self.linears[-1](score)
    # An alternative forward that folds the heads into the batch dimension;
    # some readers find this version's bookkeeping easier to follow.
    # def forward2(self, query, key, value, mask=None):
    #     batch_size = query.size(0)
    #     # Project, split into heads, then fold the heads into the batch dimension
    #     query, key, value = \
    #         [model(x).view(batch_size, -1, self.head_num, self.head_dim)
    #              .transpose(1, 2).reshape(batch_size * self.head_num, -1, self.head_dim)
    #          for model, x in zip(self.linears, (query, key, value))]
    #     if mask is not None:
    #         # Repeat the mask once per head so it lines up with the folded batch
    #         mask = mask.repeat_interleave(self.head_num, dim=0)
    #     score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
    #     # Unfold the heads and concatenate them along the feature dimension
    #     score = score.view(batch_size, self.head_num, -1, self.head_dim) \
    #         .transpose(1, 2).contiguous().view(batch_size, -1, self.head_num * self.head_dim)
    #     return self.linears[-1](score)
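A shape check for the multi-head module, as a minimal sketch; the 8 heads over d_model = 512 follow the common base configuration and are illustrative:

# Sketch: self-attention with 8 heads; the output keeps the input shape
mha = MultiHeadAttention(d_k=512, head_num=8)
x = torch.randn(2, 10, 512)                  # (batch, seq_len, d_model)
pad_mask = torch.ones(2, 1, 10)              # (batch, 1, seq_len); 1 = keep
print(mha(x, x, x, mask=pad_mask).shape)     # torch.Size([2, 10, 512])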
Feed-forward network
class PositionalWiseFeedForward(nn.Module):
    """Position-wise feed-forward network"""
    def __init__(self, d_k, hidden_size, dropout=0.1):
        # d_k: model dimension; hidden_size: inner-layer dimension
        super(PositionalWiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_k, hidden_size)
        self.w2 = nn.Linear(hidden_size, d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        # FFN(x) = W2(dropout(ReLU(W1 x))), applied independently at each position
        out = self.w1(x)
        out = F.relu(out)
        out = self.dropout(out)
        return self.w2(out)
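The inner dimension is usually larger than the model dimension (2048 vs. 512 in the original paper); a quick sketch with those illustrative sizes:

# Sketch: the FFN expands to hidden_size and projects back, preserving shape
ffn = PositionalWiseFeedForward(d_k=512, hidden_size=2048)
x = torch.randn(2, 10, 512)
print(ffn(x).shape)                          # torch.Size([2, 10, 512])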
Encoder layer
class EncoderLayer(nn.Module):
    """Encoder layer: assembles multi-head attention and the feed-forward
    network with two sublayer connections."""
    def __init__(self, d_k, attn, feed_forward, dropout):
        """
        attn: a multi-head attention instance
        feed_forward: a feed-forward network instance
        dropout: dropout probability
        """
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        # Clone 2 sublayer connections; which sublayer each one wraps
        # (attention or feed-forward) is decided at call time
        self.sublayer = clones(SublayerConnection(d_k, dropout), 2)
        # Keep the model dimension around for later use (e.g. by the encoder's norm)
        self.size = d_k

    def forward(self, x, mask):
        """Multi-head self-attention first, then the feed-forward network:
        the processing order of a Transformer encoder layer."""
        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
Encoder
class Encoder(nn.Module):
    """The encoder: a stack of N EncoderLayer modules."""
    def __init__(self, encoder_layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(encoder_layer, N)
        # Final layer norm over the model dimension (encoder_layer.size);
        # a hand-rolled LayerNorm could be substituted here
        self.norm = nn.LayerNorm(encoder_layer.size)

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
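Putting it all together, here is a minimal sketch that assembles the pieces above into a working encoder; the hyperparameters (d_model = 512, 8 heads, hidden size 2048, N = 6) follow the base configuration of the original paper and are illustrative:

# Sketch: assemble embedding -> positional encoding -> N stacked encoder layers
d_model, head_num, hidden_size, N, vocab = 512, 8, 2048, 6, 1000

embed = nn.Sequential(Embeddings(d_model, vocab), PositionalEncoding(d_model))
layer = EncoderLayer(d_model,
                     MultiHeadAttention(d_model, head_num),
                     PositionalWiseFeedForward(d_model, hidden_size),
                     dropout=0.1)
encoder = Encoder(layer, N)                  # clones() deep-copies the layer N times

tokens = torch.randint(0, vocab, (2, 10))    # (batch, seq_len) token ids
memory = encoder(embed(tokens))              # the encoder output ("memory")
print(memory.shape)                          # torch.Size([2, 10, 512])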