自注意力
key query values怎么选
n是序列长度 d是输入输出通道数
CNN:一串序列,可以认为是一维的数据。k是窗口大小。因为是一维数据,所以可以n个数据同时计算,并行度O(n)。最长路径,当kernel足够大时,一眼就能看到目标,O(n/k),k越大,路径越短
RNN:计算复杂度对n来说是线性关系。只能1个1个来,所以并行度为O(1)。且最长路径O(n),要每个都过去。强时序模型,在解码的时候,可以回溯到编码器,记忆力很好,很擅长记忆序列
自注意力:计算复杂度对n来说是平方。因为最长路径是O(1),它能看得很远,适合大数据模型
在处理词元序列时,循环神经网络是逐个的重复地处理词元的,而自注意力则因为并行计算而放弃了顺序操作。为了使用序列的顺序信息,我们通过在输入表示中添加位置编码(positional encoding)来注入绝对的或相对的位置信息。位置编码可以通过学习得到也可以直接固定得到。下面使用基于正弦函数和余弦函数的固定位置编码
import torch
import d2l.torch
from torch import nn
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding.

    Injects absolute position information into token representations:
        P[pos, 2j]   = sin(pos / 10000^(2j / num_hiddens))
        P[pos, 2j+1] = cos(pos / 10000^(2j / num_hiddens))

    Args:
        num_hiddens: embedding dimension of the inputs.
        dropout: dropout rate applied after adding the encoding.
        max_len: longest sequence length supported by the precomputed table.
    """
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Precompute a sufficiently long encoding matrix P of shape
        # (1, max_len, num_hiddens); the leading 1 broadcasts over the batch.
        self.P = torch.zeros(size=(1, max_len, num_hiddens))
        # BUG FIX: the base of the frequency term is 10000 (as in the original
        # sinusoidal-encoding formula), not 1000.
        X = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1) / torch.pow(
            10000, torch.arange(0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        # Slice X so an odd num_hiddens no longer raises a shape mismatch:
        # the odd-indexed columns hold floor(num_hiddens / 2) entries, one
        # fewer than X's column count when num_hiddens is odd.
        self.P[:, :, 1::2] = torch.cos(X[:, :num_hiddens // 2])
    def forward(self, X):
        """Add positional encodings to X of shape (batch, seq_len, num_hiddens)."""
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)
# Demo: plot a few encoding columns and show the whole matrix as a heatmap.
encoding_dim, num_steps = 32, 60
pos_encoding = PositionalEncoding(encoding_dim, 0)
pos_encoding.eval()
encoded = pos_encoding(torch.zeros(size=(1, num_steps, encoding_dim)))
pe_matrix = pos_encoding.P[:, :encoded.shape[1], :]
d2l.torch.plot(torch.arange(num_steps), Y=pe_matrix[0, :, 6:10].T, xlabel='Row (position)',
               figsize=(6, 2.5), legend=['Column %d' % d for d in torch.arange(6, 10)])
# show_heatmaps expects a 4-D tensor, hence the two added singleton axes.
pe_grid = pe_matrix[0, :, :].unsqueeze(0).unsqueeze(0)
d2l.torch.show_heatmaps(pe_grid, xlabel='Column(encoding dimension)', ylabel='Row (position)', figsize=(3.5, 4),
                        cmap='Blues')
transformer
同一组query/key/value分别经过多个注意力头,最后把各头的输出连接起来,再经过一个线性变换得到输出
当给定相同的查询、键和值的集合时, 希望模型可以基于相同的注意力机制学习到不同的行为, 然后将不同的行为作为知识组合起来, 捕获序列内各种范围的依赖关系 (例如短距离依赖和长距离依赖关系)。 因此允许注意力机制组合使用查询、键和值的不同子空间表示(representation subspaces)可能是有益的。
为此与其只使用单独一个注意力汇聚, 我们可以用独立学习得到的 ℎ 组不同的线性投影(linear projections)来变换查询、键和值。 然后,这 ℎ 组变换后的查询、键和值将并行地送到注意力汇聚中。 最后,将这 ℎ 个注意力汇聚的输出拼接在一起, 并且通过另一个可以学习的线性投影进行变换, 以产生最终输出,这种设计被称为多头注意力(multihead attention)。 对于 ℎ 个注意力汇聚输出,每一个注意力汇聚都被称作一个头(head)
基于位置的前馈网络对序列中的所有位置的表示进行变换时使用的是同一个多层感知机(MLP),这就是称前馈网络是基于位置的(positionwise)的原因。在下面的实现中,输入X的形状(批量大小,时间步数或序列长度,隐单元数或特征维度)将被一个两层的感知机转换成形状为(批量大小,时间步数,ffn_num_outputs)的输出张量。
import math
import d2l.torch
import torch
from torch import nn
import pandas as pd
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network.

    The same two-layer MLP is applied at every sequence position, which is
    what makes the transform "position-wise": nn.Linear acts on the last
    dimension only, so positions are processed independently.
    """
    def __init__(self, ffn_num_inputs, ffn_num_hiddens, ffn_num_outputs):
        super(PositionWiseFFN, self).__init__()
        self.dense1 = nn.Linear(ffn_num_inputs, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)
    def forward(self, X):
        """(batch, steps, ffn_num_inputs) -> (batch, steps, ffn_num_outputs)."""
        hidden = self.relu(self.dense1(X))
        return self.dense2(hidden)
# Sanity check: the position-wise FFN maps a (2, 3, 4) input to (2, 3, 8).
ffn_demo = PositionWiseFFN(4, 4, 8)
ffn_demo.eval()
ffn_demo(torch.ones(size=(2, 3, 4)))[0]
# Contrast the two normalizations:
#   LayerNorm standardizes each ROW (all features of one sample) to mean 0, var 1;
#   BatchNorm1d standardizes each COLUMN (one feature across the batch) instead.
ln = nn.LayerNorm(3)
bn = nn.BatchNorm1d(3)
norm_input = torch.tensor([[1, 2, 3], [8, 9, 10], [15, 16, 17]], dtype=torch.float32)
# Both modules are in training mode here, so statistics come from norm_input itself.
print('layer_norm :', ln(norm_input), '\nbatch_norm : ', bn(norm_input))
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Computes LayerNorm(X + Dropout(Y)), where X is the sublayer input and Y
    is the sublayer output.
    """
    def __init__(self, normalized_shape, dropout):
        super(AddNorm, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(normalized_shape=normalized_shape)
    def forward(self, X, Y):
        residual = self.dropout(Y) + X
        return self.layer_norm(residual)
# Shape check: AddNorm preserves the (2, 3, 4) input shape.
add_norm = AddNorm(normalized_shape=[3, 4], dropout=0.5)
add_norm.eval()
add_norm(torch.ones(size=(2, 3, 4)), torch.ones(size=(2, 3, 4)))
class EncoderBlock(nn.Module):
    """One Transformer encoder block.

    Multi-head self-attention followed by a position-wise FFN, each wrapped
    in a residual Add & Norm. Shape-preserving on (batch, steps, num_hiddens).
    """
    def __init__(self, query_size, key_size, value_size, num_hiddens, normalized_shape, ffn_num_inputs, ffn_num_hiddens,
                 num_heads, dropout, use_bias=False):
        super(EncoderBlock, self).__init__()
        # Attribute name kept: other code reads .multihead_attention to
        # collect attention weights for visualization.
        self.multihead_attention = d2l.torch.MultiHeadAttention(key_size, query_size, value_size, num_hiddens,
                                                                num_heads, dropout, use_bias)
        self.addnorm1 = AddNorm(normalized_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_inputs, ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(normalized_shape, dropout)
    def forward(self, X, valid_lens):
        # Self-attention: queries, keys and values are all X.
        attn_out = self.multihead_attention(X, X, X, valid_lens)
        normed = self.addnorm1(X, attn_out)
        return self.addnorm2(normed, self.ffn(normed))
# Smoke test of the encoder block on a (2, 100, 24) batch.
# valid_lens = [3, 2]: in the first sequence only the first 3 of the 100
# positions are real tokens (the rest are padding); in the second, only the
# first 2 are real.
X = torch.ones(size=(2, 100, 24))
valid_lens = torch.tensor([3, 2])
encoder_block = EncoderBlock(query_size=24, key_size=24, value_size=24, num_hiddens=24, normalized_shape=[100, 24],
                             ffn_num_inputs=24, ffn_num_hiddens=48, num_heads=8, dropout=0.5, use_bias=False)
encoder_block.eval()
encoder_block(X, valid_lens).shape
class TransformerEncoder(d2l.torch.Encoder):
    """Transformer encoder: token embedding + positional encoding followed
    by a stack of num_layers EncoderBlocks."""
    def __init__(self, vocab_size, query_size, key_size, value_size, num_hiddens, normalized_shape, ffn_num_inputs,
                 ffn_num_hiddens, num_heads, num_layers, dropout, use_bias=False):
        super(TransformerEncoder, self).__init__()
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.positionalEncoding = d2l.torch.PositionalEncoding(num_hiddens, dropout)
        self.encoder_blocks = nn.Sequential()
        for i in range(num_layers):
            block = EncoderBlock(query_size, key_size, value_size, num_hiddens, normalized_shape,
                                 ffn_num_inputs, ffn_num_hiddens, num_heads, dropout, use_bias=use_bias)
            self.encoder_blocks.add_module(f'encoder_block{i}', block)
    def forward(self, X, valid_lens, *args):
        # The positional encodings lie in [-1, 1], so the embeddings are
        # rescaled by sqrt(num_hiddens) before the two are summed.
        X = self.positionalEncoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        # Collect every layer's attention weights for later visualization.
        self.attention_weights = []
        for encoder_block in self.encoder_blocks:
            X = encoder_block(X, valid_lens)
            self.attention_weights.append(encoder_block.multihead_attention.attention.attention_weights)
        return X
# Smoke test: a 2-layer encoder maps (2, 100) token ids to (2, 100, 24).
transformer_encoder = TransformerEncoder(200, 24, 24, 24, 24, [100, 24], 24, 48, 8, 2, 0.5, use_bias=False)
transformer_encoder.eval()
transformer_encoder(torch.ones(size=(2, 100), dtype=torch.long), valid_lens).shape
class DecoderBlock(nn.Module):
    """The i-th Transformer decoder block.

    Pipeline: masked self-attention -> Add & Norm -> encoder-decoder
    attention -> Add & Norm -> position-wise FFN -> Add & Norm.
    """
    def __init__(self, query_size, key_size, value_size, num_hiddens, normalized_shape, ffn_num_inputs, ffn_num_hiddens,
                 num_heads, dropout, i, use_bias=False):
        super(DecoderBlock, self).__init__()
        # Index of this block in the decoder stack; selects its cache slot in state[2].
        self.i = i
        self.mask_multihead_attention1 = d2l.torch.MultiHeadAttention(key_size, query_size, value_size, num_hiddens,
                                                                      num_heads, dropout, bias=use_bias)
        self.addnorm1 = AddNorm(normalized_shape, dropout)
        # NOTE: the original (misspelled) attribute name is kept because other
        # code in this file reads it for attention-weight visualization.
        self.mutilhead_attention2 = d2l.torch.MultiHeadAttention(key_size, query_size, value_size, num_hiddens,
                                                                 num_heads, dropout, bias=use_bias)
        self.addnorm2 = AddNorm(normalized_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_inputs, ffn_num_hiddens, num_hiddens)
        self.addnorm3 = AddNorm(normalized_shape, dropout)
    def forward(self, X, state):
        """state = [enc_outputs, enc_valid_lens, per-block key/value caches]."""
        enc_outputs, enc_valid_lens = state[0], state[1]
        # Training processes all target positions at once and init_state() is
        # called for every batch, so state[2][self.i] is None. Prediction
        # decodes one token at a time WITHOUT re-calling init_state(), so the
        # cache accumulates every representation this block has decoded so far.
        cache = state[2][self.i]
        keys_values = X if cache is None else torch.cat([cache, X], dim=1)
        state[2][self.i] = keys_values
        if self.training:
            batch_size, num_step, _ = X.shape
            # Causal mask: the query at position t may only attend to keys
            # 1..t — future target tokens are visible during training but must
            # not be used. dec_valid_lens: (batch_size, num_steps), each row
            # is [1, 2, ..., num_steps].
            dec_valid_lens = torch.arange(1, num_step + 1, device=X.device).repeat(batch_size, 1)
        else:
            # At prediction time only already-generated tokens exist, so no
            # mask is needed.
            dec_valid_lens = None
        # Masked multi-head self-attention over the cached keys/values.
        self_attn = self.mask_multihead_attention1(X, keys_values, keys_values, dec_valid_lens)
        Y = self.addnorm1(X, self_attn)
        # Encoder-decoder attention; enc_outputs: (batch_size, num_steps, num_hiddens).
        cross_attn = self.mutilhead_attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, cross_attn)
        return self.addnorm3(Z, self.ffn(Z)), state
# Smoke test: reuse the encoder block's output as the "encoder outputs".
X = torch.ones(size=(2, 100, 24))
decoder_block = DecoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5, 0, use_bias=False)
decoder_block.eval()
state = [encoder_block(X, valid_lens), valid_lens, [None]]
decoder_block(X, state)[0].shape
class TransformerDecoder(d2l.torch.Decoder):
    """Transformer decoder: embedding + positional encoding, a stack of
    num_layers DecoderBlocks, and a final projection onto the vocabulary."""
    def __init__(self, vocab_size, query_size, key_size, value_size, num_hiddens, normalized_shape, ffn_num_inputs,
                 ffn_num_hiddens, num_heads, num_layers, dropout, use_bias=False):
        super(TransformerDecoder, self).__init__()
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.positionalEncoding = d2l.torch.PositionalEncoding(num_hiddens, dropout)
        self.decoder_blocks = nn.Sequential()
        for i in range(num_layers):
            block = DecoderBlock(query_size, key_size, value_size, num_hiddens, normalized_shape,
                                 ffn_num_inputs, ffn_num_hiddens, num_heads, dropout, i, use_bias=use_bias)
            self.decoder_blocks.add_module(f'decoder_block{i}', block)
        self.dense = nn.Linear(num_hiddens, vocab_size)
    def init_state(self, enc_outputs, enc_valid_lens, *args):
        """Fresh state: [encoder outputs, encoder valid lengths, one empty
        key/value cache per decoder block]."""
        return [enc_outputs, enc_valid_lens, [None] * self.num_layers]
    def forward(self, X, state):
        # Scale embeddings by sqrt(num_hiddens) before adding the positional
        # encodings (which lie in [-1, 1]).
        X = self.positionalEncoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self_attn_maps = []
        cross_attn_maps = []
        self._attention_weights = [self_attn_maps, cross_attn_maps]
        for decoder_block in self.decoder_blocks:
            X, state = decoder_block(X, state)
            # Record both attention maps of every block for visualization.
            self_attn_maps.append(decoder_block.mask_multihead_attention1.attention.attention_weights)
            cross_attn_maps.append(decoder_block.mutilhead_attention2.attention.attention_weights)
        return self.dense(X), state
    @property
    def attention_weights(self):
        """[decoder self-attention weights, encoder-decoder attention weights]."""
        return self._attention_weights
import os
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence.

    Local variant of d2l's trainer that wraps ``net`` in ``nn.DataParallel``.
    NOTE(review): this assumes ``device`` is a list of GPU ids — it indexes
    ``device[0]`` — unlike d2l's version, which takes a single device.
    Confirm at the call site (the script below calls d2l's version instead).

    Defined in :numref:`sec_seq2seq_decoder`"""
    def xavier_init_weights(m):
        # Xavier-initialize Linear and GRU weight matrices before training.
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    net.apply(xavier_init_weights)
    #net.to(device)
    #net = net.to(device[0])
    print(device)
    gpu0 = torch.device(device[0])
    # Replicate the model across all listed GPUs; batches are scattered along
    # dim 0 and results gathered back on gpu0.
    net = nn.DataParallel(module=net, device_ids=device)
    net = net.to(gpu0)
    #net = nn.parallel.DistributedDataParallel(module=net,device_ids=device,broadcast_buffers=False)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    # Masked loss: padding positions beyond Y_valid_len contribute nothing.
    loss = d2l.torch.MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.torch.Animator(xlabel='epoch', ylabel='loss',
                                  xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.torch.Timer()
        metric = d2l.torch.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(gpu0) for x in batch]
            # Prepend <bos> and drop the last target token: the decoder input
            # at step t is the ground-truth token t-1.
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=gpu0).reshape(-1, 1)
            #print(X.device,Y.device,X_valid_len.device,Y_valid_len.device,bos.device)
            dec_input = d2l.torch.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
            dec_input = dec_input.to(gpu0)
            #print(dec_input.device)
            #net.cuda()
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.torch.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            # Plot average per-token loss every 10 epochs.
            animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
# Hyperparameters for the English -> French translation experiment.
batch_size, num_steps = 64, 10
query_size, key_size, value_size, num_hiddens = 32, 32, 32, 32
normalized_shape = [32]
ffn_num_inputs, ffn_num_hiddens = 32, 64
num_heads, num_layers, dropout = 4, 2, 0.1
use_bias = False
lr, num_epochs, device = 0.005, 300, d2l.torch.try_gpu()
train_iter, src_vocab, tgt_vocab = d2l.torch.load_data_nmt(batch_size, num_steps)
transformer_encoder = TransformerEncoder(len(src_vocab), query_size, key_size, value_size, num_hiddens,
                                         normalized_shape, ffn_num_inputs, ffn_num_hiddens, num_heads, num_layers,
                                         dropout, use_bias=use_bias)
transformer_decoder = TransformerDecoder(len(tgt_vocab), query_size, key_size, value_size, num_hiddens,
                                         normalized_shape, ffn_num_inputs, ffn_num_hiddens, num_heads, num_layers,
                                         dropout, use_bias=use_bias)
net = d2l.torch.EncoderDecoder(transformer_encoder, transformer_decoder)
# Train with d2l's single-device trainer (the local DataParallel variant
# defined above is kept for experiments but not used here).
d2l.torch.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
#train_seq2seq(net,train_iter,lr,num_epochs,tgt_vocab,device)
# engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
# fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .', 'hi']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .', 'salut !']
# Translate each sentence and report its BLEU score against the reference.
for eng, fra in zip(engs, fras):
    translation, dec_attention_weight_seq = d2l.torch.predict_seq2seq(net, eng, src_vocab, tgt_vocab, num_steps, device,
                                                                      save_attention_weights=True)
    print(f'eng:{eng}==>', f'translation:{translation},', f'BLEU score:{d2l.torch.bleu(translation, fra, k=2)}')
# Stack the per-layer encoder attention weights into
# (num_layers, num_heads, queries, keys) for visualization.
enc_attention_weights = torch.cat(net.encoder.attention_weights, dim=0).reshape((num_layers, num_heads, -1, num_steps))
enc_attention_weights.shape
d2l.torch.show_heatmaps(enc_attention_weights.cpu(), xlabel='Keys Positions', ylabel='Query Positions',
                        titles=['Head %d' % i for i in range(1, 5)], figsize=(7, 3.5))
'''
#dec_attention_weights_2d里面的元素为:每一步每个层每一个多头注意力每个头对相应key-value的注意力权重[torch.tensor([1])(预测时只有1个key-value),torch.tensor([10])(10个key-value),torch.tensor([3])(3个key-value)]
dec_attention_weights_2d = [head[0].tolist()
for step in dec_attention_weight_seq
for attn in step for blk in attn for head in blk]
#dec_attention_weights_filled:由于dec_attention_weights_2d中的元素(key-value的注意力权重)tensor形状不同,有torch.tensor([1]),torch.tensor([10]),torch.tensor([3])等,因此需要将这些tensor弄成形状大小相同的tensor,填充的部分值为0
dec_attention_weights_filled = torch.tensor(
pd.DataFrame(dec_attention_weights_2d).fillna(0.0).values)
print(dec_attention_weights_filled.shape)
#dec_attention_weight_seq:表示预测所有步(第一步,第二步,第三步等)所得到的权重组合在一起(为一个list)
#step:表示预测第一步的词元所得到的注意力权重
#attn:表示模型所有层(2层)中第一个多头注意力的权重(也即是每层第一个多头注意力权重组合在一起)
#blk:表示模型第一个多头注意力在第一个层的多头注意力权重(也即是第一层第一个多头注意力权重),形状大小为:torch.Size([4, 1, 1])
#head:表示模型在第一层第一个多头注意力中第一个头的注意力权重,形状大小为:torch.Size([1, 1])
#head[0]:表示模型在第一层第一个多头注意力中第一个头的注意力权重中第一个query对于key_value的权重,形状大小为:torch.Size([1])
for step in dec_attention_weight_seq:
for attn in step:
print(len(attn))
for blk in attn:
print(blk.shape)
for head in blk:
print(head.shape)
#torch.Size([3, 2, 2, 4, 10])表示:预测的每一个词元(有3个预测词元),2表示每一层有两个多头attention,num_layers=2表示有两层decoder-block块,num_heads=4表示每个多头注意力有4个头,num_steps=10表示有10个key-value,对于10个key-value计算得到的10个注意力权重
dec_attention_weights = dec_attention_weights_filled.reshape((-1, 2, num_layers, num_heads, num_steps))
print(dec_attention_weights.shape)
'''
# head[0] is the single query's weight row: prediction decodes one token at a
# time, so every attention map has exactly one query.
dec_attention_weights_2d = [head[0].tolist()
                            for step in dec_attention_weight_seq
                            for attn in step
                            for blk in attn
                            for head in blk]
# Rows have different lengths (1, 2, ... cached key-values per decoding step);
# pad the ragged rows with zeros via a DataFrame.
dec_attention_weights_filled = torch.tensor(pd.DataFrame(dec_attention_weights_2d).fillna(0.0).values)
# (decoded steps, 2 attention types per layer, num_layers, num_heads, num_steps keys)
dec_attention_weights = dec_attention_weights_filled.reshape((-1, 2, num_layers, num_heads, num_steps))
# Move the attention-type axis first so the tensor unpacks into the
# self-attention and encoder-decoder attention maps.
dec_self_attention_weights, dec_inter_attention_weights = dec_attention_weights.permute(1, 2, 3, 0, 4)
dec_self_attention_weights.shape, dec_inter_attention_weights.shape
# Only the first len(translation)+1 query positions carry real predictions.
d2l.torch.show_heatmaps(dec_self_attention_weights[:, :, :, :len(translation.split()) + 1], xlabel='Keys Position',
                        ylabel='Queries Position', titles=['Head %d' % i for i in range(1, 5)], figsize=(7, 3.5))
d2l.torch.show_heatmaps(dec_inter_attention_weights, xlabel='Keys Position', ylabel='Queries Position',
                        titles=['Head %d' % i for i in range(1, 5)], figsize=(7, 3.5))