简介
Pytorch中 nn.Transformer的使用详解与Transformer的黑盒讲解_iioSnail的博客-CSDN博客
代码
例子1
#coding=utf-8
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
"""
https://blog.csdn.net/zhaohongfei_358/article/details/126019181
一些细节:
https://blog.csdn.net/zhaohongfei_358/article/details/122861751
https://zhuanlan.zhihu.com/p/360343417
https://zhuanlan.zhihu.com/p/389183195
https://zhuanlan.zhihu.com/p/398039366?utm_medium=social&utm_oi=629375409599549440
http://nlp.seas.harvard.edu/2018/04/03/attention.html
https://mp.weixin.qq.com/s/cY0IkHTpxS6x6cqsueXZIg
"""
"""
自注意力机制
"""
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention over a batch of sequences."""

    def __init__(self, input_vector_dim: int, dim_k=None, dim_v=None):
        """Create the three bias-free projection layers.

        Args:
            input_vector_dim: Size of each input vector (e.g. 10 if every
                word is encoded as a 10-dimensional vector).
            dim_k: Output size of the query and key projections. Falls back
                to ``input_vector_dim`` when ``None``.
            dim_v: Output size of the value projection, i.e. the size of each
                output vector. Falls back to ``input_vector_dim`` when ``None``.
        """
        super(SelfAttention, self).__init__()
        self.input_vector_dim = input_vector_dim

        # Missing projection sizes default to the input size.
        dim_k = input_vector_dim if dim_k is None else dim_k
        dim_v = input_vector_dim if dim_v is None else dim_v

        # Trainable matrices are expressed as linear layers so that autograd
        # and the optimizer handle them like any other parameter.
        self.W_q = nn.Linear(input_vector_dim, dim_k, bias=False)
        self.W_k = nn.Linear(input_vector_dim, dim_k, bias=False)
        self.W_v = nn.Linear(input_vector_dim, dim_v, bias=False)

        # Scaling factor 1 / sqrt(d_k) applied to the raw attention scores.
        self._norm_fact = 1 / np.sqrt(dim_k)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Tensor of size (batch_size, input_num, input_vector_dim).

        Returns:
            Tensor of size (batch_size, input_num, dim_v).
        """
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)

        # Swap the last two axes of the keys so that the batched product
        # below yields one (input_num, input_num) score matrix per sample.
        keys_t = keys.transpose(1, 2)
        scores = torch.bmm(queries, keys_t) * self._norm_fact
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the value vectors.
        return torch.bmm(weights, values)
def test_SelfAttention():
    """Smoke-test SelfAttention and print the size of its output.

    A batch of 50 samples, each holding 5 vectors of 128 dimensions, is
    encoded into 5 vectors of 64 dimensions per sample.
    """
    model = SelfAttention(128, 32, 64)
    # torch.rand gives well-defined values; torch.Tensor(50, 5, 128) would
    # only allocate uninitialised memory, which may contain NaN or inf.
    x = torch.rand(50, 5, 128)
    y = model(x)
    print(y.size())
    print("test end.")
"""
多头自注意力机制
"""
def attention(query, key, value):
    """Compute scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V.

    The arguments are already-projected Q, K and V tensors; the projections
    themselves are done by the caller. Only the last two axes take part in
    the matrix products, so any number of leading axes is accepted, e.g.
    (batch, words, d_model) for plain self-attention or
    (batch, heads, words, d_model / heads) for multi-head attention.

    Returns:
        Tensor with the leading axes of ``query`` and the last axis of
        ``value``. For multi-head input the heads are NOT merged here; that
        is left to the caller.
    """
    # d_k is the size of the last axis of the query.
    scale = math.sqrt(query.size(-1))

    # Q K^T / sqrt(d_k): one square (words x words) score matrix per
    # leading index.
    key_t = key.transpose(-2, -1)
    scaled_scores = torch.matmul(query, key_t) / scale

    # Normalise each row of scores into attention weights.
    weights = torch.softmax(scaled_scores, dim=-1)

    # Weighted sum of the value vectors.
    return torch.matmul(weights, value)
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built on the module-level ``attention`` function."""

    def __init__(self, h, d_model):
        """Create the four d_model -> d_model projections (W^q, W^k, W^v, W^o).

        Args:
            h: Number of attention heads.
            d_model: Size of each token vector; must be divisible by ``h``.
        """
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0, "d_model must be divisible by h"
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # The layers are held in an nn.ModuleList rather than a plain list so
        # that they are registered as sub-modules: a plain list hides them
        # from parameters(), state_dict() and .to(device), which means an
        # optimizer would never train them.
        self.linears = nn.ModuleList(
            [nn.Linear(d_model, d_model) for _ in range(4)]
        )

    def forward(self, x):
        """Apply multi-head self-attention.

        Args:
            x: Tensor of size (batch, words, d_model).

        Returns:
            Tensor of size (batch, words, d_model).
        """
        nbatches = x.size(0)

        # 1. Project x with W^q, W^k, W^v, split the last axis into heads
        #    (batch, words, h, d_k), then move the head axis forward to get
        #    (batch, h, words, d_k).
        query, key, value = (
            proj(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj in self.linears[:3]
        )

        # 2. Attention per head; the result is (batch, h, words, d_k).
        heads = attention(query, key, value)

        # 3. Merge the heads again: swap back to (batch, words, h, d_k) and
        #    flatten the last two axes into d_model. contiguous() is needed
        #    because view() cannot operate on the transposed layout.
        merged = (
            heads.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )

        # 4. Final linear transform with W^o.
        return self.linears[-1](merged)
def test_MultiHeadAttention():
    """Smoke-test MultiHeadAttention and print the size of its output.

    Uses 8 heads on 256-dimensional token vectors; the input is a batch of
    2 sentences with 7 tokens each.
    """
    mha = MultiHeadAttention(8, 256)
    tokens = torch.rand(2, 7, 256)
    attended = mha(tokens)
    print(attended.size())
    print("test end.")
def test_TransformerEncoder():
    """Run a 2-layer nn.TransformerEncoder once and print the output size.

    Each layer uses 8 heads, 256-dimensional token vectors and a
    512-dimensional feed-forward block; the input is a batch of 2 sentences
    with 7 tokens each (batch-first layout).
    """
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=256,
        nhead=8,
        dim_feedforward=512,
        batch_first=True,
    )
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

    tokens = torch.rand(2, 7, 256)
    encoded = encoder(tokens)

    print(encoded.size())
    print("test end.")
def test_TransformerDecoder():
    """Run a 2-layer nn.TransformerDecoder once and print the output size.

    Each layer uses 8 heads, 256-dimensional token vectors and a
    512-dimensional feed-forward block. The memory is a batch of 2 sequences
    with 7 tokens each and the target has 6 tokens per sequence (batch-first
    layout).
    """
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=256,
        nhead=8,
        dim_feedforward=512,
        batch_first=True,
    )
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=2)

    memory = torch.rand(2, 7, 256)
    target = torch.rand(2, 6, 256)

    # The decoder takes the target first and the memory second.
    decoded = decoder(target, memory)

    print(decoded.size())
    print("test end.")
def main():
    """Run the currently selected demo between a start and an end banner."""
    print("main")
    # Other demos defined in this script that can be called here instead:
    # test_SelfAttention(), test_MultiHeadAttention(),
    # test_TransformerEncoder().
    test_TransformerDecoder()
    print("main end.")


if __name__ == "__main__":
    main()
例子2
#coding=utf-8
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
"""
https://blog.csdn.net/zhaohongfei_358/article/details/126019181
"""
def test_nnTransformer():
    """Feed one embedded sentence pair through nn.Transformer and print the output size.

    No positional encoding and no masks are applied in this minimal demo.
    """
    # Embedding table: vocabulary of 10 tokens, 128-dimensional vectors.
    embedding = nn.Embedding(10, 128)
    # The model dimension equals the embedding size. batch_first=True is
    # required because the inputs below are laid out as (batch, seq, dim).
    transformer = nn.Transformer(d_model=128, batch_first=True)

    # Source sentence, think of: <bos> w w w w w w <eos> <pad> <pad>
    src = torch.LongTensor([[0, 3, 4, 5, 6, 7, 8, 1, 2, 2]])
    # Target sentence, think of: <bos> w w w w w w <eos> <pad>
    tgt = torch.LongTensor([[0, 3, 4, 5, 6, 7, 8, 1, 2]])

    src_embedded = embedding(src)
    tgt_embedded = embedding(tgt)
    outputs = transformer(src_embedded, tgt_embedded)

    print(outputs.size())
    print("*" * 50)
    print(outputs.shape)
def get_key_padding_mask(tokens, pad_id=2):
    """Build an additive key-padding mask for a batch of token ids.

    Args:
        tokens: Integer tensor of token ids.
        pad_id: Token id that marks padding. Defaults to 2, the ``<pad>`` id
            used by the examples in this script.

    Returns:
        Float tensor with the same size and device as ``tokens``, holding
        ``-inf`` at padding positions and ``0`` everywhere else.
    """
    # Allocate on the tokens' device so the mask can be passed to a model
    # living on that device without an extra transfer.
    key_padding_mask = torch.zeros(tokens.size(), device=tokens.device)
    key_padding_mask[tokens == pad_id] = float("-inf")
    return key_padding_mask
def test_nnTransformer2():
    """Run nn.Transformer with a causal target mask and key-padding masks.

    Prints the three masks and then the size of the transformer output.
    """
    src = torch.LongTensor(
        [
            [0, 8, 3, 5, 5, 9, 6, 1, 2, 2, 2],
            [0, 6, 6, 8, 9, 1, 2, 2, 2, 2, 2],
        ]
    )
    tgt = torch.LongTensor(
        [
            [0, 8, 3, 5, 5, 9, 6, 1, 2, 2],
            [0, 6, 6, 8, 9, 1, 2, 2, 2, 2],
        ]
    )

    # Square mask sized by the target length.
    tgt_len = tgt.size(-1)
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_len)
    print(tgt_mask)

    # Masks that mark the <pad> positions of both sequences.
    src_key_padding_mask = get_key_padding_mask(src)
    tgt_key_padding_mask = get_key_padding_mask(tgt)
    print(src_key_padding_mask)
    print("=" * 50)
    print(tgt_key_padding_mask)

    # Embedding table: vocabulary of 10 tokens, 128-dimensional vectors.
    embedding = nn.Embedding(10, 128)
    # batch_first=True is required because inputs are (batch, seq, dim).
    transformer = nn.Transformer(d_model=128, batch_first=True)

    # No positional encoding is added in this demo.
    src_embedded = embedding(src)
    tgt_embedded = embedding(tgt)
    outputs = transformer(
        src_embedded,
        tgt_embedded,
        tgt_mask=tgt_mask,
        src_key_padding_mask=src_key_padding_mask,
        tgt_key_padding_mask=tgt_key_padding_mask,
    )

    print(outputs.size())
    print("*" * 50)
    print(outputs.shape)
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout."""

    def __init__(self, d_model, dropout, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: Size of each token vector. Both even and odd sizes are
                supported.
            dropout: Dropout probability applied after the encoding is added.
            max_len: Number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Encoding table of size (max_len, d_model).
        pe = torch.zeros(max_len, d_model)
        # Column vector of positions 0 .. max_len - 1, size (max_len, 1).
        position = torch.arange(0, max_len).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed through exp and log.
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term

        # PE(pos, 2i) = sin(...)
        pe[:, 0::2] = torch.sin(angles)
        # PE(pos, 2i + 1) = cos(...). With an odd d_model there is one fewer
        # odd column than even columns, so the extra frequency is dropped;
        # without the slice the assignment fails with a size mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)
        # A buffer is saved with the model and moved by .to(), but it is not
        # a parameter and so takes no part in gradient descent.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the encoding of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Embedded inputs of size (batch, words, d_model), e.g.
                (1, 7, 128).

        Returns:
            Tensor of the same size as ``x``.
        """
        # The buffer never requires grad, so no explicit detach is needed.
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)
class MyTransformer(nn.Module):
    """Small sequence-to-sequence model: embedding + positional encoding + nn.Transformer.

    The vocabulary has 10 tokens. ``forward`` returns the raw transformer
    output; the ``predictor`` layer is applied by the caller.
    """

    def __init__(self, d_model=128):
        """Build the model.

        Args:
            d_model: Size of the token vectors, used consistently by the
                embedding, the transformer, the positional encoding and the
                predictor. nn.Transformer is created with its default number
                of heads, which d_model must be divisible by.
        """
        super(MyTransformer, self).__init__()

        # Token embedding for a vocabulary of 10 ids.
        self.embedding = nn.Embedding(num_embeddings=10, embedding_dim=d_model)
        # Transformer with arbitrarily chosen hyper-parameters.
        self.transformer = nn.Transformer(
            d_model=d_model,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=512,
            batch_first=True,
        )
        # Positional encoder.
        self.positional_encoding = PositionalEncoding(d_model, dropout=0)
        # Final linear layer. No softmax here: a cross-entropy loss applied
        # by the caller already includes it.
        self.predictor = nn.Linear(d_model, 10)

    def forward(self, src, tgt):
        """Run the transformer on a batch of token-id sequences.

        Args:
            src: Integer tensor of source token ids, size (batch, src_len).
            tgt: Integer tensor of target token ids, size (batch, tgt_len).

        Returns:
            Transformer output of size (batch, tgt_len, d_model). The
            predictor is not applied because training and inference use the
            output differently.
        """
        # Masks are built from the raw ids and placed on the inputs' device.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(
            tgt.size()[-1]
        ).to(tgt.device)
        src_key_padding_mask = MyTransformer.get_key_padding_mask(src)
        tgt_key_padding_mask = MyTransformer.get_key_padding_mask(tgt)

        # Embed the ids, then add position information.
        src = self.positional_encoding(self.embedding(src))
        tgt = self.positional_encoding(self.embedding(tgt))

        out = self.transformer(
            src,
            tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
        )
        return out

    @staticmethod
    def get_key_padding_mask(tokens, pad_id=2):
        """Build an additive key-padding mask.

        Args:
            tokens: Integer tensor of token ids.
            pad_id: Token id that marks padding (2 by default).

        Returns:
            Float tensor with the size and device of ``tokens``: ``-inf`` at
            padding positions, ``0`` elsewhere.
        """
        key_padding_mask = torch.zeros(tokens.size(), device=tokens.device)
        key_padding_mask[tokens == pad_id] = float("-inf")
        return key_padding_mask
#定义一个生成随机数据的函数,模拟句子来训练
def generate_random_batch(batch_size, max_length=16):
    """Generate a random batch of padded token sequences for training.

    Every row is ``<bos>`` (0), then 1 to ``max_length - 2`` random tokens
    from 3..9, then ``<eos>`` (1), padded with ``<pad>`` (2) up to
    ``max_length``.

    Returns:
        A tuple ``(src, tgt, tgt_y, n_tokens)``:
        src is the full batch of size (batch_size, max_length);
        tgt is src without its last column (decoder input);
        tgt_y is src without its first column (prediction target);
        n_tokens is a scalar tensor counting the non-pad entries of tgt_y.
    """
    rows = []
    for _ in range(batch_size):
        # Draw the sentence length first, then its tokens.
        body_len = random.randint(1, max_length - 2)
        body = [random.randint(3, 9) for _ in range(body_len)]
        padding = [2] * (max_length - body_len - 2)
        rows.append([0, *body, 1, *padding])

    src = torch.LongTensor(rows)
    tgt = src[:, :-1]
    tgt_y = src[:, 1:]
    # Number of real (non-pad) tokens that have to be predicted.
    n_tokens = (tgt_y != 2).sum()
    return src, tgt, tgt_y, n_tokens
def test_train():
    """Train MyTransformer to copy random sequences, then decode one greedily.

    Training runs 2000 steps on random batches and prints the loss summed
    over each 40-step window. Afterwards a fixed source sentence is decoded
    token by token, starting from ``<bos>``, and the result is printed.
    """
    max_length = 16
    model = MyTransformer()
    # ignore_index=2 leaves <pad> targets out of the loss, so the default
    # mean reduction averages over real tokens only. Taking the mean over
    # every position and then dividing by the token count would normalise
    # twice and still let padding contribute.
    criterion = nn.CrossEntropyLoss(ignore_index=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    # ---- train ----
    total_loss = 0.0
    for step in range(2000):
        src, tgt, tgt_y, _ = generate_random_batch(
            batch_size=2, max_length=max_length
        )
        optimizer.zero_grad()
        # Transformer output followed by the final linear layer gives
        # logits of size (batch, words, vocab).
        out = model.predictor(model(src, tgt))
        # Flatten to (batch * words, vocab) against (batch * words,).
        loss = criterion(out.reshape(-1, out.size(-1)), tgt_y.reshape(-1))
        loss.backward()
        optimizer.step()
        # Accumulate a plain float rather than the loss tensor itself.
        total_loss += loss.item()
        # Report and reset every 40 steps.
        if step != 0 and step % 40 == 0:
            print("Step {}, total_loss: {}".format(step, total_loss))
            total_loss = 0.0

    # ---- predict ----
    model = model.eval()
    # An arbitrary source sentence.
    src = torch.LongTensor([[0, 4, 3, 4, 6, 8, 9, 9, 8, 1, 2, 2]])
    # Decoding starts from <bos>; the aim is to reproduce src.
    tgt = torch.LongTensor([[0]])
    # No gradients are needed while decoding.
    with torch.no_grad():
        # One token per iteration until <eos> or the maximum length.
        for _ in range(max_length):
            out = model(src, tgt)
            # Only the last position predicts the next token.
            predict = model.predictor(out[:, -1])
            y = torch.argmax(predict, dim=1)
            tgt = torch.cat([tgt, y.unsqueeze(0)], dim=1)
            if y.item() == 1:
                break
    print(tgt)
    print("test train end.")
def main():
    """Run the training demo between a start and an end banner."""
    print("main")
    test_train()
    print("end.")


if __name__ == "__main__":
    main()
例子3
#coding=utf-8
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
"""
下面网址里面的官网链接有相关代码
https://www.cnblogs.com/lfri/p/15044391.html
https://blog.csdn.net/adczsw/article/details/121719210
https://blog.csdn.net/SHU15121856/article/details/104448734
https://blog.csdn.net/weixin_44751294/article/details/124704785
注意: 如果是双向网络, 则D为2; 如果是单向网络, 则D为1
"""
#RNN单层例子
def test_RNNCell():
    """Unroll an nn.RNNCell by hand over a sequence and print the resulting shapes.

    The cell maps 10 input features to 20 hidden features; the sequence has
    5 time steps and a batch of 3.
    """
    cell = nn.RNNCell(input_size=10, hidden_size=20)
    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    hidden = torch.randn(3, 20)  # (batch, hidden_size)

    states = []
    for step_input in sequence:
        # A cell processes one time step; the new hidden state feeds the next.
        hidden = cell(step_input, hidden)
        states.append(hidden)

    stacked = torch.stack(states, dim=0)
    print(stacked.shape)
    print(hidden.shape)
    print("test end.")
def test_RNN():
    """Run a 6-layer nn.RNN over a whole sequence and print the resulting shapes."""
    num_layers = 6
    rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=num_layers)

    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    initial_hidden = torch.randn(num_layers, 3, 20)  # (num_layers, batch, hidden_size)

    output, final_hidden = rnn(sequence, initial_hidden)

    print(output.shape)
    print(final_hidden.shape)
    print("test end.")
def test_LSTMCell():
    """Unroll an nn.LSTMCell by hand over a sequence and print the resulting shapes."""
    cell = nn.LSTMCell(input_size=10, hidden_size=20)
    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    hidden = torch.randn(3, 20)  # (batch, hidden_size)
    cell_state = torch.randn(3, 20)  # (batch, hidden_size)

    states = []
    for step_input in sequence:
        # Each step consumes and returns the (hidden, cell) state pair.
        hidden, cell_state = cell(step_input, (hidden, cell_state))
        states.append(hidden)

    stacked = torch.stack(states, dim=0)
    print(stacked.shape)
    print(hidden.shape)
    print(cell_state.shape)
    print("test end.")
def test_LSTM():
    """Run a 6-layer nn.LSTM over a whole sequence and print the resulting shapes.

    In the shape comments below, D is 2 for a bidirectional network and 1
    for a unidirectional one; the network built here is unidirectional.
    """
    num_layers = 6
    lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=num_layers)

    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    initial_hidden = torch.randn(num_layers, 3, 20)  # (D*num_layers, batch, hidden_size)
    initial_cell = torch.randn(num_layers, 3, 20)  # (D*num_layers, batch, hidden_size)

    # output: (time_steps, batch, D*hidden_size)
    # final_hidden / final_cell: (D*num_layers, batch, hidden_size)
    output, (final_hidden, final_cell) = lstm(
        sequence, (initial_hidden, initial_cell)
    )

    print(output.shape)
    print(final_hidden.shape)
    print(final_cell.shape)
    print("test end.")
def test_GRUCell():
    """Unroll an nn.GRUCell by hand over a sequence and print the resulting shapes."""
    cell = nn.GRUCell(input_size=10, hidden_size=20)
    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    hidden = torch.randn(3, 20)  # (batch, hidden_size)

    states = []
    for step_input in sequence:
        # One time step per call; the new hidden state feeds the next step.
        hidden = cell(step_input, hidden)
        states.append(hidden)

    stacked = torch.stack(states, dim=0)
    print(stacked.shape)
    print(hidden.shape)
    print("test end.")
def test_GRU():
    """Run a 6-layer nn.GRU over a whole sequence and print the resulting shapes."""
    num_layers = 6
    gru = nn.GRU(input_size=10, hidden_size=20, num_layers=num_layers)

    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    initial_hidden = torch.randn(num_layers, 3, 20)  # (num_layers, batch, hidden_size)

    output, final_hidden = gru(sequence, initial_hidden)

    print(output.shape)
    print(final_hidden.shape)
    print("test end.")
def main():
    """Run the currently selected recurrent-network demo between two banners."""
    print("main")
    # Other demos defined in this script that can be called here instead:
    # test_RNN(), test_RNNCell(), test_LSTM(), test_LSTMCell(), test_GRU().
    test_GRUCell()
    print("end.")


if __name__ == "__main__":
    main()