The input part consists of:
- the source text embedding layer and its positional encoder
- the target text embedding layer and its positional encoder
Code implementation of the text embedding layer and the positional encoder:
Embedding:
import torch
from torch import nn
import math

class Embeddings(nn.Module):
    def __init__(self, vocab, d_model):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab, d_model)

    def forward(self, input):
        # Look up the embedding of each token id, then scale by sqrt(d_model)
        # so the embeddings keep a comparable magnitude to the positional
        # encodings that are added afterwards.
        embedded = self.embedding(input)
        return embedded * math.sqrt(self.d_model)

if __name__ == '__main__':
    d_model = 512
    vocab = 1000
    input = torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]])
    emb = Embeddings(vocab, d_model)
    out = emb(input)
    print(out)
    print(out.shape)
Out:
tensor([[[ 33.0487, 10.5949, 24.7869, ..., 32.2125, 6.6365, -9.1670],
[ 8.0757, -15.8932, -15.6231, ..., 1.0692, 17.7031, 10.5373],
[ 17.2680, 18.7539, 39.5824, ..., -21.3367, -13.4020, -9.0699],
[ 8.7345, 29.4133, -44.8271, ..., 38.8789, -2.8853, -65.1093]],
[[ -1.6227, -20.3071, -18.3763, ..., -1.9169, 43.6601, 17.1877],
[ 10.1663, 51.5647, 19.9411, ..., 26.7301, 16.3152, 8.1196],
[-22.3285, 28.9032, 87.8063, ..., 38.0816, -2.2564, 32.9298],
[-17.6660, -0.8846, 8.9129, ..., 1.4339, -33.7272, 14.8318]]],
grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
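A quick sanity check on the sqrt(d_model) scaling (a minimal sketch, not from the original notes): nn.Embedding draws its weights from a standard normal distribution, so after multiplying by sqrt(512) ≈ 22.6 the elements of the output should have a standard deviation close to that value, which matches the magnitudes printed above.
import math
import torch

d_model = 512
emb = Embeddings(1000, d_model)
out = emb(torch.LongTensor([[100, 2, 421, 508]]))
# Weights ~ N(0, 1), so the scaled output's standard deviation should be
# roughly sqrt(d_model) ~ 22.6, up to sampling noise.
print(out.std().item(), math.sqrt(d_model))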
Positional Encoding:
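The class below implements the sinusoidal scheme from "Attention Is All You Need": for position pos and dimension pair index i,
PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
The div_term in the code is the frequency factor 1 / 10000^(2i / d_model), computed in log space for numerical stability.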
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Initialize the positional encoding matrix, of size max_len x d_model.
        pe = torch.zeros(max_len, d_model)
        # Absolute position matrix holding each position index, of size max_len x 1.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Build the encodings by multiplying the max_len x 1 position matrix with
        # the 1 x d_model/2 frequency vector div_term; since sine fills the even
        # columns and cosine the odd ones, d_model/2 frequencies suffice.
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension and register the result as a buffer: it moves with
        # the module (.to(device)) and is saved in state_dict, but is not trained.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add the encodings of the first x.size(1) positions, then apply dropout.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
if __name__ == '__main__':
    d_model = 512
    vocab = 10
    input = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
    emb = Embeddings(vocab, d_model)
    out = emb(input)
    dropout = 0.2
    max_len = 50
    x = out
    pe = PositionalEncoding(d_model, dropout, max_len)
    out_pe = pe(x)
    print(out_pe)
    print(out_pe.shape)
Out:
tensor([[[ 0.0000, 37.0802, 5.8806, ..., 0.0000, -0.0000, -0.0000],
[-12.7432, 29.2241, -12.9079, ..., 0.0000, 6.4460, -0.0000],
[-19.4035, 2.4065, -31.3536, ..., -0.0000, -38.7197, 54.6770],
[ 0.0000, -7.8297, 30.8798, ..., 6.0458, 56.0214, 0.0000]],
[[-31.7881, -3.7327, -8.3707, ..., -44.0128, -16.7447, 6.9718],
[ 40.8189, 0.0000, 8.1482, ..., -44.5094, -0.0000, -19.8663],
[ 16.3972, -8.9104, -16.2789, ..., 41.8107, -8.1127, -4.7275],
[-41.2381, 19.6516, -42.2035, ..., 9.2604, -7.5968, -38.6914]]],
grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
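A quick check that the log-space div_term equals the closed-form frequencies (a minimal sketch reusing the expression from the class above):
import math
import torch

d_model = 512
div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
# Closed-form frequencies 1 / 10000^(2i / d_model) for 2i = 0, 2, ..., d_model - 2.
direct = 1.0 / torch.pow(10000.0, torch.arange(0, d_model, 2).float() / d_model)
print(torch.allclose(div_term, direct))  # True, up to float precision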
Plotting the distribution curves of individual features in the encoded vectors:
import numpy as np
import matplotlib.pyplot as plt

# Create a 15 x 5 figure.
plt.figure(figsize=(15, 5))
# Instantiate PositionalEncoding with d_model=20 and dropout=0,
# so the encodings pass through unchanged.
pe = PositionalEncoding(20, 0)
# Feeding an all-zero tensor means the output is exactly the positional encoding.
y = pe(torch.zeros(1, 100, 20))
# x axis: positions 0 to 99; y axis: the value of a feature dimension at each
# position. Of the 20 dimensions, only dimensions 4 to 7 are plotted here.
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
# Label each curve with its dimension index.
plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
plt.show()
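In practice the two modules are chained: token ids pass through Embeddings and then PositionalEncoding, for both the source and the target side. A minimal sketch of that composition (the hyperparameter values here are only illustrative):
import torch
from torch import nn

d_model, vocab, dropout, max_len = 512, 1000, 0.1, 5000
# Embedding lookup followed by positional encoding: the same structure serves
# the source and target inputs of the Transformer.
input_layer = nn.Sequential(
    Embeddings(vocab, d_model),
    PositionalEncoding(d_model, dropout, max_len),
)
tokens = torch.LongTensor([[100, 2, 421, 508]])
print(input_layer(tokens).shape)  # torch.Size([1, 4, 512])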