seq2seq中的函数注解
载入数据集部分
def pad(line, max_len, padding_token):
    """Truncate or pad ``line`` (a list of token ids) to exactly ``max_len``.

    Lines longer than ``max_len`` are cut off; shorter ones are extended
    with ``padding_token``. A new list is always returned.
    """
    truncated = line[:max_len]
    fill_count = max_len - len(truncated)
    return truncated + [padding_token] * fill_count
# Demo: pad the first source sentence to length 10.
print(pad(src_vocab[source[0]], 10, src_vocab.pad)) # source[0] has two tokens, so two real ids (e.g. 38, 4) appear before the pad zeros
print(source[0])
输出:
[38, 4, 0, 0, 0, 0, 0, 0, 0, 0]
['go', '.']
def build_array(lines, vocab, max_len, is_source):
    """Map token lines to a padded id tensor plus per-row valid lengths.

    ``lines`` is a list of token lists; ``vocab`` maps a token list to an
    id list and exposes ``bos``/``eos``/``pad`` ids. Returns ``(array,
    valid_len)`` where ``valid_len`` counts non-pad entries per row.
    """
    id_lines = [vocab[tokens] for tokens in lines]
    if not is_source:
        # Only target sentences get <bos>/<eos> markers.
        id_lines = [[vocab.bos] + ids + [vocab.eos] for ids in id_lines]
    # Truncate/pad each row to max_len with the <pad> id (inlined pad()).
    padded = [ids[:max_len] + [vocab.pad] * (max_len - len(ids))
              for ids in id_lines]
    array = torch.tensor(padded)
    # Sum over dim 1: number of entries per row that are not the pad id.
    valid_len = (array != vocab.pad).sum(1)
    return array, valid_len
def load_data_nmt(batch_size, max_len):  # This function is saved in d2l.
    """Build the source/target vocabularies and a shuffled DataLoader.

    Yields minibatches of (src_array, src_valid_len, tgt_array,
    tgt_valid_len) built from the module-level ``source``/``target`` lines.
    """
    src_vocab = build_vocab(source)
    tgt_vocab = build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    dataset = data.TensorDataset(src_array, src_valid_len,
                                 tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(dataset, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
# Inspect a single minibatch, then stop.
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
          '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break
输出:
X = tensor([[ 348, 15, 188, 4, 0, 0, 0, 0],
[ 8, 15, 7, 2783, 0, 0, 0, 0]], dtype=torch.int32)
Valid lengths for X = tensor([4, 4])
Y = tensor([[ 1, 205, 59, 13, 2, 0, 0, 0],
[ 1, 44, 19, 580, 6, 2, 0, 0]], dtype=torch.int32)
Valid lengths for Y = tensor([5, 6])
transpose函数用法
torch.transpose(input, dim0, dim1, out=None) → Tensor
返回输入张量 input 的转置:交换维度 dim0 和 dim1。输出张量和输入张量共享内存(是同一存储上的视图)。
对于二维矩阵:
# 2-D example: transposing swaps rows and columns.
B = torch.tensor([[2, 2], [4, 4]])
print(B)
print("====")
B_t = torch.transpose(B, 0, 1)  # view of B: shares the underlying storage
print(B_t)
输出:
tensor([[2, 2],
[4, 4]])
====
tensor([[2, 4],
[2, 4]])
对于三维矩阵:
# 3-D example: transpose(0, 1) swaps the first two axes; each length-4
# row can be read as a single element being moved around.
A = torch.tensor([
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    [[9, 10, 11, 12], [13, 14, 15, 16]],
])
print(A.shape)
print(A)
print("====")
print(torch.transpose(A, 0, 1))
[1,2,3,4]可以看作一个元素去理解即可,输出:
torch.Size([2, 2, 4])
tensor([[[ 1, 2, 3, 4],
[ 5, 6, 7, 8]],
[[ 9, 10, 11, 12],
[13, 14, 15, 16]]])
====
tensor([[[ 1, 2, 3, 4],
[ 9, 10, 11, 12]],
[[ 5, 6, 7, 8],
[13, 14, 15, 16]]])
numpy tile函数
numpy.tile(A, reps)
Construct an array by repeating A the number of times given by reps.
重复数组A,reps次构建一个新数组。
If reps has length d, the result will have dimension of max(d, A.ndim).
如果reps有长度d,则结果的维度为d或A.ndim中的最大值。
If A.ndim < d, A is promoted to be d-dimensional by prepending new axes. If A.ndim > d, reps is promoted to A.ndim by pre-pending 1's to it.
如果 A.ndim < d,则通过在前面补充新轴把 A 提升为 d 维;如果 A.ndim > d,则通过在 reps 前面补 1 把它提升到 A.ndim。
# numpy.tile demos: repeat an array a given number of times per axis.
A = np.array([0, 1, 2])
print('A的维度', A.ndim)      # 1-D input
print(np.tile(A, 2))          # scalar reps: repeat along the last axis
print('--')
print(np.tile(A, (2, 2)))     # reps longer than A.ndim: A is promoted to 2-D
print('--')
print(np.tile(A, (2, 1, 2)))  # promoted to 3-D by prepending axes
print('======================')
B = np.array([[1, 2], [3, 4]])
print(B)
print('B的维度', B.ndim)
print(np.tile(B, 2))          # reps shorter than B.ndim: treated as (1, 2)
print('--')
print(np.tile(B, (2, 1)))
# BUG FIX: the original fused two statements on one line
# (`print(np.tile(B,(2,1)))A = np.array([0,1,2])`), a paste artifact that
# is a SyntaxError; the accidental duplicate re-run of the first three demo
# lines (absent from the recorded output below) was dropped.
输出:
A的维度 1
[0 1 2 0 1 2]
--
[[0 1 2 0 1 2]
[0 1 2 0 1 2]]
--
[[[0 1 2 0 1 2]]
[[0 1 2 0 1 2]]]
======================
[[1 2]
[3 4]]
B的维度 2
[[1 2 1 2]
[3 4 3 4]]
--
[[1 2]
[3 4]
[1 2]
[3 4]]
模型微调
微调由以下4步构成:
- 在源数据集(如ImageNet数据集)上预训练一个神经网络模型,即源模型。
- 创建一个新的神经网络模型,即目标模型。它复制了源模型上除了输出层外的所有模型设计及其参数。我们假设这些模型参数包含了源数据集上学习到的知识,且这些知识同样适用于目标数据集。我们还假设源模型的输出层跟源数据集的标签紧密相关,因此在目标模型中不予采用。
- 为目标模型添加一个输出大小为目标数据集类别个数的输出层,并随机初始化该层的模型参数。
- 在目标数据集(如椅子数据集)上训练目标模型。我们将从头训练输出层,而其余层的参数都是基于源模型的参数微调得到的。
前 L-1 层(复用预训练参数)用较小的学习率微调;
第 L 层(随机初始化的新输出层)用较大的学习率从头训练。