def preprocess_raw(text):
    """Lower-case `text`, normalize non-breaking spaces to plain spaces, and
    insert a space before any ',', '!' or '.' that does not already follow one.

    Returns the transformed string.
    """
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    lowered = text.lower()
    # Collect pieces in a list and join once: repeated `out += char` is
    # quadratic in the worst case.
    pieces = []
    for i, char in enumerate(lowered):
        # Note: the previous-character check reads `text` (pre-lowering),
        # matching the original behavior; spaces are unaffected by lowering.
        if char in (',', '!', '.') and i > 0 and text[i - 1] != ' ':
            pieces.append(' ')
        pieces.append(char)
    return ''.join(pieces)
分词（Tokenization）
num_examples = 50000
source, target = [], []
# Keep at most num_examples + 1 lines — this slice reproduces the original
# `if i > num_examples: break` cutoff exactly.
for line in text.split('\n')[:num_examples + 1]:
    fields = line.split('\t')
    if len(fields) >= 2:
        source.append(fields[0].split(' '))
        target.append(fields[1].split(' '))
source[0:3], target[0:3]
建立词典（Build the vocabulary）
def build_vocab(tokens):
    """Flatten a list of token lists and build a d2l Vocab over it
    (min_freq=3, with the special pad/bos/eos tokens enabled).
    """
    flat = []
    for sentence in tokens:
        flat.extend(sentence)
    return d2l.data.base.Vocab(flat, min_freq=3, use_special_tokens=True)
载入数据集（Load the dataset）
def pad(line, max_len, padding_token):
    """Truncate or right-pad `line` (a list of tokens/ids) to exactly `max_len`."""
    length = len(line)
    if length > max_len:
        return line[:max_len]
    return line + [padding_token] * (max_len - length)
def build_array(lines, vocab, max_len, is_source):
    """Map token lines to id tensors padded to `max_len`.

    For target sequences (is_source=False), <bos>/<eos> ids are added around
    each line first. Returns (array, valid_len) where valid_len counts the
    non-pad entries of each row.
    """
    id_lines = [vocab[tokens] for tokens in lines]
    if not is_source:
        id_lines = [[vocab.bos] + ids + [vocab.eos] for ids in id_lines]
    array = torch.tensor([pad(ids, max_len, vocab.pad) for ids in id_lines])
    valid_len = (array != vocab.pad).sum(1)  # per-row count of non-pad ids
    return array, valid_len
def load_data_nmt(batch_size, max_len):
    """Build vocabularies and a shuffled DataLoader over the module-level
    (source, target) corpus.

    Returns (src_vocab, tgt_vocab, train_iter); each batch yields
    (src_array, src_valid_len, tgt_array, tgt_valid_len).
    """
    src_vocab = build_vocab(source)
    tgt_vocab = build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    dataset = data.TensorDataset(src_array, src_valid_len,
                                 tgt_array, tgt_valid_len)
    loader = data.DataLoader(dataset, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, loader
图解实现机制（Implementation mechanism, illustrated）
Encoder
class Seq2SeqEncoder(d2l.Encoder):
    """LSTM encoder for sequence-to-sequence learning.

    Embeds token ids, then runs a (possibly multi-layer) LSTM over the
    time-major sequence.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers,
                           dropout=dropout)

    def begin_state(self, batch_size, device):
        """Return zeroed [h, c] tensors, each (num_layers, batch_size, num_hiddens)."""
        shape = (self.num_layers, batch_size, self.num_hiddens)
        return [torch.zeros(size=shape, device=device),
                torch.zeros(size=shape, device=device)]

    def forward(self, X, *args):
        # X: (batch_size, seq_len) token ids.
        embedded = self.embedding(X)          # (batch_size, seq_len, embed_size)
        embedded = embedded.transpose(0, 1)   # LSTM expects time-major input
        # out: (seq_len, batch_size, num_hiddens); state is the (h, c) pair
        # from the last step, each (num_layers, batch_size, num_hiddens).
        out, state = self.rnn(embedded)
        return out, state
# Quick shape check of the encoder on a dummy batch.
X = torch.zeros((4, 7), dtype=torch.long)  # 4 sequences, 7 time steps each
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8,
                         num_hiddens=16, num_layers=2)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape
损失函数（Loss function）
def SequenceMask(X, X_len, value=0):
    """In place, overwrite entries at or beyond each row's valid length.

    X: (batch, seq_len); X_len: (batch,) valid lengths.
    Returns the mutated X.
    """
    seq_len = X.size(1)
    positions = torch.arange(seq_len, device=X_len.device)[None, :]
    valid = positions < X_len[:, None]
    X[~valid] = value
    return X
# Demo: zero out entries past each row's valid length.
X = torch.tensor([[1, 2, 3], [4, 5, 6]])
SequenceMask(X, torch.tensor([1, 2]))


class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy that ignores padded positions.

    pred: (batch_size, seq_len, vocab_size)
    label: (batch_size, seq_len)
    valid_length: (batch_size,)
    Returns the per-sequence loss averaged over the seq_len dimension.
    """

    def forward(self, pred, label, valid_length):
        # Sample weights: 1 on valid positions, 0 on padding.
        weights = torch.ones_like(label)
        # Pass value=0 explicitly: this file later redefines SequenceMask with
        # a default of value=-1e6, which would corrupt these 0/1 weights if we
        # relied on the default.
        weights = SequenceMask(weights, valid_length, value=0).float()
        self.reduction = 'none'  # keep per-token losses so they can be weighted
        # CrossEntropyLoss wants (batch, vocab_size, seq_len).
        output = super(MaskedSoftmaxCELoss, self).forward(
            pred.transpose(1, 2), label)
        # Padded positions contribute 0 to the mean over the sequence axis.
        return (output * weights).mean(dim=1)
def SequenceMask(X, X_len, value=-1e6):
    """In place, set entries at positions >= each row's valid length to `value`.

    X: (batch, seq_len); X_len: (batch,) valid lengths.
    Returns the mutated X.

    NOTE: this intentionally redefines the earlier SequenceMask with a
    different default `value` (a large negative, suitable for pre-softmax
    scores).
    """
    maxlen = X.size(1)
    # Build the position grid on X_len's device so this also works on GPU
    # tensors (the earlier definition did this; here the .to() was missing).
    mask = torch.arange(maxlen, dtype=torch.float,
                        device=X_len.device)[None, :] >= X_len[:, None]
    X[mask] = value
    return X
def masked_softmax(X, valid_length):
    """Softmax over the last axis of 3-D X, ignoring positions past valid_length.

    X: (batch, rows, cols). valid_length: None (plain softmax), (batch,)
    — one length shared by every row of a batch item — or (batch, rows).
    Masked positions receive a large negative score, so their softmax
    weight is ~0.
    """
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    shape = X.shape
    if valid_length.dim() == 1:
        # Repeat each batch item's length once per row of scores.
        # repeat_interleave stays on the tensor's device, replacing the
        # original numpy round-trip and its bare `except:`.
        valid_length = torch.repeat_interleave(valid_length.float(),
                                               shape[1], dim=0)
    else:
        valid_length = valid_length.reshape((-1,))
    # Fill masked elements with a large negative value, whose exp is ~0.
    X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
    return softmax(X).reshape(shape)
点积注意力实现（Dot-product attention implementation）
# Save to the d2l package.
class DotProductAttention(nn.Module):
    """Scaled dot-product attention.

    query: (batch_size, #queries, d)
    key:   (batch_size, #kv_pairs, d)
    value: (batch_size, #kv_pairs, dim_v)
    valid_length: None, (batch_size,), or (batch_size, #queries)
    Returns (batch_size, #queries, dim_v).
    """

    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # Scale by sqrt(d) so score magnitudes don't grow with the key dim;
        # transpose(1, 2) swaps key's last two dimensions for the bmm.
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        # (Debug print of the weights removed — it spammed stdout on every call.)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
多层感知机注意力（Multilayer-perceptron attention）
# Save to the d2l package.
class MLPAttention(nn.Module):
    """Additive (MLP) attention: score = v(W_q(query) + W_k(key)),
    broadcast over every query/key pair.

    query/key are projected from ipt_dim to units; valid_length masks keys.
    Returns (batch_size, #queries, dim_v).
    """

    def __init__(self, units, ipt_dim, dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length):
        # Fixed: the original applied W_k to the query and W_q to the key
        # (swapped). Both layers have the same shape, so this only corrects
        # which learned projection serves which role.
        query, key = self.W_q(query), self.W_k(key)
        # Broadcast-add (batch, #queries, 1, units) + (batch, 1, #kv_pairs, units).
        features = query.unsqueeze(2) + key.unsqueeze(1)
        # NOTE(review): standard additive attention applies tanh to `features`
        # before `v`; this implementation omits it — confirm intent.
        scores = self.v(features).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
添加后代码更改（Decoder code changes after adding the attention mechanism）
class Seq2SeqAttentionDecoder(d2l.Decoder):
    """LSTM decoder that attends over the encoder outputs at every step.

    At each time step, the last RNN layer's hidden state queries the encoder
    outputs; the resulting context vector is concatenated with the step's
    embedding and fed to the LSTM.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = MLPAttention(num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # RNN input is the embedding concatenated with the attention context.
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers,
                           dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        """Package encoder results as decoder state; encoder outputs are
        transposed to (batch_size, seq_len, hidden_size)."""
        outputs, hidden_state = enc_outputs
        return (outputs.permute(1, 0, -1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        # (batch, steps) ids -> time-major embeddings (steps, batch, embed).
        embedded = self.embedding(X).transpose(0, 1)
        step_outputs = []
        for step_input in embedded:
            # Query with the last RNN layer's hidden state: (batch, 1, hidden).
            query = hidden_state[0][-1].unsqueeze(1)
            # Context has the same shape as the query.
            context = self.attention_cell(query, enc_outputs, enc_outputs,
                                          enc_valid_len)
            # Concatenate on the feature axis, then reshape to
            # (1, batch, embed + hidden) for a single RNN step.
            rnn_input = torch.cat((context, step_input.unsqueeze(1)), dim=-1)
            out, hidden_state = self.rnn(rnn_input.transpose(0, 1),
                                         hidden_state)
            step_outputs.append(out)
        outputs = self.dense(torch.cat(step_outputs, dim=0))
        return outputs.transpose(0, 1), [enc_outputs, hidden_state,
                                         enc_valid_len]