Data Preparation
import torch
import spacy
from torchtext.data import Field,BucketIterator
from torchtext.datasets import Multi30k
#Load the spaCy language models and build the tokenizers
de_seq=spacy.load("de_core_news_sm")
en_seq=spacy.load("en_core_web_sm")
def src_token(text):
    return [word.text for word in de_seq.tokenizer(text)]
def trg_token(text):
    return [word.text for word in en_seq.tokenizer(text)]
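As a quick check that the tokenizers behave as expected, they can be run on a sample sentence (the sentences below are only illustrations, not taken from Multi30k):
#Try the tokenizers on one sample sentence each (illustrative input)
print(src_token("Zwei junge Männer spielen Fußball."))
print(trg_token("Two young men are playing football."))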
#Field definitions for the source and target languages
SRC=Field(tokenize=src_token,
          init_token="<sos>",
          eos_token="<eos>",
          lower=True)
TRG=Field(tokenize=trg_token,
          init_token="<sos>",
          eos_token="<eos>",
          lower=True)
#Build the datasets and the vocabularies
train_data,val_data,test_data=Multi30k.splits(exts=(".de",".en"),
                                              fields=(SRC,TRG))
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)
#Build the iterators
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size=128
train_iter,val_iter,test_iter=BucketIterator.splits(
    (train_data,val_data,test_data),
    batch_size=batch_size,
    device=device
)
#Test: grab one batch from the training iterator
for example in train_iter:
    break
print('1: source device, target device',example.src.device,example.trg.device,len(example))
print('2: source shape, target shape',example.src.shape,example.trg.shape)
print('3: source batch',example.src)
print('4: target batch',example.trg)
Output:
1: source device, target device cuda:0 cuda:0 128
2: source shape, target shape torch.Size([31, 128]) torch.Size([35, 128])
3: source batch tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,  15,   5,  ...,   5,   8,  18],
        [ 13, 243,   0,  ..., 431,  16,   0],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
4: target batch tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    7,    4,  ...,    4,    4,   16],
        [   9,   26, 4522,  ...,   64,   14,    0],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
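The tensors above are [src_len, batch] / [trg_len, batch]; every sequence starts with index 2 and is padded with index 1. That matches torchtext's default ordering of the special tokens, which can be checked directly (a quick sanity check, assuming the legacy torchtext Field API used above):
#Inspect the special-token indices: <unk>=0, <pad>=1, <sos>=2, <eos>=3
print(SRC.vocab.itos[:4])
print(TRG.vocab.stoi[TRG.pad_token])
print('vocab sizes:',len(SRC.vocab),len(TRG.vocab))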
Model Building
#For the tests below, move the batch dimension to the front (the iterators yield [seq_len, batch])
src=example.src.permute(1,0)
trg=example.trg.permute(1,0)
#Start building the model
import torch.nn as nn
class Encoder(nn.Module):
    def __init__(self,src_vocab_size,emb_size,enc_hidden_size,dec_hidden_size,dropout=0.5):
        super(Encoder,self).__init__()
        #instantiate the layers
        #<pad> has index 1 in the vocabulary built above, so use it as padding_idx
        self.embeding=nn.Embedding(src_vocab_size,emb_size,padding_idx=1)
        self.rnn=nn.GRU(emb_size,enc_hidden_size,bidirectional=True,batch_first=True)
        #bidirectional=True makes the GRU read the sentence in both directions
        self.fc=nn.Linear(2*enc_hidden_size,dec_hidden_size)
        self.dropout=nn.Dropout(dropout)
    def forward(self,src):
        #src[batch src_len] original shape
        src=self.dropout(self.embeding(src))#src[batch src_len emb_size] after embedding and dropout
        outputs,f_n=self.rnn(src) #run the embedded source through the bidirectional GRU
        #outputs[batch src_len enc_hidden_size*2], f_n[2 batch enc_hidden_size]
        #this step has no counterpart in a plain unidirectional LSTM/GRU encoder:
        #combine the final hidden states of the two directions
        forward_hidden=f_n[-2,:,:]
        backward_hidden=f_n[-1,:,:]
        #forward_hidden[batch enc_hidden_size]
        #backward_hidden[batch enc_hidden_size]
        f_b_hidden=torch.cat((forward_hidden,backward_hidden),dim=1)#concatenate the two directions
        #f_b_hidden[batch enc_hidden_size*2]
        f_n=self.fc(f_b_hidden).unsqueeze(0)#project to dec_hidden_size, then add the layer dimension back
        #f_n[1 batch dec_hidden_size]
        return outputs,f_n
src_vocab_size=len(SRC.vocab)
trg_vocab_size=len(TRG.vocab)
emb_size=256
enc_hidden_size=512
dec_hidden_size=512
#Test the encoder
enModel=Encoder(src_vocab_size,emb_size,enc_hidden_size,dec_hidden_size).to(device)
enOutputs,h_n=enModel(src)
print('output shape, h_n shape:',enOutputs.shape,h_n.shape)
Output:
output shape, h_n shape: torch.Size([128, 31, 1024]) torch.Size([1, 128, 512])
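These shapes are consistent with the hyper-parameters: the last dimension of the outputs is 2*enc_hidden_size = 1024 because the GRU is bidirectional, and the projected hidden state has size dec_hidden_size = 512. A couple of asserts on the tensors produced above make this explicit:
#Sanity-check the encoder shapes for this batch (batch=128, src_len=31)
assert enOutputs.shape[-1]==2*enc_hidden_size          #forward and backward states are concatenated
assert h_n.shape==(1,src.shape[0],dec_hidden_size)     #one projected hidden state per sentence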
class Attention(nn.Module):
    def __init__(self,enc_hidden_size,dec_hidden_size):
        super(Attention,self).__init__()
        self.attn=nn.Linear(enc_hidden_size*2+dec_hidden_size,dec_hidden_size)
        self.v=nn.Linear(dec_hidden_size,1,bias=False)
    def forward(self,encode_outputs,h_n):
        #encode_outputs[batch src_len 2*enc_hidden_size]
        #h_n[1 batch dec_hidden_size]
        src_len=encode_outputs.shape[1] #length of the source sentences
        #repeat expands a tensor along the given dimensions, e.g. repeat(2,3) doubles dim 0 and triples dim 1
        #see: https://blog.csdn.net/tequila53/article/details/119183678
        h_n=h_n.permute(1,0,2).repeat(1,src_len,1)#copy the decoder hidden state once per source position
        #h_n[batch src_len dec_hidden_size] encode_outputs[batch src_len 2*enc_hidden_size]
        inputs=torch.cat((encode_outputs,h_n),dim=2) #concatenate encoder outputs and the repeated hidden state
        #inputs[batch src_len 2*enc_hidden_size+dec_hidden_size]
        energy=self.attn(inputs) #unnormalized attention features
        #energy[batch src_len dec_hidden_size]
        scores=self.v(energy).squeeze(2)
        #scores[batch src_len]
        return nn.functional.softmax(scores,dim=1)
attnModel=Attention(enc_hidden_size,dec_hidden_size).to(device)
a=attnModel(enOutputs,h_n)
print('attention output shape:',a.shape)
print(a.sum(dim=1))
#sum(dim=1) sums across each row; dim=0 would sum down the columns
#see: https://blog.csdn.net/weixin_45281949/article/details/103282148
Output:
attention output shape: torch.Size([128, 31])
tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
       device='cuda:0', grad_fn=<SumBackward1>)
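Because the weights along each row sum to 1, they can be used directly as coefficients for a weighted sum of the encoder outputs; the decoder below does exactly this with torch.bmm. A quick check using the tensors from the tests above:
#Batched weighted sum: [batch 1 src_len] x [batch src_len 2*enc_hidden_size] -> [batch 1 2*enc_hidden_size]
context=torch.bmm(a.unsqueeze(1),enOutputs)
print(context.shape)   #torch.Size([128, 1, 1024])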
class Decoder(nn.Module):
    def __init__(self,trg_vocab_size,emb_size,en_hidden_size,dec_hidden_size,attention,dropout=0.5):
        super(Decoder,self).__init__()
        #store the attention module and instantiate the layers
        self.attn=attention
        self.emb=nn.Embedding(trg_vocab_size,emb_size)
        self.rnn=nn.GRU(emb_size+en_hidden_size*2,dec_hidden_size,batch_first=True)
        self.fn=nn.Linear(emb_size+en_hidden_size*2+dec_hidden_size,trg_vocab_size)
        self.dropout=nn.Dropout(dropout)
    def forward(self,input_trg,h_n,en_outputs):
        #input_trg[batch]
        #h_n[1 batch dec_hidden_size]
        #en_outputs[batch src_len en_hidden_size*2]
        #first, embed the current target tokens
        input_trg=input_trg.unsqueeze(1)
        #input_trg[batch 1]
        input_trg=self.dropout(self.emb(input_trg))
        #input_trg[batch 1 emb_size]
        #compute the attention weights over the encoder outputs, then add a dimension for bmm
        a=self.attn(en_outputs,h_n)
        #a[batch src_len]
        a=a.unsqueeze(1)
        #a[batch 1 src_len]
        weighted=torch.bmm(a,en_outputs)
        #weighted[batch 1 en_hidden_size*2]
        #bmm is batched matrix multiplication of two 3-D tensors
        #see: https://blog.csdn.net/sinat_34328764/article/details/105155644
        #concatenate the embedded token and the context vector
        concat=torch.cat((input_trg,weighted),dim=2)
        #concat[batch 1 en_hidden_size*2+emb_size]
        #run one step of the (unidirectional) GRU; h_n is already [1 batch dec_hidden_size]
        output,h_n=self.rnn(concat,h_n)
        #output[batch 1 dec_hidden_size], h_n[1 batch dec_hidden_size]
        #concatenate the GRU input and output, then drop the length-1 dimension
        concat=torch.cat((concat,output),dim=2)
        #concat[batch 1 en_hidden_size*2+emb_size+dec_hidden_size]
        concat=concat.squeeze(1)
        #concat[batch en_hidden_size*2+emb_size+dec_hidden_size]
        #finally, project to the target vocabulary
        output=self.fn(concat)
        #output[batch trg_vocab_size]
        return output,h_n
An error occurred here at first:
RuntimeError: For batched 3-D input, hx should also be 3-D but got 4-D tensor
It happens when h_n, which the encoder already returns as [1 batch dec_hidden_size], is unsqueezed again before being fed to the GRU, making it 4-D. The decoder above therefore keeps h_n 3-D and passes it to the GRU as-is.
import random
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder=encoder
        self.decoder=decoder
    def forward(self,src,trg,teach_threshold=0.5):
        #src[batch src_len]
        #trg[batch trg_len]
        #run the encoder to get en_outputs and the initial decoder hidden state
        en_outputs,h_n=self.encoder(src)
        #en_outputs[batch src_len enc_hidden_size*2]
        #h_n[1 batch dec_hidden_size]
        batch=trg.shape[0]
        trg_len=trg.shape[1]
        #tensor that collects the decoder outputs (position 0, the <sos> step, stays all zeros)
        outputs=torch.zeros(batch,trg_len,trg_vocab_size).to(device)
        inputs=trg[:,0]
        for t in range(1,trg_len):
            output,h_n=self.decoder(inputs,h_n,en_outputs)
            #output[batch trg_vocab_size]
            #h_n[1 batch dec_hidden_size]
            outputs[:,t,:]=output
            #teacher forcing: with probability teach_threshold feed the ground-truth token,
            #otherwise feed the decoder's own most likely prediction
            probability=random.random()
            inputs=trg[:,t] if probability<teach_threshold else output.argmax(1)
        return outputs
#Instantiate the decoder and the full Seq2Seq model, then run it on the test batch
deModel=Decoder(trg_vocab_size,emb_size,enc_hidden_size,dec_hidden_size,attnModel).to(device)
model=Seq2Seq(enModel,deModel).to(device)
outputs=model(src,trg)
outputs[0,0,:],outputs[0,1,:]  #step 0 is all zeros; step 1 holds the first real predictions
Training the Model
import time,math
from torch.optim import Adam
epochs=10
optim=Adam(model.parameters())
criterion=nn.CrossEntropyLoss(ignore_index=1)  #index 1 is <pad>, so padding positions do not contribute to the loss
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
model.apply(init_weights)
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
def train(model,data_iter,optim,criterion,clip):
    model.train()
    lossAll=0
    for example in data_iter:
        src=example.src.permute(1,0)
        trg=example.trg.permute(1,0)
        output=model(src,trg)
        optim.zero_grad()
        #output[batch trg_len trg_vocab_size]
        #drop the <sos> position and flatten for the loss
        output=output[:,1:,:].reshape(-1,trg_vocab_size)
        trg=trg[:,1:].reshape(-1)
        loss=criterion(output,trg)
        loss.backward()
        #clip the gradients: rescale them so that their total norm is at most `clip`
        nn.utils.clip_grad_norm_(model.parameters(),clip)
        optim.step()
        lossAll+=loss.item()
    return lossAll/len(data_iter)
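To see what clip_grad_norm_ actually does (it rescales all gradients together so their combined L2 norm is at most the given value, rather than setting individual gradients to 1), here is a tiny standalone illustration, separate from the training loop:
#Minimal illustration of gradient clipping (not part of the training loop)
p=torch.nn.Parameter(torch.ones(3))
(10*p.sum()).backward()                      #p.grad = [10, 10, 10], norm ≈ 17.3
nn.utils.clip_grad_norm_([p],max_norm=1.0)   #rescales the gradient in place
print(p.grad.norm())                         #≈ 1.0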
def evaluate(model,data_iter,criterion):
    model.eval()
    lossAll=0
    with torch.no_grad():
        for example in data_iter:
            src=example.src.permute(1,0)
            trg=example.trg.permute(1,0)
            output=model(src,trg,teach_threshold=0)  #no teacher forcing during evaluation
            #output[batch trg_len trg_vocab_size]
            output=output[:,1:,:].reshape(-1,trg_vocab_size)
            trg=trg[:,1:].reshape(-1)
            loss=criterion(output,trg)
            lossAll+=loss.item()
    return lossAll/len(data_iter)
for epoch in range(epochs):
    start_time = time.time()
    train_loss = train(model,train_iter, optim, criterion,clip=1)
    valid_loss = evaluate(model,val_iter,criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')