Attention

Data Preparation

import torch
import spacy
from torchtext.data import Field,BucketIterator
from torchtext.datasets import Multi30k


#load the spaCy language models and build the tokenizers
de_seq=spacy.load("de_core_news_sm")
en_seq=spacy.load("en_core_web_sm")
 

 

def src_token(text):
    return [word.text for word in de_seq.tokenizer(text)]
 
def trg_token(text):
    return [word.text for word in en_seq.tokenizer(text)]
 

#preprocessing fields for the source (German) and target (English) sides
SRC=Field(tokenize=src_token,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)
 
TRG=Field(tokenize=trg_token,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)
#build the datasets and the vocabularies
train_data,val_data,test_data=Multi30k.splits(exts=(".de",".en"),
                                             fields=(SRC,TRG))
 
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)
#build the iterators
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size=128
 
train_iter,val_iter,test_iter=BucketIterator.splits(
    (train_data,val_data,test_data),
    batch_size=batch_size,
    device=device
)
#quick test: grab one batch from the training iterator
for example in train_iter:
    break
print('1: src device, trg device, batch size:',example.src.device,example.trg.device,len(example))
print('2: src shape, trg shape:',example.src.shape,example.trg.shape)
print('3: src data:',example.src)
print('4: trg data:',example.trg)

Result:

1: src device, trg device, batch size: cuda:0 cuda:0 128
2: src shape, trg shape: torch.Size([31, 128]) torch.Size([35, 128])
3: src data: tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,  15,   5,  ...,   5,   8,  18],
        [ 13, 243,   0,  ..., 431,  16,   0],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
4: trg data: tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    7,    4,  ...,    4,    4,   16],
        [   9,   26, 4522,  ...,   64,   14,    0],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
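In the printouts above every column starts with index 2 and is padded with index 1. A quick check of what those indices correspond to (my own addition, assuming torchtext's default ordering of the special tokens):

print(SRC.vocab.itos[:4])        #expected: ['<unk>', '<pad>', '<sos>', '<eos>']
print(TRG.vocab.stoi['<pad>'])   #expected: 1 (used later as padding_idx and ignore_index)
print(len(SRC.vocab),len(TRG.vocab))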

Model Construction

#for testing, move the batch dimension to the front (batch-first layout)
src=example.src.permute(1,0)
trg=example.trg.permute(1,0)


#start building the model
import torch.nn as nn
class Encoder(nn.Module):
    def __init__(self,src_vocab_size,emb_size,enc_hidden_size,dec_hidden_size,dropout=0.5):
        super(Encoder,self).__init__()
        #instantiate the layers
        self.embeding=nn.Embedding(src_vocab_size,emb_size,padding_idx=1) #the <pad> token has index 1 in the torchtext vocab
        self.rnn=nn.GRU(emb_size,enc_hidden_size,bidirectional=True,batch_first=True)
        #bidirectional=True makes the GRU read the sequence in both directions
        self.fc=nn.Linear(2*enc_hidden_size,dec_hidden_size)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,src):
        #src: [batch, src_len]
        src=self.dropout(self.embeding(src)) #after embedding and dropout: [batch, src_len, emb_size]
        
        outputs,f_n=self.rnn(src) #run the source through the bidirectional GRU
        #outputs: [batch, src_len, enc_hidden_size*2]  f_n: [2, batch, enc_hidden_size]
        
        #split the final hidden state into its two directions (only needed because the encoder is bidirectional)
        forward_hidden=f_n[-2,:,:]  #[batch, enc_hidden_size]
        backward_hidden=f_n[-1,:,:] #[batch, enc_hidden_size]
        
        #concatenate the forward and backward hidden states
        f_b_hidden=torch.cat((forward_hidden,backward_hidden),dim=1)
        #f_b_hidden: [batch, enc_hidden_size*2]
        
        #project down to the decoder hidden size, then add a leading dimension
        f_n=self.fc(f_b_hidden).unsqueeze(0)
        #f_n: [1, batch, dec_hidden_size]
        
        return outputs,f_n
src_vocab_size=len(SRC.vocab)
trg_vocab_size=len(TRG.vocab)
 
emb_size=256
enc_hidden_size=512
dec_hidden_size=512

#quick test
enModel=Encoder(src_vocab_size,emb_size,enc_hidden_size,dec_hidden_size).to(device)
enOutputs,h_n=enModel(src)
print('outputs shape, h_n shape:',enOutputs.shape,h_n.shape)

Result:

outputs shape, h_n shape: torch.Size([128, 31, 1024]) torch.Size([1, 128, 512])
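The 1024 in enOutputs comes from concatenating the two directions (2*enc_hidden_size), and f_n[-2]/f_n[-1] are the final forward and backward hidden states. A standalone toy check of this layout (my own sketch, with made-up sizes):

rnn=nn.GRU(input_size=8,hidden_size=16,bidirectional=True,batch_first=True)
x=torch.randn(4,10,8)   #[batch, seq_len, input_size]
out,h=rnn(x)
print(out.shape)        #torch.Size([4, 10, 32]) -> both directions concatenated on the last dim
print(h.shape)          #torch.Size([2, 4, 16])  -> h[-2] is the forward final state, h[-1] the backward
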
class Attention(nn.Module):
    def __init__(self,enc_hidden_size,dec_hidden_size):
        super(Attention,self).__init__()
        
        self.attn=nn.Linear(enc_hidden_size*2+dec_hidden_size,dec_hidden_size)
        self.v=nn.Linear(dec_hidden_size,1,bias=False)
    
    def forward(self,encode_outputs,h_n):
        #encode_outputs: [batch, src_len, 2*enc_hidden_size]
        #h_n: [1, batch, dec_hidden_size]
        src_len=encode_outputs.shape[1] #length of the source sentence
        
        #repeat expands along the given dimensions, e.g. repeat(2,3) doubles dim 0 and triples dim 1
        #reference: https://blog.csdn.net/tequila53/article/details/119183678
        h_n = h_n.permute(1,0,2).repeat(1,src_len,1) #copy the hidden state src_len times so it can be scored against every encoder output
        #h_n: [batch, src_len, dec_hidden_size]  encode_outputs: [batch, src_len, 2*enc_hidden_size]
        inputs=torch.cat((encode_outputs,h_n),dim=2) #concatenate along the feature dimension
        #inputs: [batch, src_len, 2*enc_hidden_size+dec_hidden_size]
        
        energy = self.attn(inputs) #attention energies
        #energy: [batch, src_len, dec_hidden_size]
        
        scores=self.v(energy).squeeze(2) #drop the trailing size-1 dimension
        #scores: [batch, src_len]
        
        return nn.functional.softmax(scores,dim=1)
attnModel=Attention(enc_hidden_size,dec_hidden_size).to(device)
a = attnModel(enOutputs,h_n)
print('attention weights shape:',a.shape)
print(a.sum(dim=1))
#sum(dim=1) sums over the source positions, so every row of attention weights should sum to 1
#reference: https://blog.csdn.net/weixin_45281949/article/details/103282148

Output:

attention weights shape: torch.Size([128, 31])
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000], device='cuda:0', grad_fn=<SumBackward1>)
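Written out as a formula (my own summary of what the code computes, with $s$ the decoder hidden state, $H_i$ the encoder output at source position $i$, $W$ = self.attn and $v$ = self.v):

$$e_i = v^\top \, W[H_i;\,s], \qquad \alpha_i = \frac{\exp(e_i)}{\sum_j \exp(e_j)}$$

Classic Bahdanau additive attention additionally applies a tanh to $W[H_i; s]$ before multiplying by $v$; the code above leaves that nonlinearity out.
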
class Decoder(nn.Module):
    def __init__(self,trg_vocab_size,emb_size,en_hidden_size,dec_hidden_size,attention,dropout=0.5):
        super(Decoder,self).__init__()
        
        #store the attention module and instantiate the layers
        self.attn=attention
        self.emb=nn.Embedding(trg_vocab_size,emb_size)
        self.rnn=nn.GRU(emb_size+en_hidden_size*2,dec_hidden_size,batch_first=True)
        self.fn=nn.Linear(emb_size+en_hidden_size*2+dec_hidden_size,trg_vocab_size)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,input_trg,h_n,en_outputs):
        #input_trg: [batch]
        #h_n: [1, batch, dec_hidden_size]
        #en_outputs: [batch, src_len, en_hidden_size*2]
        
        #embed the current target token
        input_trg=input_trg.unsqueeze(1)
        #input_trg: [batch, 1]
        input_trg=self.dropout(self.emb(input_trg))
        #input_trg: [batch, 1, emb_size]
        
        #compute attention weights over the encoder outputs, then the weighted context vector
        a=self.attn(en_outputs,h_n)
        #a: [batch, src_len]
        a=a.unsqueeze(1)
        #a: [batch, 1, src_len]
        weighted=torch.bmm(a,en_outputs)
        #weighted: [batch, 1, en_hidden_size*2]
        #bmm multiplies two batches of matrices: https://blog.csdn.net/sinat_34328764/article/details/105155644
        
        #concatenate the embedded token with the weighted context
        concat = torch.cat((input_trg,weighted),dim=2)
        #concat: [batch, 1, en_hidden_size*2+emb_size]
        
        #run one step of the (unidirectional) decoder GRU;
        #h_n is already [1, batch, dec_hidden_size], so it is passed in unchanged
        output,h_n=self.rnn(concat,h_n)
        #output: [batch, 1, dec_hidden_size], h_n: [1, batch, dec_hidden_size]
        
        #combine concat and output, then drop the length-1 time dimension
        concat=torch.cat((concat,output),dim=2)
        #concat: [batch, 1, en_hidden_size*2+emb_size+dec_hidden_size]
        concat=concat.squeeze(1)
        #concat: [batch, en_hidden_size*2+emb_size+dec_hidden_size]
        
        #final linear layer producing logits over the target vocabulary
        output=self.fn(concat)
        #output: [batch, trg_vocab_size]
        return output,h_n

A note on the error originally hit at this step:

RuntimeError: For batched 3-D input, hx should also be 3-D but got 4-D tensor

The encoder already returns the hidden state with shape [1, batch, dec_hidden_size]. Calling unsqueeze(0) on it again before the decoder GRU turns it into a 4-D tensor, which triggers this error; the decoder above therefore keeps the hidden state 3-D throughout and passes it to the GRU unchanged.
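A minimal standalone illustration of the shape rule nn.GRU enforces here (my own sketch, with made-up sizes):

#for a batched 3-D input, the initial hidden state must be [num_layers*num_directions, batch, hidden_size]
rnn=nn.GRU(input_size=8,hidden_size=16,batch_first=True)
x=torch.randn(4,1,8)       #[batch, seq_len=1, input_size]
h0=torch.zeros(1,4,16)     #[1, batch, hidden_size] -> accepted
out,h1=rnn(x,h0)
#rnn(x,h0.unsqueeze(0))    #4-D hidden state -> raises the RuntimeError above
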
import random

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder=encoder
        self.decoder=decoder
    
    def forward(self,src,trg,teach_threshold=0.5):
        #src: [batch, src_len]
        #trg: [batch, trg_len]
        #get en_outputs and h_n from the encoder
        en_outputs,h_n=self.encoder(src)
        #en_outputs: [batch, src_len, enc_hidden_size*2]
        #h_n: [1, batch, dec_hidden_size]
        
        batch=trg.shape[0]
        trg_len=trg.shape[1]
        
        #buffer for the decoder outputs; position 0 (the <sos> step) is never filled
        outputs=torch.zeros(batch,trg_len,trg_vocab_size).to(device)
        inputs=trg[:,0] #start decoding from the <sos> token
        for t in range(1,trg_len):
            output,h_n=self.decoder(inputs,h_n,en_outputs)
            #output: [batch, trg_vocab_size]
            #h_n: [1, batch, dec_hidden_size]
            outputs[:,t,:]=output
            
            #teacher forcing: with probability teach_threshold feed the ground-truth token,
            #otherwise feed the model's own prediction
            probabilty=random.random()
            inputs=trg[:,t] if probabilty<teach_threshold else output.argmax(1)
        return outputs
#instantiate the decoder and the full seq2seq model, then run one forward pass
deModel=Decoder(trg_vocab_size,emb_size,enc_hidden_size,dec_hidden_size,attnModel).to(device)
model=Seq2Seq(enModel,deModel).to(device)
outputs=model(src,trg)
outputs[0,0,:],outputs[0,1,:]
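A quick sanity check on that result (my own addition): the decoding loop starts at t=1, so step 0 of outputs stays all zeros while step 1 already holds real logits.

print(torch.all(outputs[0,0,:]==0))   #expected: tensor(True, device='cuda:0')
print(outputs.shape)                  #[batch, trg_len, trg_vocab_size]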

Training the Model

import time,math
from torch.optim import Adam

epochs=10
optim=Adam(model.parameters())
criterion=nn.CrossEntropyLoss(ignore_index=1) #index 1 is <pad>, so padded positions are excluded from the loss

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def train(model,data_iter,optim,criterion,clip):
    model.train()
    lossAll=0
    
    for example in data_iter:
        src=example.src.permute(1,0)
        trg=example.trg.permute(1,0)
        
        output=model(src,trg)
        optim.zero_grad()
        #output[batch trg_len trg_vocab_size]
        output=output[:,1:,:].reshape(-1,trg_vocab_size)
        trg=trg[:,1:].reshape(-1)
        loss=criterion(output,trg)
        loss.backward()
        #rescale the gradients so that their total norm does not exceed clip
        nn.utils.clip_grad_norm_(model.parameters(),clip)
        optim.step()
        lossAll+=loss.item()
    return lossAll/len(data_iter)


def evaluate(model,data_iter,criterion):
    model.eval()
    lossAll=0
    
    with torch.no_grad():
        for example in data_iter:
            src=example.src.permute(1,0)
            trg=example.trg.permute(1,0)
 
            output=model(src,trg)
            #output[batch trg_len trg_vocab_size]
            output=output[:,1:,:].reshape(-1,trg_vocab_size)
            trg=trg[:,1:].reshape(-1)
            loss=criterion(output,trg)
            lossAll+=loss.item()
    return lossAll/len(data_iter)

for epoch in range(epochs):
    start_time = time.time()
    train_loss = train(model,train_iter, optim, criterion,clip=1)
    valid_loss = evaluate(model,val_iter,criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
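
After training, the held-out test split built earlier can be scored the same way (a small addition reusing the evaluate function above):

test_loss=evaluate(model,test_iter,criterion)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')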
