Shortcomings of Seq2Seq
The encoder has to allow for the maximum sequence length, so every input vector must be padded out to that maximum length.
If an even longer input arrives, the model parameters have to change, which is not flexible.
The attention structure
Weighted sum
Each word (or character) vector gets a weight score. Multiply every word vector by its score (the score can be broadcast or explicitly expanded), which yields one scaled vector per word; summing these along axis=1 gives a single vector that represents the whole sequence.
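A tiny worked example of this weighted sum (the numbers are hypothetical):

import numpy as np

hs = np.array([[[1.0, 2.0], [3.0, 4.0]]])    # (N=1, T=2, H=2): two word vectors
a = np.array([[0.7, 0.3]])                   # one weight per word
c = (hs * a.reshape(1, 2, 1)).sum(axis=1)    # broadcast the weights, sum along axis=1
print(c)                                     # [[1.6 2.6]] = 0.7*[1,2] + 0.3*[3,4]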
import numpy as np

# Define the weighted-sum layer
class weighted_sum:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None  # cache keeps hs and ar, which are needed in the backward pass
    # carry out the weighted sum
    def forward(self, hs, a):
        N, T, H = hs.shape
        ar = a.reshape(N, T, 1).repeat(H, axis=2)
        t = hs * ar
        t = np.sum(t, axis=1)  # sum over the time axis, i.e. over the T word vectors
        self.cache = (hs, ar)
        return t
    def backward(self, dt):
        hs, ar = self.cache
        N, T, H = hs.shape
        dt = dt.reshape(N, 1, H).repeat(T, axis=1)  # the backward of sum is repeat
        dhs = dt * ar
        dar = dt * hs
        da = np.sum(dar, axis=2)  # the backward of repeat is sum
        return dhs, da
Point to watch in the backward pass: the backward of sum is repeat, and the backward of repeat is sum.
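A minimal sketch of this sum/repeat duality (the shapes are hypothetical):

import numpy as np

x = np.random.randn(2, 3, 4)                  # (N, T, H)
y = x.sum(axis=1)                             # forward: sum over axis=1 -> (N, H)
dy = np.ones_like(y)                          # upstream gradient for y
dx = dy.reshape(2, 1, 4).repeat(3, axis=1)    # backward: repeat dy along axis=1 -> same shape as x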
Computing the word weights (the similarity between the encoder's output vectors and the decoder LSTM's output vector)
Take the dot product of every encoder hidden-state vector with the decoder LSTM's output state; each dot product gives a score, and the more similar the two vectors are, the higher the score.
Softmax maps these scores into [0, 1] (and makes them sum to 1), which gives a more direct picture of how relevant each word is.
# Compute the attention weights: how similar each encoder output is to the decoder output
class attention_weight:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.softmax = Softmax()  # the Softmax layer defined in the earlier chapters
    def forward(self, hs, h):
        N, T, H = hs.shape
        hr = h.reshape(N, 1, H).repeat(T, axis=1)
        t = hr * hs
        s = np.sum(t, axis=2)  # dot product of h with each hs[:, t, :]
        a = self.softmax.forward(s)
        self.cache = (hs, hr)
        return a
    def backward(self, da):
        hs, hr = self.cache
        N, T, H = hs.shape
        ds = self.softmax.backward(da)
        ds = ds.reshape(N, T, 1).repeat(H, axis=2)
        dhs = ds * hr
        dhr = ds * hs
        dh = np.sum(dhr, axis=1)
        return dhs, dh
The attention layer
The attention layer is composed of the weighted-sum layer and the attention-weight (similarity) layer.
# Combine the weighted-sum layer and the attention_weight layer into the attention layer
class attention_layer:
    def __init__(self):
        self.weight_sum_layer = weighted_sum()
        self.attention_weight_layer = attention_weight()
        self.attention_weight = None  # keeps the output of attention_weight for later inspection
        self.params, self.grads = [], []
    def forward(self, hs, h):  # hs: the encoder's hidden states, h: the decoder LSTM's output
        a = self.attention_weight_layer.forward(hs, h)
        output = self.weight_sum_layer.forward(hs, a)
        self.attention_weight = a
        return output
    def backward(self, dout):
        dh0, da = self.weight_sum_layer.backward(dout)
        dh1, dh = self.attention_weight_layer.backward(da)
        dhs = dh0 + dh1  # hs feeds both sub-layers, so its gradients are summed
        return dhs, dh
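A quick shape check of the attention layer (a minimal sketch; the sizes are hypothetical, and the Softmax stand-in below only mimics the forward/backward interface assumed from the earlier chapters):

import numpy as np

class Softmax:  # minimal stand-in for the book's Softmax layer (assumption)
    def forward(self, x):
        x = x - x.max(axis=-1, keepdims=True)
        self.out = np.exp(x) / np.exp(x).sum(axis=-1, keepdims=True)
        return self.out
    def backward(self, dout):
        dx = self.out * dout
        dx -= self.out * dx.sum(axis=-1, keepdims=True)
        return dx

N, T, H = 2, 4, 3                              # hypothetical batch size, time steps, hidden size
hs = np.random.randn(N, T, H)                  # all encoder hidden states
h = np.random.randn(N, H)                      # one decoder LSTM output
layer = attention_layer()
c = layer.forward(hs, h)
print(c.shape)                                 # (2, 3): one context vector per sample
print(layer.attention_weight.sum(axis=1))      # each row of attention weights sums to 1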
The TimeAttention layer
It bundles one attention layer per decoder time step; in the decoder it sits between the TimeLSTM and TimeAffine layers.
# Create the TimeAttention layer
class TimeAttention:
    def __init__(self):
        self.params, self.grads = [], []
        self.layers = None
        self.attention_weight_layer = None
    def forward(self, hs_encoder, h_decoder):  # each single h has shape (N, H); the time-series input has shape (N, T, H)
        N, T, H = h_decoder.shape
        # array that collects the context vector of every time step
        output = np.empty_like(h_decoder)
        self.layers = []
        self.attention_weight_layer = []
        for t in range(T):
            layer = attention_layer()
            output[:, t, :] = layer.forward(hs_encoder, h_decoder[:, t, :])
            self.layers.append(layer)
            self.attention_weight_layer.append(layer.attention_weight_layer)
        return output
    def backward(self, dout):
        # the gradient returned to the encoder is the sum of dhs over all attention layers;
        # the gradient returned to the decoder stacks each layer's output at its time step t
        N, T, H = dout.shape
        dhs = 0
        dh = np.empty_like(dout)  # (N, T, H)
        for t in range(T):
            layer = self.layers[t]
            dhst, dht = layer.backward(dout[:, t, :])  # gradient flowing out of time step t
            dhs += dhst
            dh[:, t, :] = dht
        return dhs, dh
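A small check that the encoder and decoder sequence lengths do not have to match (continuing from the shape check above; the sizes are hypothetical):

import numpy as np

N, T_enc, T_dec, H = 2, 5, 3, 4                    # hypothetical sizes
hs_encoder = np.random.randn(N, T_enc, H)          # encoder hidden states
h_decoder = np.random.randn(N, T_dec, H)           # decoder LSTM outputs
time_attention = TimeAttention()
c = time_attention.forward(hs_encoder, h_decoder)
print(c.shape)                                     # (2, 3, 4): one context vector per decoder step
dhs, dh = time_attention.backward(np.ones_like(c))
print(dhs.shape, dh.shape)                         # (2, 5, 4) (2, 3, 4)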
Note for the backward pass: the decoder LSTM's hidden-state output is fed to both the attention layer and the Affine layer, so the gradients coming back from those two branches must be added together.
Implementing Seq2Seq with attention
For the encoder, the only change is that instead of returning just the final time step's hidden state it now returns the hidden states of all time steps.
# Implement attention_encoder
# It returns the hidden-state vectors of all time steps
class attention_encoder(Encoder):  # Encoder is the class from the previous chapter
    def forward(self, xs):
        out = self.embed.forward(xs)
        hs = self.lstm.forward(out)
        return hs  # the previous chapter returned hs[:, -1, :]
    def backward(self, dout):
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        return dout
For the decoder, an attention layer is inserted between the LSTM and the Affine layer.
The LSTM's initial state is the encoder's final hidden state, while the attention layer receives all of the encoder's hidden states.
The decoder LSTM's output goes to both the attention layer and the Affine layer, so the Affine layer's input has to be assembled with np.concatenate.
class Attention_Decoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        # initialize the weights
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')  # 2*H because the Affine input is [context; hidden]
        affine_b = np.zeros(V).astype('f')
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, True)  # stateful TimeLSTM from the earlier chapters
        self.embed = TimeEmbedding(embed_w)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        self.layers = [self.embed, self.lstm, self.attention, self.affine]
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads
    def forward(self, xs, encoder_hs):
        h = encoder_hs[:, -1]
        self.lstm.set_state(h)  # hand over the encoder's final hidden state
        xs = self.embed.forward(xs)
        decoder_hs = self.lstm.forward(xs)  # embed the input, then run it through the LSTM
        # the attention layer sees all encoder states together with the decoder LSTM outputs
        c = self.attention.forward(encoder_hs, decoder_hs)
        out = np.concatenate((c, decoder_hs), axis=2)
        out = self.affine.forward(out)
        return out
    def backward(self, dout):
        dout = self.affine.backward(dout)
        N, T, H2 = dout.shape
        H = H2 // 2
        dc, ddecoder_h0 = dout[:, :, :H], dout[:, :, H:]  # split, because the Affine input was the concatenation of c and decoder_hs
        dencoder_h, ddecoder_h1 = self.attention.backward(dc)
        ddecoder_h = ddecoder_h0 + ddecoder_h1  # decoder_hs fed two branches, so the gradients are summed
        dout = self.lstm.backward(ddecoder_h)
        dh = self.lstm.dh
        dencoder_h[:, -1] += dh  # because the LSTM only received the encoder's final hidden state
        self.embed.backward(dout)
        return dencoder_h  # gradient with respect to the encoder's hidden states
    # Generation: at every step the word with the highest predicted score is taken as the next word
    def generate(self, encoder_hs, start_id, sample_size):
        sample_id = start_id
        sample_ids = []
        h = encoder_hs[:, -1]  # the encoder's final hidden state initializes the LSTM
        self.lstm.set_state(h)
        for i in range(sample_size):
            x = np.array(sample_id).reshape(1, 1)
            out = self.embed.forward(x)
            decoder_hs = self.lstm.forward(out)
            c = self.attention.forward(encoder_hs, decoder_hs)
            out = np.concatenate((c, decoder_hs), axis=2)
            score = self.affine.forward(out)
            sample_id = int(np.argmax(score.flatten()))  # feed the sampled word id into the next step
            sample_ids.append(sample_id)
        return sample_ids
Finally, connecting the encoder and the decoder gives the Seq2Seq model with attention.
# Define the seq2seq class with the attention mechanism
class attention_seq2seq(Seq2seq):  # Seq2seq from the previous chapter provides forward/backward/generate
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        args = vocab_size, wordvec_size, hidden_size
        self.encoder = attention_encoder(*args)
        self.decoder = Attention_Decoder(*args)
        self.softmax = TimeSoftmaxWithLoss()
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
The softmax here (TimeSoftmaxWithLoss) is used to compute the loss.
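A minimal usage sketch (assuming the Seq2seq base class from the previous chapter exposes forward(xs, ts) returning the loss, backward() filling the gradients, and generate(xs, start_id, sample_size), and that the Time* layers from the earlier chapters are available; all sizes below are hypothetical):

import numpy as np

vocab_size, wordvec_size, hidden_size = 13, 16, 128  # hypothetical sizes
model = attention_seq2seq(vocab_size, wordvec_size, hidden_size)

xs = np.random.randint(0, vocab_size, (2, 7))  # batch of 2 source sequences of length 7
ts = np.random.randint(0, vocab_size, (2, 5))  # corresponding target sequences of length 5

loss = model.forward(xs, ts)   # cross-entropy loss from TimeSoftmaxWithLoss
model.backward()               # gradients accumulate in model.grads
guess = model.generate(xs[:1], int(ts[0, 0]), sample_size=4)  # greedy decoding for one sample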