Shortcomings of Seq2Seq
The encoder has to allow for the maximum sequence length, so every input vector must be padded out to that maximum length.
If an even longer input arrives, the model parameters have to change, which is not flexible.
The attention structure
Weighted sum
Each word (or character) vector gets a weight score. Multiply every word vector by its score (the score can be broadcast or explicitly expanded), which yields one scaled vector per word; summing these along axis=1 gives a single vector that represents the whole sequence.
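A tiny worked example of this weighted sum (the numbers are hypothetical):

import numpy as np

hs = np.array([[[1.0, 2.0], [3.0, 4.0]]])    # (N=1, T=2, H=2): two word vectors
a = np.array([[0.7, 0.3]])                   # one weight per word
c = (hs * a.reshape(1, 2, 1)).sum(axis=1)    # broadcast the weights, sum along axis=1
print(c)                                     # [[1.6 2.6]] = 0.7*[1,2] + 0.3*[3,4]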
import numpy as np

# Define the weighted-sum layer
class weighted_sum:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None  # cache keeps hs and ar, which are needed in the backward pass
    # carry out the weighted sum
    def forward(self, hs, a):
        N, T, H = hs.shape
        ar = a.reshape(N, T, 1).repeat(H, axis=2)
        t = hs * ar
        t = np.sum(t, axis=1)  # sum over the time axis, i.e. over the T word vectors
        self.cache = (hs, ar)
        return t
    def backward(self, dt):
        hs, ar = self.cache
        N, T, H = hs.shape
        dt = dt.reshape(N, 1, H).repeat(T, axis=1)  # the backward of sum is repeat
        dhs = dt * ar
        dar = dt * hs
        da = np.sum(dar, axis=2)  # the backward of repeat is sum
        return dhs, da
Point to watch in the backward pass: the backward of sum is repeat, and the backward of repeat is sum.
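A minimal sketch of this sum/repeat duality (the shapes are hypothetical):

import numpy as np

x = np.random.randn(2, 3, 4)                  # (N, T, H)
y = x.sum(axis=1)                             # forward: sum over axis=1 -> (N, H)
dy = np.ones_like(y)                          # upstream gradient for y
dx = dy.reshape(2, 1, 4).repeat(3, axis=1)    # backward: repeat dy along axis=1 -> same shape as x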
Computing the word weights (the similarity between the encoder's output vectors and the decoder LSTM's output vector)
Take the dot product of every encoder hidden-state vector with the decoder LSTM's output state; each dot product gives a score, and the more similar the two vectors are, the higher the score.
Softmax maps these scores into [0, 1] (and makes them sum to 1), which gives a more direct picture of how relevant each word is.
# Compute the attention weights: how similar each encoder output is to the decoder output
class attention_weight:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.softmax = Softmax()  # the Softmax layer defined in the earlier chapters
    def forward(self, hs, h):
        N, T, H = hs.shape
        hr = h.reshape(N, 1, H).repeat(T, axis=1)
        t = hr * hs
        s = np.sum(t, axis=2)  # dot product of h with each hs[:, t, :]
        a = self.softmax.forward(s)
        self.cache = (hs, hr)
        return a
    def backward(self, da):
        hs, hr = self.cache
        N, T, H = hs.shape
        ds = self.softmax.backward(da)
        ds = ds.reshape(N, T, 1).repeat(H, axis=2)
        dhs = ds * hr
        dhr = ds * hs
        dh = np.sum(dhr, axis=1)
        return dhs, dh
The attention layer
The attention layer is composed of the weighted-sum layer and the attention-weight (similarity) layer.
# Combine the weighted-sum layer and the attention_weight layer into the attention layer
class attention_layer:
    def __init__(self):
        self.weight_sum_layer = weighted_sum()
        self.attention_weight_layer = attention_weight()
        self.attention_weight = None  # keeps the output of attention_weight for later inspection
        self.params, self.grads = [], []
    def forward(self, hs, h):  # hs: the encoder's hidden states, h: the decoder LSTM's output
        a = self.attention_weight_layer.forward(hs, h)
        output = self.weight_sum_layer.forward(hs, a)
        self.attention_weight = a
        return output
    def backward(self, dout):
        dh0, da = self.weight_sum_layer.backward(dout)
        dh1, dh = self.attention_weight_layer.backward(da)
        dhs = dh0 + dh1  # hs feeds both sub-layers, so its gradients are summed
        return dhs, dh
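A quick shape check of the attention layer (a minimal sketch; the sizes are hypothetical, and the Softmax stand-in below only mimics the forward/backward interface assumed from the earlier chapters):

import numpy as np

class Softmax:  # minimal stand-in for the book's Softmax layer (assumption)
    def forward(self, x):
        x = x - x.max(axis=-1, keepdims=True)
        self.out = np.exp(x) / np.exp(x).sum(axis=-1, keepdims=True)
        return self.out
    def backward(self, dout):
        dx = self.out * dout
        dx -= self.out * dx.sum(axis=-1, keepdims=True)
        return dx

N, T, H = 2, 4, 3                              # hypothetical batch size, time steps, hidden size
hs = np.random.randn(N, T, H)                  # all encoder hidden states
h = np.random.randn(N, H)                      # one decoder LSTM output
layer = attention_layer()
c = layer.forward(hs, h)
print(c.shape)                                 # (2, 3): one context vector per sample
print(layer.attention_weight.sum(axis=1))      # each row of attention weights sums to 1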
The TimeAttention layer
It bundles one attention layer per decoder time step; in the decoder it sits between the TimeLSTM and TimeAffine layers.
# Create the TimeAttention layer
class TimeAttention:
    def __init__(self):
        self.params, self.grads = [], []
        self.layers = None
        self.attention_weight_layer = None
    def forward(self, hs_encoder, h_decoder):  # each single h has shape (N, H); the time-series input has shape (N, T, H)
        N, T, H = h_decoder.shape
        # array that collects the context vector of every time step
        output = np.empty_like(h_decoder)
        self.layers = []
        self.attention_weight_layer = []
        for t in range(T):
            layer = attention_layer()
            output[:, t, :] = layer.forward(hs_encoder, h_decoder[:, t, :])
            self.layers.append(layer)
            self.attention_weight_layer.append(layer.attention_weight_layer)
        return output
    def backward(self, dout):
        # the gradient returned to the encoder is the sum of dhs over all attention layers;
        # the gradient returned to the decoder stacks each layer's output at its time step t
        N, T, H = dout.shape
        dhs = 0
        dh = np.empty_like(dout)  # (N, T, H)
        for t in range(T):
            layer = self.layers[t]
            dhst, dht = layer.backward(dout[:, t, :])  # gradient flowing out of time step t
            dhs += dhst
            dh[:, t, :] = dht
        return dhs, dh
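A small check that the encoder and decoder sequence lengths do not have to match (continuing from the shape check above; the sizes are hypothetical):

import numpy as np

N, T_enc, T_dec, H = 2, 5, 3, 4                    # hypothetical sizes
hs_encoder = np.random.randn(N, T_enc, H)          # encoder hidden states
h_decoder = np.random.randn(N, T_dec, H)           # decoder LSTM outputs
time_attention = TimeAttention()
c = time_attention.forward(hs_encoder, h_decoder)
print(c.shape)                                     # (2, 3, 4): one context vector per decoder step
dhs, dh = time_attention.backward(np.ones_like(c))
print(dhs.shape, dh.shape)                         # (2, 5, 4) (2, 3, 4)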
Note for the backward pass: the decoder LSTM's hidden-state output is fed to both the attention layer and the Affine layer, so the gradients coming back from those two branches must be added together.
Implementing Seq2Seq with attention
For the encoder, the only change is that instead of returning just the final time step's hidden state it now returns the hidden states of all time steps.
# Implement attention_encoder
# It returns the hidden-state vectors of all time steps
class attention_encoder(Encoder):  # Encoder is the class from the previous chapter
    def forward(self, xs):
        out = self.embed.forward(xs)
        hs = self.lstm.forward(out)
        return hs  # the previous chapter returned hs[:, -1, :]
    def backward(self, dout):
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        return dout
For the decoder, an attention layer is inserted between the LSTM and the Affine layer.
The LSTM's initial state is the encoder's final hidden state, while the attention layer receives all of the encoder's hidden states.
The decoder LSTM's output goes to both the attention layer and the Affine layer, so the Affine layer's input has to be assembled with np.concatenate.
class Attention_Decoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        # initialize the weights
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')  # 2*H because the Affine input is [context; hidden]
        affine_b = np.zeros(V).astype('f')
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, True)  # stateful TimeLSTM from the earlier chapters
        self.embed = TimeEmbedding(embed_w)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        self.layers = [self.embed, self.lstm, self.attention, self.affine]
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads
    def forward(self, xs, encoder_hs):
        h = encoder_hs[:, -1]
        self.lstm.set_state(h)  # hand over the encoder's final hidden state
        xs = self.embed.forward(xs)
        decoder_hs = self.lstm.forward(xs)  # embed the input, then run it through the LSTM
        # the attention layer sees all encoder states together with the decoder LSTM outputs
        c = self.attention.forward(encoder_hs, decoder_hs)
        out = np.concatenate((c, decoder_hs), axis=2)
        out = self.affine.forward(out)
        return out
    def backward(self, dout):
        dout = self.affine.backward(dout)
        N, T, H2 = dout.shape
        H = H2 // 2
        dc, ddecoder_h0 = dout[:, :, :H], dout[:, :, H:]  # split, because the Affine input was the concatenation of c and decoder_hs
        dencoder_h, ddecoder_h1 = self.attention.backward(dc)
        ddecoder_h = ddecoder_h0 + ddecoder_h1  # decoder_hs fed two branches, so the gradients are summed
        dout = self.lstm.backward(ddecoder_h)
        dh = self.lstm.dh
        dencoder_h[:, -1] += dh  # because the LSTM only received the encoder's final hidden state
        self.embed.backward(dout)
        return dencoder_h  # gradient with respect to the encoder's hidden states
    # Generation: at every step the word with the highest predicted score is taken as the next word
    def generate(self, encoder_hs, start_id, sample_size):
        sample_id = start_id
        sample_ids = []
        h = encoder_hs[:, -1]  # the encoder's final hidden state initializes the LSTM
        self.lstm.set_state(h)
        for i in range(sample_size):
            x = np.array(sample_id).reshape(1, 1)
            out = self.embed.forward(x)
            decoder_hs = self.lstm.forward(out)
            c = self.attention.forward(encoder_hs, decoder_hs)
            out = np.concatenate((c, decoder_hs), axis=2)
            score = self.affine.forward(out)
            sample_id = int(np.argmax(score.flatten()))  # feed the sampled word id into the next step
            sample_ids.append(sample_id)
        return sample_ids
Finally, connecting the encoder and the decoder gives the Seq2Seq model with attention.
# Define the seq2seq class with the attention mechanism
class attention_seq2seq(Seq2seq):  # Seq2seq from the previous chapter provides forward/backward/generate
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        args = vocab_size, wordvec_size, hidden_size
        self.encoder = attention_encoder(*args)
        self.decoder = Attention_Decoder(*args)
        self.softmax = TimeSoftmaxWithLoss()
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
The softmax here (TimeSoftmaxWithLoss) is used to compute the loss.
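A minimal usage sketch (assuming the Seq2seq base class from the previous chapter exposes forward(xs, ts) returning the loss, backward() filling the gradients, and generate(xs, start_id, sample_size), and that the Time* layers from the earlier chapters are available; all sizes below are hypothetical):

import numpy as np

vocab_size, wordvec_size, hidden_size = 13, 16, 128  # hypothetical sizes
model = attention_seq2seq(vocab_size, wordvec_size, hidden_size)

xs = np.random.randint(0, vocab_size, (2, 7))  # batch of 2 source sequences of length 7
ts = np.random.randint(0, vocab_size, (2, 5))  # corresponding target sequences of length 5

loss = model.forward(xs, ts)   # cross-entropy loss from TimeSoftmaxWithLoss
model.backward()               # gradients accumulate in model.grads
guess = model.generate(xs[:1], int(ts[0, 0]), sample_size=4)  # greedy decoding for one sample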