The main focus here is the attention modules the work proposes.
①
Inputs
##### SelfAttentionLayer
The input is a single modality, either q [128, 15, 512] or v [128, 80, 512].
attn_mask = padding_mask_k(input, input)
softmax_mask = padding_mask_q(input, input)
##### SingleAttentionLayer
Takes the two modalities, q and v, as input.
attn_mask = padding_mask_k(q, v)
softmax_mask = padding_mask_q(q, v)
After assignment, k and v are the same modality; here q is [128, 15, 512] while k and v are [128, 80, 512].
##### CoAttentionLayer & CoConcatAttentionLayer & CoSiameseAttentionLayer
Compared with SingleAttentionLayer, these additionally compute the reverse-direction masks:
attn_mask_ = padding_mask_k(v, q)
softmax_mask_ = padding_mask_q(v, q)
Only the two modalities are passed in, assigned to q and v respectively (no separate k).
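The padding_mask_k / padding_mask_q helpers are used throughout but not shown here. A minimal sketch of what they plausibly compute, assuming padded time steps are all-zero feature vectors (this convention and the exact implementation are assumptions, not taken from the repo):

```python
import torch

def padding_mask_k(q, k):
    # True wherever a *key* position is padding; shape [B, len_q, len_k],
    # suitable for masked_fill on the attention scores before the softmax.
    # Assumes padded steps are all-zero feature vectors.
    pad = k.abs().sum(dim=-1).eq(0)                    # [B, len_k]
    return pad.unsqueeze(1).expand(-1, q.size(1), -1)  # [B, len_q, len_k]

def padding_mask_q(q, k):
    # True wherever a *query* position is padding; shape [B, len_q, len_k],
    # used to zero out whole attention rows after the softmax.
    pad = q.abs().sum(dim=-1).eq(0)                    # [B, len_q]
    return pad.unsqueeze(2).expand(-1, -1, k.size(1))  # [B, len_q, len_k]
```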
Inside the layer
Single attention
if attn_mask is None or softmax_mask is None:
    attn_mask = padding_mask_k(q, k)
    softmax_mask = padding_mask_q(q, k)
# linear projection
k = self.linear_k(k)
v = self.linear_v(v)
q = self.linear_q(q)
scale = k.size(-1)**-0.5
##### SelfAttentionLayer
attention = torch.bmm(q, k.transpose(1, 2))
##### SingleAttentionLayer
attention = torch.bmm(q, k.transpose(-2, -1))
Since k here is always 3-dimensional, these two transposes are effectively the same operation.
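A quick sanity check that the two transposes agree on a 3-D tensor:

```python
import torch

k = torch.randn(128, 80, 512)
assert torch.equal(k.transpose(1, 2), k.transpose(-2, -1))  # identical for 3-D tensors
```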
if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)
attention = attention.masked_fill(softmax_mask, 0.)
output = torch.bmm(attention, v)
output = self.linear_final(output)
output = self.layer_norm(output + residual)
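Putting the single-attention forward pass together with the shapes quoted above (a standalone sketch with dummy tensors and all-False masks; the linear projections are omitted for brevity):

```python
import numpy as np
import torch

B, Lq, Lk, D = 128, 15, 80, 512
q, k, v = torch.randn(B, Lq, D), torch.randn(B, Lk, D), torch.randn(B, Lk, D)
attn_mask = torch.zeros(B, Lq, Lk, dtype=torch.bool)     # True = padded key position
softmax_mask = torch.zeros(B, Lq, Lk, dtype=torch.bool)  # True = padded query position

scale = k.size(-1) ** -0.5
attention = torch.bmm(q, k.transpose(1, 2)) * scale      # [128, 15, 80]
attention = attention.masked_fill(attn_mask, -np.inf)
attention = torch.softmax(attention, dim=-1)
attention = attention.masked_fill(softmax_mask, 0.)
output = torch.bmm(attention, v)                          # [128, 15, 512]
```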
Co-attention (both directions)
# likewise, fill in the reverse-direction masks if they were not provided
if attn_mask_ is None or softmax_mask_ is None:
    attn_mask_ = padding_mask_k(video, question)
    softmax_mask_ = padding_mask_q(video, question)
# only four projections are needed here: a query head for the question, a key head for the video, and a value head for each modality
question_q = self.linear_question(question)
video_k = self.linear_video(video)
question = self.linear_v_question(question)
video = self.linear_v_video(video)
scale = video.size(-1)**-0.5
attention_qv = torch.bmm(question_q, video_k.transpose(1, 2))
if scale is not None:
    attention_qv = attention_qv * scale
if attn_mask is not None:
    attention_qv = attention_qv.masked_fill(attn_mask, -np.inf)
attention_qv = self.softmax(attention_qv)
attention_qv = attention_qv.masked_fill(softmax_mask, 0.)
# then compute attention once more in the reverse direction (video attending to the question)
attention_vq = torch.bmm(video_k, question_q.transpose(1, 2))
if scale is not None:
    attention_vq = attention_vq * scale
if attn_mask_ is not None:
    attention_vq = attention_vq.masked_fill(attn_mask_, -np.inf)
attention_vq = self.softmax(attention_vq)
attention_vq = attention_vq.masked_fill(softmax_mask_, 0.)
##### CoAttentionLayer
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(output_qv)
output_q = self.layer_norm_qv(output_qv + q)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(output_vq)
output_v = self.layer_norm_vq(output_vq + v)
##### CoConcatAttentionLayer differs from CoAttentionLayer in that concatenation before the final linear layer replaces the residual connection after it
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)
##### CoSiameseAttentionLayer differs from CoConcatAttentionLayer in that a single shared linear layer is used for both directions, mapping them into a common space
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)
return output_q, output_v
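Side by side, the three variants differ only in how the attended context is merged back into each modality. A sketch for the question direction (the layer names and the 2*D input size of the concat projections are assumptions; the video direction is symmetric):

```python
import torch
import torch.nn as nn

D = 512
output_qv = torch.randn(128, 15, D)  # question-attended video context
q = torch.randn(128, 15, D)          # question representation, residual / concat partner
layer_norm_qv = nn.LayerNorm(D)

# CoAttentionLayer: project the context, then residual add + layer norm
linear_final_qv = nn.Linear(D, D)
out_co = layer_norm_qv(linear_final_qv(output_qv) + q)

# CoConcatAttentionLayer: concatenate context and q, project 2*D -> D, no residual
linear_final_cat = nn.Linear(2 * D, D)
out_concat = layer_norm_qv(linear_final_cat(torch.cat((output_qv, q), dim=-1)))

# CoSiameseAttentionLayer: like the concat variant, but this 2*D -> D projection
# is shared with the video direction, mapping both modalities into one space
shared_linear = nn.Linear(2 * D, D)
out_siamese = layer_norm_qv(shared_linear(torch.cat((output_qv, q), dim=-1)))
```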
②
ScaledDotProductAttention VS MaskedScaledDotProductAttention
Inputs: q, k, v
##### ScaledDotProductAttention
attention = torch.matmul(q, k.transpose(1, 2))
##### MaskedScaledDotProductAttention
attention = torch.matmul(q, k.transpose(-2, -1))
# common part
if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)
##### MaskedScaledDotProductAttention additionally applies the line below
attention = attention.masked_fill(softmax_mask, 0.)
attention = self.dropout(attention)
output = torch.matmul(attention, v)
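What the extra post-softmax masked_fill buys: the -inf fill only hides padded keys, so a padded *query* row still leaves the softmax as a valid distribution (or as NaN if every key in the row is masked). Zeroing those rows afterwards gives padded query positions an all-zero context vector. A toy illustration with made-up masks:

```python
import numpy as np
import torch

scores = torch.randn(1, 2, 3)                           # 1 sample, 2 queries, 3 keys
attn_mask = torch.tensor([[[False, False, True],        # key 2 is padding
                           [False, False, True]]])
softmax_mask = torch.tensor([[[False, False, False],
                              [True,  True,  True]]])   # query 1 is padding

attention = torch.softmax(scores.masked_fill(attn_mask, -np.inf), dim=-1)
# the padded query's row still sums to 1 here ...
attention = attention.masked_fill(softmax_mask, 0.)
# ... and is zeroed now, so torch.matmul(attention, v) yields zeros for padded queries
```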
SelfTransformerLayer
if self.position:
    input += self.pos_embedding(input_length)[:, :input.size()[1], :]
attention_mask = padding_mask_k(input, input)
softmax_mask = padding_mask_q(input, input)
MaskedMultiHeadAttention  # all three inputs (query, key, value) are the input above
MaskedPositionalWiseFeedForward
output = self.w2(F.relu(self.w1(x)))
output = self.dropout(output)
output = self.layer_norm(x + output)
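For reference, the feed-forward lines above wrapped into a minimal module (a sketch; the real MaskedPositionalWiseFeedForward constructor arguments, hidden size, and dropout rate are assumptions):

```python
import torch.nn as nn
import torch.nn.functional as F

class PositionWiseFeedForward(nn.Module):
    # position-wise FFN with dropout, residual connection, and layer norm,
    # matching the three lines of the forward pass shown above
    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.1):
        super().__init__()
        self.w1 = nn.Linear(model_dim, ffn_dim)
        self.w2 = nn.Linear(ffn_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)

    def forward(self, x):
        output = self.w2(F.relu(self.w1(x)))
        output = self.dropout(output)
        return self.layer_norm(x + output)
```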
SoftKNN VS MultiHeadAttention VS MaskedMultiHeadAttention
# common part
residual = query
dim_per_head = self.dim_per_head
num_heads = self.num_heads
batch_size = key.size(0)
key = self.linear_k(key)
value = self.linear_v(value)
query = self.linear_q(query)
##### SoftKNN & MultiHeadAttention
key = key.view(batch_size * num_heads, -1, dim_per_head)
value = value.view(batch_size * num_heads, -1, dim_per_head)
query = query.view(batch_size * num_heads, -1, dim_per_head)
if attn_mask is not None:
    attn_mask = attn_mask.repeat(num_heads, 1, 1)
scale = (key.size(-1) // num_heads)**-0.5
context, attention = self.dot_product_attention(
    query, key, value, scale, attn_mask)
context = context.view(batch_size, -1, dim_per_head * num_heads)
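Concretely, this branch folds the heads into the batch dimension, so the dot-product attention and the mask operate on batch_size * num_heads "samples" (a shape-only illustration with made-up sizes):

```python
import torch

batch_size, num_heads, dim_per_head, seq_len = 2, 8, 64, 10
key = torch.randn(batch_size, seq_len, num_heads * dim_per_head)

folded = key.view(batch_size * num_heads, -1, dim_per_head)
print(folded.shape)   # torch.Size([16, 10, 64]) -- heads folded into the batch dimension

attn_mask = torch.zeros(batch_size, seq_len, seq_len, dtype=torch.bool)
print(attn_mask.repeat(num_heads, 1, 1).shape)  # torch.Size([16, 10, 10]) -- mask repeated per head
```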
##### MaskedMultiHeadAttention
key = key.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
value = value.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
query = query.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
if attn_mask is not None:
    attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
if softmax_mask is not None:
    softmax_mask = softmax_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
scale = key.size(-1)**-0.5
context, attention = self.dot_product_attention(
    query, key, value, scale, attn_mask, softmax_mask)
context = context.transpose(1, 2).contiguous().view(
    batch_size, -1, dim_per_head * num_heads)
# The differences: how q/k/v are reshaped, the extra softmax mask passed to the masked dot_product_attention, the scale (written differently; note that key has already been reshaped in both branches, so key.size(-1) is dim_per_head and (dim_per_head // num_heads)**-0.5 only equals dim_per_head**-0.5 when num_heads == 1), and how the final context is put back together
##### MultiHeadAttention & MaskedMultiHeadAttention
output = self.linear_final(context)
output = self.dropout(output)
output = self.layer_norm(residual + output)
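And for the MaskedMultiHeadAttention layout, a small check that the head split and the final merge are inverse operations (sizes made up):

```python
import torch

batch_size, num_heads, dim_per_head, seq_len = 2, 8, 64, 10
key = torch.randn(batch_size, seq_len, num_heads * dim_per_head)

# split: (B, L, H*d) -> (B, H, L, d)
heads = key.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
print(heads.shape)    # torch.Size([2, 8, 10, 64])

# merge back: (B, H, L, d) -> (B, L, H*d), exactly the context reshape above
merged = heads.transpose(1, 2).contiguous().view(batch_size, -1, dim_per_head * num_heads)
assert torch.equal(merged, key)   # split followed by merge is lossless
```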