Reading "Reasoning with Heterogeneous Graph Alignment for Video Question Answering": Reproduction Notes (Part 2)


These notes focus on the attention modules proposed in the paper.

Inputs

##### SelfAttentionLayer
The input is a single modality, either q [128, 15, 512] or v [128, 80, 512]:
attn_mask = padding_mask_k(input, input)
softmax_mask = padding_mask_q(input, input)
##### SingleAttentionLayer
Takes the two modalities q and v as input:
attn_mask = padding_mask_k(q, v)
softmax_mask = padding_mask_q(q, v)
After the assignment, k and v come from the same modality: q is [128, 15, 512], while k and v are both [128, 80, 512].
##### CoAttentionLayer & CoConcatAttentionLayer & CoSiameseAttentionLayer
Compared with SingleAttentionLayer, these additionally compute the reverse-direction masks:
attn_mask_ = padding_mask_k(v, q)
softmax_mask_ = padding_mask_q(v, q)
Here only q and v are passed in, one per modality (there is no separate k); a sketch of the two mask helpers is given below.
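
For reference, a minimal sketch of what padding_mask_k and padding_mask_q might look like, assuming padded time steps are all-zero feature vectors and that the first argument is the query-side sequence (both are assumptions; the repo's actual implementation may differ):

import torch

def padding_mask_k(seq_q, seq_k):
    # True where a key position is padding -> filled with -inf before softmax
    len_q = seq_q.size(1)
    pad = seq_k.abs().sum(dim=-1).eq(0)              # [B, L_k]
    return pad.unsqueeze(1).expand(-1, len_q, -1)    # [B, L_q, L_k]

def padding_mask_q(seq_q, seq_k):
    # True where a query position is padding -> its row is zeroed after softmax
    len_k = seq_k.size(1)
    pad = seq_q.abs().sum(dim=-1).eq(0)              # [B, L_q]
    return pad.unsqueeze(2).expand(-1, -1, len_k)    # [B, L_q, L_k]

q = torch.randn(128, 15, 512)   # question features
v = torch.randn(128, 80, 512)   # video features
print(padding_mask_k(q, v).shape)   # torch.Size([128, 15, 80])
print(padding_mask_q(q, v).shape)   # torch.Size([128, 15, 80])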

Inside the layers

Single attention
if attn_mask is None or softmax_mask is None:
    attn_mask = padding_mask_k(q, k)
    softmax_mask = padding_mask_q(q, k)

# linear projection
k = self.linear_k(k)
v = self.linear_v(v)
q = self.linear_q(q)

scale = k.size(-1)**-0.5
##### SelfAttentionLayer
attention = torch.bmm(q, k.transpose(1, 2))
##### SingleAttentionLayer
attention = torch.bmm(q, k.transpose(-2, -1))

Since k only ever has 3 dimensions here, the two transposes are effectively the same.

if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)
attention = attention.masked_fill(softmax_mask, 0.)

output = torch.bmm(attention, v)
output = self.linear_final(output)
output = self.layer_norm(output + residual)
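
A quick shape walk-through with the sizes quoted above; masks, the final linear layer and LayerNorm are omitted, so this only illustrates the bmm shapes, not the module itself:

import torch

B, Lq, Lk, D = 128, 15, 80, 512
q = torch.randn(B, Lq, D)   # question features
k = torch.randn(B, Lk, D)   # video features (k and v share a modality here)
v = torch.randn(B, Lk, D)

scale = k.size(-1) ** -0.5
attention = torch.bmm(q, k.transpose(1, 2)) * scale   # [128, 15, 80]
attention = attention.softmax(dim=-1)
output = torch.bmm(attention, v)                      # [128, 15, 512]
print(attention.shape, output.shape)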

Co-attention
# likewise, build the two reverse-direction masks
if attn_mask_ is None or softmax_mask_ is None:
    attn_mask_ = padding_mask_k(video, question)
    softmax_mask_ = padding_mask_q(video, question)

# only a query head and a value head are needed for each modality
question_q = self.linear_question(question)
video_k = self.linear_video(video)
question = self.linear_v_question(question)
video = self.linear_v_video(video)

scale = video.size(-1)**-0.5

attention_qv = torch.bmm(question_q, video_k.transpose(1, 2))
if scale is not None:
    attention_qv = attention_qv * scale
if attn_mask is not None:
    attention_qv = attention_qv.masked_fill(attn_mask, -np.inf)
attention_qv = self.softmax(attention_qv)
attention_qv = attention_qv.masked_fill(softmax_mask, 0.)

# then compute attention once more in the reverse direction
attention_vq = torch.bmm(video_k, question_q.transpose(1, 2))
if scale is not None:
    attention_vq = attention_vq * scale
if attn_mask_ is not None:
    attention_vq = attention_vq.masked_fill(attn_mask_, -np.inf)
attention_vq = self.softmax(attention_vq)
attention_vq = attention_vq.masked_fill(softmax_mask_, 0.)
##### CoAttentionLayer
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(output_qv)
output_q = self.layer_norm_qv(output_qv + q)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(output_vq)
output_v = self.layer_norm_vq(output_vq + v)
##### CoConcatAttentionLayer: differs from the above in that concatenation before the final linear layer replaces the residual connection after it
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)
##### CoSiameseAttentionLayer: differs from the above in that both directions share the same linear layer, projecting them into a common space
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)

return output_q, output_v
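
The two attention maps run in opposite directions, so each output keeps its own sequence length; the q and v used in the residual connections above are presumably the unprojected question and video inputs saved at the start of forward. A shape sketch with masks, linear layers and LayerNorm omitted:

import torch

B, Lq, Lv, D = 128, 15, 80, 512
question_q = torch.randn(B, Lq, D)   # query head of the question
video_k = torch.randn(B, Lv, D)      # key head of the video
question = torch.randn(B, Lq, D)     # value head of the question
video = torch.randn(B, Lv, D)        # value head of the video

scale = D ** -0.5
attention_qv = (torch.bmm(question_q, video_k.transpose(1, 2)) * scale).softmax(dim=-1)  # [128, 15, 80]
attention_vq = (torch.bmm(video_k, question_q.transpose(1, 2)) * scale).softmax(dim=-1)  # [128, 80, 15]

output_q = torch.bmm(attention_qv, video)      # [128, 15, 512]  question attends to video
output_v = torch.bmm(attention_vq, question)   # [128, 80, 512]  video attends to question
print(output_q.shape, output_v.shape)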

ScaledDotProductAttention VS MaskedScaledDotProductAttention

Inputs: q, k, v

	##### ScaledDotProductAttention
		attention = torch.matmul(q, k.transpose(1, 2))
	##### MaskedScaledDotProductAttention
		attention = torch.matmul(q, k.transpose(-2, -1))

# shared part
if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)

	##### MaskedScaledDotProductAttention additionally applies the line below
		attention = attention.masked_fill(softmax_mask, 0.)

attention = self.dropout(attention)
output = torch.matmul(attention, v)
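
That extra masked_fill after the softmax is what distinguishes the masked variant: it zeroes the rows belonging to padded query positions, which would otherwise still receive a valid attention distribution (or NaN, if every key in that row had been set to -inf). A tiny illustration, using float('-inf') in place of -np.inf:

import torch

# 1 batch, 3 query positions (the last is padding), 4 key positions (the last is padding)
scores = torch.randn(1, 3, 4)
attn_mask = torch.tensor([[[False, False, False, True]] * 3])          # padded key columns
softmax_mask = torch.tensor([[[False] * 4, [False] * 4, [True] * 4]])  # padded query row

attention = scores.masked_fill(attn_mask, float('-inf')).softmax(dim=-1)
print(attention[0, 2])   # the padded query row still carries weights over the real keys
attention = attention.masked_fill(softmax_mask, 0.)
print(attention[0, 2])   # after the second mask it contributes nothing: tensor([0., 0., 0., 0.])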

SelfTransformerLayer

	if self.position:
	    input += self.pos_embedding(input_length)[:, :input.size()[1], :]
	attention_mask = padding_mask_k(input, input)
	softmax_mask = padding_mask_q(input, input)

MaskedMultiHeadAttention  # the three inputs query, key, and value are all the input above
MaskedPositionalWiseFeedForward
	output = self.w2(F.relu(self.w1(x)))
	output = self.dropout(output)
	output = self.layer_norm(x + output)
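
For completeness, a self-contained sketch of that feed-forward block; the forward pass follows the fragment above, while the hidden width (ffn_dim), the dropout rate, and the use of nn.Linear for w1/w2 are assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalWiseFeedForward(nn.Module):
    # Position-wise FFN with residual connection and LayerNorm,
    # paraphrased from the MaskedPositionalWiseFeedForward fragment above.
    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.1):
        super().__init__()
        self.w1 = nn.Linear(model_dim, ffn_dim)
        self.w2 = nn.Linear(ffn_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)

    def forward(self, x):
        output = self.w2(F.relu(self.w1(x)))
        output = self.dropout(output)
        return self.layer_norm(x + output)

ffn = PositionalWiseFeedForward()
print(ffn(torch.randn(128, 15, 512)).shape)   # torch.Size([128, 15, 512])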

SoftKNN VS MultiHeadAttention VS MaskedMultiHeadAttention

# shared part
residual = query

dim_per_head = self.dim_per_head
num_heads = self.num_heads
batch_size = key.size(0)

key = self.linear_k(key)
value = self.linear_v(value)
query = self.linear_q(query)

	##### SoftKNN & MultiHeadAttention
	    key = key.view(batch_size * num_heads,
	    			   -1, 
	    			   dim_per_head)
	    value = value.view(batch_size * num_heads, 
	    				   -1, 
	    				   dim_per_head)
	    query = query.view(batch_size * num_heads, 
	    				   -1, 
	    				   dim_per_head)
	    if attn_mask is not None:
	        attn_mask = attn_mask.repeat(num_heads, 1, 1)
	
	    scale = (key.size(-1) // num_heads)**-0.5
	    context, attention = self.dot_product_attention(
	        query, key, value, scale, attn_mask)
	
	    context = context.view(batch_size, -1, dim_per_head * num_heads)

	##### MaskedMultiHeadAttention 
	    key = key.view(batch_size, 
	    			   -1, 
	    			   num_heads, 
	    			   dim_per_head).transpose(1, 2)
	    value = value.view(batch_size, 
	    				   -1, 
	    				   num_heads,
	                       dim_per_head).transpose(1, 2)
	    query = query.view(batch_size, 
	    				   -1, 
	    				   num_heads,
	                       dim_per_head).transpose(1, 2)
	    if attn_mask is not None:
	        attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
	    if softmax_mask is not None:
	        softmax_mask = softmax_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
	    scale = key.size(-1)**-0.5
	    context, attention = self.dot_product_attention(
	        query, key, value, scale, attn_mask, softmax_mask)
	    context = context.transpose(1, 2).contiguous().view(
	        batch_size, -1, dim_per_head * num_heads)
	# The differences: how q/k/v are reshaped, an extra softmax mask passed to the masked dot_product_attention, a differently written scale (note that once key has been reshaped to dim_per_head on its last axis, (key.size(-1) // num_heads)**-0.5 and key.size(-1)**-0.5 coincide only when num_heads is 1), and how the final context is put back together; see the shape sketch below.
	##### MultiHeadAttention & MaskedMultiHeadAttention
		output = self.linear_final(context)
		output = self.dropout(output)
		output = self.layer_norm(residual + output)
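
A shape sketch of the two head-splitting schemes and the matching ways of merging the context back; num_heads = 8 and dim_per_head = 64 are assumed for 512-dim features:

import torch

batch_size, seq_len = 128, 15
num_heads, dim_per_head = 8, 64
key = torch.randn(batch_size, seq_len, num_heads * dim_per_head)

# SoftKNN / MultiHeadAttention: heads folded into the batch dimension
k_flat = key.view(batch_size * num_heads, -1, dim_per_head)                   # [1024, 15, 64]
context_flat = k_flat.view(batch_size, -1, dim_per_head * num_heads)          # [128, 15, 512]

# MaskedMultiHeadAttention: heads kept as an explicit dimension
k_split = key.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)   # [128, 8, 15, 64]
context_split = k_split.transpose(1, 2).contiguous().view(
    batch_size, -1, dim_per_head * num_heads)                                 # [128, 15, 512]

print(k_flat.shape, k_split.shape)
print(context_flat.shape, context_split.shape)

# the two scale expressions from the code above, evaluated after the reshape
print((k_flat.size(-1) // num_heads) ** -0.5)   # (64 // 8) ** -0.5, about 0.354
print(k_split.size(-1) ** -0.5)                 # 64 ** -0.5 = 0.125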