The main focus here is the attention modules the work proposes.
①
Inputs
##### SelfAttentionLayer
The input is a single modality, either q [128, 15, 512] or v [128, 80, 512].
attn_mask = padding_mask_k(input, input)
softmax_mask = padding_mask_q(input, input)
##### SingleAttentionLayer
Takes the two modalities, q and v, as input.
attn_mask = padding_mask_k(q, v)
softmax_mask = padding_mask_q(q, v)
After assignment, k and v are the same modality; here q is [128, 15, 512] while k and v are [128, 80, 512].
##### CoAttentionLayer & CoConcatAttentionLayer & CoSiameseAttentionLayer
Compared with SingleAttentionLayer, these additionally compute the reverse-direction masks:
attn_mask_ = padding_mask_k(v, q)
softmax_mask_ = padding_mask_q(v, q)
Only the two modalities are passed in, assigned to q and v respectively (no separate k).
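The padding_mask_k / padding_mask_q helpers are used throughout but not shown here. A minimal sketch of what they plausibly compute, assuming padded time steps are all-zero feature vectors (this convention and the exact implementation are assumptions, not taken from the repo):

```python
import torch

def padding_mask_k(q, k):
    # True wherever a *key* position is padding; shape [B, len_q, len_k],
    # suitable for masked_fill on the attention scores before the softmax.
    # Assumes padded steps are all-zero feature vectors.
    pad = k.abs().sum(dim=-1).eq(0)                    # [B, len_k]
    return pad.unsqueeze(1).expand(-1, q.size(1), -1)  # [B, len_q, len_k]

def padding_mask_q(q, k):
    # True wherever a *query* position is padding; shape [B, len_q, len_k],
    # used to zero out whole attention rows after the softmax.
    pad = q.abs().sum(dim=-1).eq(0)                    # [B, len_q]
    return pad.unsqueeze(2).expand(-1, -1, k.size(1))  # [B, len_q, len_k]
```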
Inside the layer
Single attention
if attn_mask is None or softmax_mask is None:
    attn_mask = padding_mask_k(q, k)
    softmax_mask = padding_mask_q(q, k)
# linear projection
k = self.linear_k(k)
v = self.linear_v(v)
q = self.linear_q(q)
scale = k.size(-1)**-0.5
##### SelfAttentionLayer
attention = torch.bmm(q, k.transpose(1, 2))
##### SingleAttentionLayer
attention = torch.bmm(q, k.transpose(-2, -1))
Since k here is always 3-dimensional, these two transposes are effectively the same operation.
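A quick sanity check that the two transposes agree on a 3-D tensor:

```python
import torch

k = torch.randn(128, 80, 512)
assert torch.equal(k.transpose(1, 2), k.transpose(-2, -1))  # identical for 3-D tensors
```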
if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)
attention = attention.masked_fill(softmax_mask, 0.)
output = torch.bmm(attention, v)
output = self.linear_final(output)
output = self.layer_norm(output + residual)
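Putting the single-attention forward pass together with the shapes quoted above (a standalone sketch with dummy tensors and all-False masks; the linear projections are omitted for brevity):

```python
import numpy as np
import torch

B, Lq, Lk, D = 128, 15, 80, 512
q, k, v = torch.randn(B, Lq, D), torch.randn(B, Lk, D), torch.randn(B, Lk, D)
attn_mask = torch.zeros(B, Lq, Lk, dtype=torch.bool)     # True = padded key position
softmax_mask = torch.zeros(B, Lq, Lk, dtype=torch.bool)  # True = padded query position

scale = k.size(-1) ** -0.5
attention = torch.bmm(q, k.transpose(1, 2)) * scale      # [128, 15, 80]
attention = attention.masked_fill(attn_mask, -np.inf)
attention = torch.softmax(attention, dim=-1)
attention = attention.masked_fill(softmax_mask, 0.)
output = torch.bmm(attention, v)                          # [128, 15, 512]
```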
Co-attention (both directions)
# likewise, fill in the reverse-direction masks if they were not provided
if attn_mask_ is None or softmax_mask_ is None:
    attn_mask_ = padding_mask_k(video, question)
    softmax_mask_ = padding_mask_q(video, question)
# only four projections are needed here: a query head for the question, a key head for the video, and a value head for each modality
question_q = self.linear_question(question)
video_k = self.linear_video(video)
question = self.linear_v_question(question)
video = self.linear_v_video(video)
scale = video.size(-1)**-0.5
attention_qv = torch.bmm(question_q, video_k.transpose(1, 2))
if scale is not None:
    attention_qv = attention_qv * scale
if attn_mask is not None:
    attention_qv = attention_qv.masked_fill(attn_mask, -np.inf)
attention_qv = self.softmax(attention_qv)
attention_qv = attention_qv.masked_fill(softmax_mask, 0.)
# then compute attention once more in the reverse direction (video attending to the question)
attention_vq = torch.bmm(video_k, question_q.transpose(1, 2))
if scale is not None:
    attention_vq = attention_vq * scale
if attn_mask_ is not None:
    attention_vq = attention_vq.masked_fill(attn_mask_, -np.inf)
attention_vq = self.softmax(attention_vq)
attention_vq = attention_vq.masked_fill(softmax_mask_, 0.)
##### CoAttentionLayer
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(output_qv)
output_q = self.layer_norm_qv(output_qv + q)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(output_vq)
output_v = self.layer_norm_vq(output_vq + v)
##### CoConcatAttentionLayer differs from CoAttentionLayer in that concatenation before the final linear layer replaces the residual connection after it
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final_qv(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final_vq(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)
##### CoSiameseAttentionLayer differs from CoConcatAttentionLayer in that a single shared linear layer is used for both directions, mapping them into a common space
output_qv = torch.bmm(attention_qv, video)
output_qv = self.linear_final(torch.cat((output_qv, q), dim=-1))
output_q = self.layer_norm_qv(output_qv)
output_vq = torch.bmm(attention_vq, question)
output_vq = self.linear_final(torch.cat((output_vq, v), dim=-1))
output_v = self.layer_norm_vq(output_vq)
return output_q, output_v
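Side by side, the three variants differ only in how the attended context is merged back into each modality. A sketch for the question direction (the layer names and the 2*D input size of the concat projections are assumptions; the video direction is symmetric):

```python
import torch
import torch.nn as nn

D = 512
output_qv = torch.randn(128, 15, D)  # question-attended video context
q = torch.randn(128, 15, D)          # question representation, residual / concat partner
layer_norm_qv = nn.LayerNorm(D)

# CoAttentionLayer: project the context, then residual add + layer norm
linear_final_qv = nn.Linear(D, D)
out_co = layer_norm_qv(linear_final_qv(output_qv) + q)

# CoConcatAttentionLayer: concatenate context and q, project 2*D -> D, no residual
linear_final_cat = nn.Linear(2 * D, D)
out_concat = layer_norm_qv(linear_final_cat(torch.cat((output_qv, q), dim=-1)))

# CoSiameseAttentionLayer: like the concat variant, but this 2*D -> D projection
# is shared with the video direction, mapping both modalities into one space
shared_linear = nn.Linear(2 * D, D)
out_siamese = layer_norm_qv(shared_linear(torch.cat((output_qv, q), dim=-1)))
```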
②
ScaledDotProductAttention VS MaskedScaledDotProductAttention
Inputs: q, k, v
##### ScaledDotProductAttention
attention = torch.matmul(q, k.transpose(1, 2))
##### MaskedScaledDotProductAttention
attention = torch.matmul(q, k.transpose(-2, -1))
# common part
if scale is not None:
    attention = attention * scale
if attn_mask is not None:
    attention = attention.masked_fill(attn_mask, -np.inf)
attention = self.softmax(attention)
##### MaskedScaledDotProductAttention additionally applies the line below
attention = attention.masked_fill(softmax_mask, 0.)
attention = self.dropout(attention)
output = torch.matmul(attention, v)
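What the extra post-softmax masked_fill buys: the -inf fill only hides padded keys, so a padded *query* row still leaves the softmax as a valid distribution (or as NaN if every key in the row is masked). Zeroing those rows afterwards gives padded query positions an all-zero context vector. A toy illustration with made-up masks:

```python
import numpy as np
import torch

scores = torch.randn(1, 2, 3)                           # 1 sample, 2 queries, 3 keys
attn_mask = torch.tensor([[[False, False, True],        # key 2 is padding
                           [False, False, True]]])
softmax_mask = torch.tensor([[[False, False, False],
                              [True,  True,  True]]])   # query 1 is padding

attention = torch.softmax(scores.masked_fill(attn_mask, -np.inf), dim=-1)
# the padded query's row still sums to 1 here ...
attention = attention.masked_fill(softmax_mask, 0.)
# ... and is zeroed now, so torch.matmul(attention, v) yields zeros for padded queries
```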
SelfTransformerLayer
if self.position:
    input += self.pos_embedding(input_length)[:, :input.size()[1], :]
attention_mask = padding_mask_k(input, input)
softmax_mask = padding_mask_q(input, input)
MaskedMultiHeadAttention  # all three inputs (query, key, value) are the input above
MaskedPositionalWiseFeedForward
output = self.w2(F.relu(self.w1(x)))
output = self.dropout(output)
output = self.layer_norm(x + output)
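For reference, the feed-forward lines above wrapped into a minimal module (a sketch; the real MaskedPositionalWiseFeedForward constructor arguments, hidden size, and dropout rate are assumptions):

```python
import torch.nn as nn
import torch.nn.functional as F

class PositionWiseFeedForward(nn.Module):
    # position-wise FFN with dropout, residual connection, and layer norm,
    # matching the three lines of the forward pass shown above
    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.1):
        super().__init__()
        self.w1 = nn.Linear(model_dim, ffn_dim)
        self.w2 = nn.Linear(ffn_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)

    def forward(self, x):
        output = self.w2(F.relu(self.w1(x)))
        output = self.dropout(output)
        return self.layer_norm(x + output)
```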
SoftKNN VS MultiHeadAttention VS MaskedMultiHeadAttention
# common part
residual = query
dim_per_head = self.dim_per_head
num_heads = self.num_heads
batch_size = key.size(0)
key = self.linear_k(key)
value = self.linear_v(value)
query = self.linear_q(query)
##### SoftKNN & MultiHeadAttention
key = key.view(batch_size * num_heads, -1, dim_per_head)
value = value.view(batch_size * num_heads, -1, dim_per_head)
query = query.view(batch_size * num_heads, -1, dim_per_head)
if attn_mask is not None:
    attn_mask = attn_mask.repeat(num_heads, 1, 1)
scale = (key.size(-1) // num_heads)**-0.5
context, attention = self.dot_product_attention(
    query, key, value, scale, attn_mask)
context = context.view(batch_size, -1, dim_per_head * num_heads)
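Concretely, this branch folds the heads into the batch dimension, so the dot-product attention and the mask operate on batch_size * num_heads "samples" (a shape-only illustration with made-up sizes):

```python
import torch

batch_size, num_heads, dim_per_head, seq_len = 2, 8, 64, 10
key = torch.randn(batch_size, seq_len, num_heads * dim_per_head)

folded = key.view(batch_size * num_heads, -1, dim_per_head)
print(folded.shape)   # torch.Size([16, 10, 64]) -- heads folded into the batch dimension

attn_mask = torch.zeros(batch_size, seq_len, seq_len, dtype=torch.bool)
print(attn_mask.repeat(num_heads, 1, 1).shape)  # torch.Size([16, 10, 10]) -- mask repeated per head
```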
##### MaskedMultiHeadAttention
key = key.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
value = value.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
query = query.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
if attn_mask is not None:
    attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
if softmax_mask is not None:
    softmax_mask = softmax_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
scale = key.size(-1)**-0.5
context, attention = self.dot_product_attention(
    query, key, value, scale, attn_mask, softmax_mask)
context = context.transpose(1, 2).contiguous().view(
    batch_size, -1, dim_per_head * num_heads)
# The differences: how q/k/v are reshaped, the extra softmax mask passed to the masked dot_product_attention, the scale (written differently; note that key has already been reshaped in both branches, so key.size(-1) is dim_per_head and (dim_per_head // num_heads)**-0.5 only equals dim_per_head**-0.5 when num_heads == 1), and how the final context is put back together
##### MultiHeadAttention & MaskedMultiHeadAttention
output = self.linear_final(context)
output = self.dropout(output)
output = self.layer_norm(residual + output)
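And for the MaskedMultiHeadAttention layout, a small check that the head split and the final merge are inverse operations (sizes made up):

```python
import torch

batch_size, num_heads, dim_per_head, seq_len = 2, 8, 64, 10
key = torch.randn(batch_size, seq_len, num_heads * dim_per_head)

# split: (B, L, H*d) -> (B, H, L, d)
heads = key.view(batch_size, -1, num_heads, dim_per_head).transpose(1, 2)
print(heads.shape)    # torch.Size([2, 8, 10, 64])

# merge back: (B, H, L, d) -> (B, L, H*d), exactly the context reshape above
merged = heads.transpose(1, 2).contiguous().view(batch_size, -1, dim_per_head * num_heads)
assert torch.equal(merged, key)   # split followed by merge is lossless
```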