分层共同注意力代码解读
本文主要是对分层共同注意力的其中一篇代码解读,该代码不是原作者写的,原作者用的是torch,源码地址:https://github.com/jiasenlu/HieCoAttenVQA
本文用到的源码地址:https://github.com/karunraju/VQA
本文的代码是在 CoattentionNet 网络模型上调试运行的。
1. 总体代码结构
代码结构主要说明:谁是父类、谁是子类、谁调用谁等。
我将代码的结构,用思维导图表示,如下:
详细如下:
网络结构如下:
CoattentionNet(
(embed): Embedding(11471, 512)
(unigram_conv): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
(bigram_conv): Conv1d(512, 512, kernel_size=(2,), stride=(1,), padding=(1,), dilation=(2,))
(trigram_conv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(max_pool): MaxPool2d(kernel_size=(3, 1), stride=(3, 1), padding=0, dilation=1, ceil_mode=False)
(lstm): LSTM(512, 512, num_layers=3, dropout=0.4)
(tanh): Tanh()
(W_w): Linear(in_features=512, out_features=512, bias=True)
(W_p): Linear(in_features=1024, out_features=512, bias=True)
(W_s): Linear(in_features=1024, out_features=512, bias=True)
)
接下来,分别介绍每个文件。
2. dataset.py
相应的注释已在代码中标出,并以问题 id 262148000 为例进行说明。这个文件只有一个 pre_process_dataset 方法,它的返回值如下(编号仅为示意:实际的 q2i 中 0~3 号已被 <pad>、<sos>、<eos>、<unk> 占用,普通单词从 4 开始编号):
q2i:{‘where’: 0, ‘is’: 1, ‘he’: 2, ‘looking’: 3}
a2i:{‘down’: 0, ‘at table’: 1, ‘skateboard’: 2, ‘table’: 3}
i2a:{0: ‘down’, 1: ‘at table’, 2: ‘skateboard’, 3: ‘table’}
a2i_count:{‘down’: 7, ‘at table’: 1, ‘skateboard’: 1, ‘table’: 1}
import os
import operator
import numpy as np
from collections import defaultdict
from external.vqa.vqa import VQA
def pre_process_dataset(image_dir, qjson, ajson, img_prefix, max_answers=1000):
    """Build the question and answer vocabularies for one VQA split.

    Args:
        image_dir: Directory whose ``.jpg`` files decide which images (and
            therefore which questions) are taken into account.
        qjson: Path to the questions JSON file.
        ajson: Path to the annotations JSON file.
        img_prefix: File-name prefix that precedes the numeric image id,
            e.g. ``"COCO_val2014_"``.
        max_answers: Number of most frequent answers kept in ``a2i`` and
            ``i2a``.  Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A tuple ``(q2i, a2i, i2a, a2i_count)`` of plain dicts:

        * ``q2i`` maps a question word to its index.  Indices 0-3 are the
          special tokens ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``.
        * ``a2i`` maps an answer string to its rank (0 = most frequent).
        * ``i2a`` is the inverse of ``a2i``.
        * ``a2i_count`` maps every confident answer to its frequency,
          e.g. ``{'down': 7, 'at table': 1, 'skateboard': 1, 'table': 1}``.

        Plain dicts are returned (instead of ``defaultdict`` objects with a
        lambda factory) so that the result can be pickled / ``np.save``-d and
        so that looking up an unknown key cannot silently add a new entry.
    """
    print('Preprocessing datatset. \n')
    # Loads both JSON files and builds the id -> record indexes.
    vqa = VQA(ajson, qjson)

    # The image id is the numeric part of the file name that follows the
    # prefix, e.g. "COCO_val2014_000000262148.jpg" -> 262148.
    img_ids = []
    for fname in os.listdir(image_dir):
        if '.jpg' not in fname:
            continue
        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        img_ids.append(int(img_id))
    ques_ids = vqa.getQuesIds(img_ids)

    # Special tokens take the first four indices; every new word then gets
    # the next free index in order of first appearance.
    q2i = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    a2i_count = {}

    for ques_id in ques_ids:
        # e.g. for ques_id 262148000 loadQQA returns
        # [{'image_id': 262148, 'question': 'Where is he looking?',
        #   'question_id': 262148000}]
        qa = vqa.loadQA(ques_id)[0]
        qqa = vqa.loadQQA(ques_id)[0]

        # Drop the last character (the question mark) before tokenising.
        ques = qqa['question'][:-1]
        for word in ques.lower().strip().split(" "):
            q2i.setdefault(word, len(q2i))

        # Count how often each answer was given, looking only at answers the
        # annotator marked as confident.
        for ans in qa['answers']:
            if ans['answer_confidence'] != 'yes':
                continue
            text = ans['answer'].lower()
            a2i_count[text] = a2i_count.get(text, 0) + 1

    # Most frequent answers first; the sort is stable, so ties keep the order
    # in which the answers were first seen.
    a_sort = sorted(a2i_count.items(), key=operator.itemgetter(1), reverse=True)

    a2i = {}
    i2a = {}
    for rank, (word, _) in enumerate(a_sort[:max_answers]):
        a2i[word] = rank
        i2a[rank] = word

    return q2i, a2i, i2a, a2i_count
if __name__ == '__main__':
    # Local dataset layout; adjust data_dir to your own machine.
    data_dir = "C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集"
    image_dir = data_dir + "/val2014/val2014"
    img_prefix = "COCO_val2014_"
    qjson = data_dir + "/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json"
    ajson = data_dir + "/v2_Annotations_Val_mscoco/v2_mscoco_val2014_annotations.json"

    q2i, a2i, i2a, a2i_count = pre_process_dataset(image_dir, qjson, ajson, img_prefix)

    # np.save stores a dict as a pickled 0-d object array.  A defaultdict
    # whose factory is a lambda cannot be pickled, so every mapping is first
    # converted to a plain dict (a no-op when it already is one).
    for name, mapping in (('q2i', q2i), ('a2i', a2i), ('i2a', i2a), ('a2i_count', a2i_count)):
        np.save(data_dir + '/' + name + '.npy', dict(mapping))
- npy:从代码中也可以看出,函数返回的内容以 npy 文件格式保存。npy 是 NumPy 的二进制数据存储格式;这里保存的是经 pickle 序列化的字典,因此需要用 Python(NumPy)来加载。
有些神经网络会将训练好的权重保存在 npy 文件中,我们便可以用这些已经训练好的权重来初始化自己的网络,从而缩短训练时间。我们将它们对应的 npy 文件用 Python 打开,如下:
import numpy as np
# Each .npy file written by dataset.py holds one pickled dict wrapped in a
# 0-d object array; .item() unwraps it.  NumPy >= 1.16.3 refuses to unpickle
# unless allow_pickle=True is passed explicitly.  Only load files you trust:
# unpickling can execute arbitrary code.
#a2i = np.load('a2i.npy', allow_pickle=True, encoding="bytes").item()
a2i_count = np.load('a2i_count.npy', allow_pickle=True, encoding="bytes").item()
#i2a = np.load('i2a.npy', allow_pickle=True, encoding="bytes").item()
#q2i = np.load('q2i.npy', allow_pickle=True, encoding="bytes").item()
a2i_count
-
a2i.npy:答案的词频排序格式为:{答案:序号}
-
a2i_count.npy:答案的词频统计
-
i2a.npy :答案的词频排序–格式:{序号-答案} 只有前1000个数据
-
q2i.npy:问题单词到序号的映射,格式为:{单词:序号}(暂时没有给出示例)
3. vqa.py
在介绍vqa.py文件之前,我们先看它所用的两个文件——questions.json,annotation.json,它们的结构思维导图如下:
- questions.json
- annotation.json文件
import json
import datetime
import copy
class VQA:
    """Helper that loads VQA annotation/question JSON files and indexes them.

    After ``createIndex`` has run:

    * ``qa`` maps question id -> annotation record,
    * ``qqa`` maps question id -> question record,
    * ``imgToQA`` maps image id -> list of annotation records.
    """

    def __init__(self, annotation_file=None, question_file=None):
        """Load both JSON files and build the indexes.

        :param annotation_file (str): location of the VQA annotation file
        :param question_file (str): location of the VQA question file

        When either path is ``None`` nothing is loaded and all containers
        stay empty (``loadRes`` relies on this to fill an instance by hand).
        """
        self.dataset = {}
        self.questions = {}
        self.qa = {}
        self.qqa = {}
        self.imgToQA = {}
        if annotation_file is not None and question_file is not None:
            print('loading VQA annotations and questions into memory...')
            time_t = datetime.datetime.utcnow()
            # Context managers make sure the file handles are closed.
            with open(annotation_file, 'r') as f:
                dataset = json.load(f)
            with open(question_file, 'r') as f:
                questions = json.load(f)
            print(datetime.datetime.utcnow() - time_t)
            self.dataset = dataset
            self.questions = questions
            self.createIndex()

    @staticmethod
    def _as_list(value):
        """Return ``value`` unchanged if it is a list, else wrap it in one."""
        return value if isinstance(value, list) else [value]

    @staticmethod
    def _filter_by_types(anns, quesTypes, ansTypes):
        """Keep annotations matching the given question/answer types.

        An empty type list means "no restriction" for that criterion.
        """
        if quesTypes:
            anns = [ann for ann in anns if ann['question_type'] in quesTypes]
        if ansTypes:
            anns = [ann for ann in anns if ann['answer_type'] in ansTypes]
        return anns

    def createIndex(self):
        """Build ``qa``, ``qqa`` and ``imgToQA`` from the loaded JSON data."""
        print('creating index...')
        annotations = self.dataset['annotations']
        imgToQA = {}
        qa = {}
        # Every annotated question id starts with an empty placeholder that
        # is replaced below once the matching question record is found.
        qqa = {ann['question_id']: [] for ann in annotations}
        for ann in annotations:
            # One image can have several questions, hence a list per image.
            imgToQA.setdefault(ann['image_id'], []).append(ann)
            qa[ann['question_id']] = ann
        for ques in self.questions['questions']:
            qqa[ques['question_id']] = ques
        print('index created!')

        self.qa = qa
        self.qqa = qqa
        self.imgToQA = imgToQA

    def info(self):
        """Print every key/value pair of the annotation file's ``info`` entry."""
        for key, value in self.dataset['info'].items():
            print('%s: %s' % (key, value))

    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
        """Return the question ids that satisfy all given filters.

        Each argument may be a single value or a list; an empty list means
        "no restriction".  The default lists are only read, never mutated.
        Image ids that are not indexed are silently ignored.
        """
        imgIds = self._as_list(imgIds)
        quesTypes = self._as_list(quesTypes)
        ansTypes = self._as_list(ansTypes)

        if imgIds:
            # Flatten the per-image annotation lists in one linear pass
            # (sum(lists, []) re-copies the accumulator on every step).
            anns = [ann
                    for imgId in imgIds if imgId in self.imgToQA
                    for ann in self.imgToQA[imgId]]
        else:
            anns = self.dataset['annotations']
        anns = self._filter_by_types(anns, quesTypes, ansTypes)
        return [ann['question_id'] for ann in anns]

    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
        """Return the image ids of the annotations that satisfy all filters.

        Each argument may be a single value or a list; an empty list means
        "no restriction".  Question ids that are not indexed are ignored.
        """
        quesIds = self._as_list(quesIds)
        quesTypes = self._as_list(quesTypes)
        ansTypes = self._as_list(ansTypes)

        if quesIds:
            anns = [self.qa[quesId] for quesId in quesIds if quesId in self.qa]
        else:
            anns = self.dataset['annotations']
        anns = self._filter_by_types(anns, quesTypes, ansTypes)
        return [ann['image_id'] for ann in anns]

    def loadQA(self, ids=[]):
        """Return the annotation records for one question id or a list of ids.

        The result is always a list.  Any other argument type yields ``None``.
        """
        if isinstance(ids, list):
            return [self.qa[id] for id in ids]
        elif isinstance(ids, int):
            return [self.qa[ids]]

    def loadQQA(self, ids=[]):
        """Return the question records for one question id or a list of ids.

        The result is always a list.  Any other argument type yields ``None``.
        """
        if isinstance(ids, list):
            return [self.qqa[id] for id in ids]
        elif isinstance(ids, int):
            return [self.qqa[ids]]

    def showQA(self, anns):
        """
        Display the specified annotations.
        :param anns (array of object): annotations to display
        :return: 0 when ``anns`` is empty, otherwise None
        """
        if len(anns) == 0:
            return 0
        for ann in anns:
            quesId = ann['question_id']
            print("Question: %s" % (self.qqa[quesId]['question']))
            for ans in ann['answers']:
                print("Answer %d: %s" % (ans['answer_id'], ans['answer']))

    def loadRes(self, resFile, quesFile):
        """Load a result file and return it wrapped in a new ``VQA`` object.

        :param resFile (str): JSON file holding a list of predictions, each
            with at least ``question_id`` and ``answer``
        :param quesFile (str): question JSON file the predictions refer to
        :return: ``VQA`` instance whose annotations are the predictions,
            enriched with image id, question type and answer type
        :raises AssertionError: if the file is not a list, if its question ids
            differ from the ones of this object, or (multiple-choice task) if
            a predicted answer is not one of the offered choices
        """
        res = VQA()
        with open(quesFile) as f:
            res.questions = json.load(f)
        # Deep copies keep the result object independent of this one.
        for key in ('info', 'task_type', 'data_type', 'data_subtype', 'license'):
            res.dataset[key] = copy.deepcopy(self.questions[key])

        print('Loading and preparing results... ')
        time_t = datetime.datetime.utcnow()
        with open(resFile) as f:
            anns = json.load(f)
        assert type(anns) == list, 'results is not an array of objects'
        annsQuesIds = [ann['question_id'] for ann in anns]
        # The predictions must cover exactly the question ids known here.
        assert set(annsQuesIds) == set(self.getQuesIds()), \
            'Results do not correspond to current VQA set.' \
            ' Either the results do not have predictions ' \
            'for all question ids in annotation file or ' \
            'there is atleast one question id that does' \
            ' not belong to the question ids in the annotation file.'
        for ann in anns:
            quesId = ann['question_id']
            if res.dataset['task_type'] == 'Multiple Choice':
                assert ann['answer'] in self.qqa[quesId][
                    'multiple_choices'], 'predicted answer is not one of the multiple choices'
            qaAnn = self.qa[quesId]
            ann['image_id'] = qaAnn['image_id']
            ann['question_type'] = qaAnn['question_type']
            ann['answer_type'] = qaAnn['answer_type']
        print('DONE (t=%0.2fs)' % ((datetime.datetime.utcnow() - time_t).total_seconds()))

        res.dataset['annotations'] = anns
        res.createIndex()
        return res
4. vqa_dataset.py
import os
import torch
import numpy as np
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from six.moves import cPickle as pickle
from torch.utils.data import Dataset
from torchvision.datasets.folder import accimage_loader
from external.vqa.vqa import VQA
from coatt.dataset import pre_process_dataset
def pil_loader(path):
    """Open the image at ``path`` with Pillow and return it in RGB mode."""
    # Reading through an explicit file object avoids Pillow's ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835).
    with open(path, 'rb') as handle:
        return Image.open(handle).convert('RGB')
def default_loader(path):
    """Load an image with whichever backend torchvision is configured for."""
    from torchvision import get_image_backend
    # accimage is used only when it is the selected backend; Pillow otherwise.
    loader = accimage_loader if get_image_backend() == 'accimage' else pil_loader
    return loader(path)
class VqaDataset(Dataset):
    """
    Loads the VQA dataset through the VQA python API (the required subset of
    that API ships in the ``external`` package).
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, collate=False, q2i=None, a2i=None, i2a=None,
                 a2i_count=None, img_names=None, img_ids=None, ques_ids=None,
                 method='simple', dataset_type='train', enc_dir=''):
        """
        Args:
            image_dir (string): Path to the directory with the COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                that map images, questions and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use
                (eg "COCO_train2014_{}.jpg")
            collate (bool): When False, image names/ids, question ids and the vocabularies are
                computed here; when True they are taken from the keyword arguments below.
            q2i, a2i, i2a, a2i_count: Vocabularies as returned by ``pre_process_dataset``.
            img_names, img_ids, ques_ids: Pre-computed, index-aligned lists (used when collate=True).
            method (string): 'simple' resizes images to 224x224 and encodes questions as
                bag-of-words vectors; anything else uses 448x448 and index sequences.
            dataset_type (string): 'train', or 'val' / 'validation'; selects which image-index
                pickle is loaded when collate=True.
            enc_dir (string): Stored on the instance as ``self.enc_dir``.
        """
        print(method)
        self.image_dir = image_dir
        self.qjson = question_json_file_path
        self.ajson = annotation_json_file_path
        # "COCO_train2014_{}.jpg" -> "COCO_train2014_"
        img_prefix = image_filename_pattern.split('{}')[0]
        self.collate = collate
        self.q2i = q2i
        self.a2i = a2i
        self.i2a = i2a
        self.a2i_count = a2i_count
        self.img_ids = img_ids
        self.ques_ids = ques_ids
        self.img_names = img_names
        self.method = method
        self.vqa = VQA(self.ajson, self.qjson)

        # The two methods differ only in the target image size.
        side = 224 if self.method == 'simple' else 448
        self.transform = transforms.Compose([transforms.Resize((side, side)),
                                             transforms.ToTensor()])

        if not collate:
            self.img_names = [f for f in os.listdir(self.image_dir) if '.jpg' in f]
            self.img_ids = []
            for fname in self.img_names:
                # "COCO_train2014_000000000009.jpg" -> "000000000009" -> 9
                img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
                self.img_ids.append(int(img_id))

            self.ques_ids = self.vqa.getQuesIds(self.img_ids)
            self.q2i, self.a2i, self.i2a, self.a2i_count = pre_process_dataset(image_dir, self.qjson,
                                                                               self.ajson, img_prefix)

        self.q2i_len = len(self.q2i)
        self.a2i_len = len(self.a2i.keys())
        self.q2i_keys = self.q2i.keys()
        self.enc_dir = enc_dir

        # image id -> position in img_names, so __getitem__ does not have to
        # scan the whole id list for every sample.  setdefault keeps the first
        # occurrence, exactly like list.index did.
        self._img_pos = {}
        for pos, known_id in enumerate(self.img_ids or []):
            self._img_pos.setdefault(known_id, pos)

        # NOTE: pickle must only be used on trusted files.
        if collate and dataset_type == 'train':
            with open('C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/train_enc_idx.npy', 'rb') as f:
                self.enc_idx = pickle.load(f)
        elif collate and dataset_type in ('val', 'validation'):
            # Callers use both spellings for the validation split.
            with open('C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/val_enc_idx.npy', 'rb') as f:
                self.enc_idx = pickle.load(f)

    def __len__(self):
        """Return the number of questions in the dataset."""
        return len(self.ques_ids)

    def __getitem__(self, idx):
        """Return the sample for the ``idx``-th question.

        Returns ``{'img', 'ques', 'gt'}`` when ``collate`` is False, otherwise
        the tuple ``(imgT, quesT, gT)``.
        """
        ques_id = self.ques_ids[idx]
        img_id = self.vqa.getImgIds([ques_id])[0]
        qa = self.vqa.loadQA(ques_id)[0]      # annotation record
        qqa = self.vqa.loadQQA(ques_id)[0]    # question record

        pos = self._img_pos.get(img_id)
        if pos is None:
            # Fall back to a scan in case img_ids was replaced after __init__.
            pos = self.img_ids.index(img_id)
        img = default_loader(self.image_dir + '/' + self.img_names[pos])
        imgT = self.transform(img).float()

        # Drop the trailing question mark; words missing from the vocabulary
        # are skipped.
        ques = qqa['question'][:-1]
        quesI = [self.q2i["<sos>"]] \
            + [self.q2i[w.lower()] for w in ques.split(" ") if w.lower() in self.q2i_keys] \
            + [self.q2i["<eos>"]]
        if not self.collate:
            # Pads up to length 8; longer questions are left untouched.
            quesI = quesI + [self.q2i["<pad>"]] * (8 - len(quesI))

        if self.method == 'simple':
            # Bag-of-words vector over the question vocabulary.
            quesT = torch.zeros(self.q2i_len).float()
            for word_idx in quesI:
                quesT[word_idx] = 1
        else:
            quesT = torch.from_numpy(np.array(quesI)).long()

        # Ground truth: among this question's answers that are in a2i, the
        # one with the highest global frequency.
        max_count = 0
        answer = ""
        for ans in qa['answers']:
            ans = ans['answer'].lower()
            if ans in self.a2i and self.a2i_count[ans] > max_count:
                max_count = self.a2i_count[ans]
                answer = ans

        if answer == "":  # only for validation
            # No known answer: use the out-of-range index a2i_len.
            gT = torch.from_numpy(np.array([self.a2i_len])).long()
        else:
            gT = torch.from_numpy(np.array([self.a2i[answer]])).long()

        if not self.collate:
            return {'img': imgT, 'ques': quesT, 'gt': gT}
        return imgT, quesT, gT
5. coattention_net.py
import torch
import torch.nn as nn
import torch.nn.functional as fn
import torch.nn.utils.rnn as rnn
#共同注意网络
class CoattentionNet(nn.Module):
    """
    Predicts an answer to a question about an image using the Hierarchical Question-Image Co-Attention
    for Visual Question Answering (Lu et al, 2017) paper.

    Shape symbols used below: B = batch size, L = longest question in the
    batch, D = embed_dim, N = number of image regions, k = attention size.
    The image features must have D channels because they are multiplied by
    the D x D matrix ``W_b``.
    """

    def __init__(self, num_embeddings, num_classes, embed_dim=512, k=30):
        """
        Args:
            num_embeddings: Size of the question vocabulary.
            num_classes: Number of answer classes (size of the output logits).
            embed_dim: Dimension D of word embeddings and of all hidden features.
            k: Size of the hidden co-attention space.
        """
        super().__init__()

        # Maps word indices to D-dimensional vectors.
        self.embed = nn.Embedding(num_embeddings, embed_dim)

        # Phrase level: three 1-D convolutions over the word axis; padding and
        # dilation are chosen so that each keeps the sequence length L.
        self.unigram_conv = nn.Conv1d(embed_dim, embed_dim, 1, stride=1, padding=0)
        self.bigram_conv = nn.Conv1d(embed_dim, embed_dim, 2, stride=1, padding=1, dilation=2)
        self.trigram_conv = nn.Conv1d(embed_dim, embed_dim, 3, stride=1, padding=2, dilation=2)
        # Max over the three n-gram responses at every word position.
        self.max_pool = nn.MaxPool2d((3, 1))

        # Question level: LSTM over the phrase-level features.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=3, dropout=0.4)
        self.tanh = nn.Tanh()

        # Co-attention parameters, drawn from a standard normal distribution.
        self.W_b = nn.Parameter(torch.randn(embed_dim, embed_dim))
        self.W_v = nn.Parameter(torch.randn(k, embed_dim))
        self.W_q = nn.Parameter(torch.randn(k, embed_dim))
        self.w_hv = nn.Parameter(torch.randn(k, 1))
        self.w_hq = nn.Parameter(torch.randn(k, 1))

        # Recursive fusion of word, phrase and question level features.
        self.W_w = nn.Linear(embed_dim, embed_dim)
        self.W_p = nn.Linear(embed_dim*2, embed_dim)
        self.W_s = nn.Linear(embed_dim*2, embed_dim)

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, image, question):
        """Return answer logits of shape B x num_classes.

        Args:
            image: Image features, B x D x N.
            question: ``PackedSequence`` of word indices (time-major).
        """
        question, lens = rnn.pad_packed_sequence(question)                   # L x B
        question = question.permute(1, 0)                                    # B x L
        words = self.embed(question).permute(0, 2, 1)                        # B x D x L

        # Extra axis 2 lets the three n-gram maps be stacked and max-pooled.
        unigrams = torch.unsqueeze(self.tanh(self.unigram_conv(words)), 2)   # B x D x 1 x L
        bigrams = torch.unsqueeze(self.tanh(self.bigram_conv(words)), 2)     # B x D x 1 x L
        trigrams = torch.unsqueeze(self.tanh(self.trigram_conv(words)), 2)   # B x D x 1 x L
        words = words.permute(0, 2, 1)                                       # B x L x D

        stacked = torch.cat((unigrams, bigrams, trigrams), 2)                # B x D x 3 x L
        # Squeeze only the pooled axis: a bare squeeze() would also drop the
        # batch axis when B == 1 (or the length axis when L == 1).
        phrase = self.max_pool(stacked).squeeze(2)                           # B x D x L
        phrase = phrase.permute(0, 2, 1)                                     # B x L x D

        # The LSTM is time-major, hence the transposes around it.
        phrase_packed = rnn.pack_padded_sequence(torch.transpose(phrase, 0, 1), lens)
        sentence_packed, _ = self.lstm(phrase_packed, None)
        sentence, _ = rnn.pad_packed_sequence(sentence_packed)               # L x B x D
        sentence = torch.transpose(sentence, 0, 1)                           # B x L x D

        v_word, q_word = self.parallel_co_attention(image, words)
        v_phrase, q_phrase = self.parallel_co_attention(image, phrase)
        v_sent, q_sent = self.parallel_co_attention(image, sentence)

        h_w = self.tanh(self.W_w(q_word + v_word))
        h_p = self.tanh(self.W_p(torch.cat(((q_phrase + v_phrase), h_w), dim=1)))
        h_s = self.tanh(self.W_s(torch.cat(((q_sent + v_sent), h_p), dim=1)))

        logits = self.fc(h_s)

        return logits

    def parallel_co_attention(self, V, Q):
        """Attend over image regions and question positions simultaneously.

        Args:
            V: Image features, B x D x N.
            Q: Question features, B x L x D.

        Returns:
            ``(v, q)``: attended image and question vectors, each B x D.
        """
        Q_t = Q.permute(0, 2, 1)                                   # B x D x L
        C = torch.matmul(Q, torch.matmul(self.W_b, V))             # B x L x N (affinity)

        # Both projections are needed by H_v and H_q; compute each once.
        WvV = torch.matmul(self.W_v, V)                            # B x k x N
        WqQ = torch.matmul(self.W_q, Q_t)                          # B x k x L

        H_v = self.tanh(WvV + torch.matmul(WqQ, C))                # B x k x N
        H_q = self.tanh(WqQ + torch.matmul(WvV, C.permute(0, 2, 1)))  # B x k x L

        a_v = fn.softmax(torch.matmul(torch.t(self.w_hv), H_v), dim=2)  # B x 1 x N
        a_q = fn.softmax(torch.matmul(torch.t(self.w_hq), H_q), dim=2)  # B x 1 x L

        # Drop only the singleton attention axis so B == 1 keeps its batch axis.
        v = torch.matmul(a_v, V.permute(0, 2, 1)).squeeze(1)       # B x D
        q = torch.matmul(a_q, Q).squeeze(1)                        # B x D

        return v, q
6.image_encoding.py
import os
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import models, transforms
from PIL import Image
from six.moves import cPickle as pickle
from torchvision.datasets.folder import accimage_loader
def pil_loader(path):
    """Read the file at ``path`` with Pillow and return an RGB image."""
    # An explicit file object is used to avoid a ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835).
    with open(path, 'rb') as stream:
        picture = Image.open(stream)
        rgb = picture.convert('RGB')
    return rgb
def default_loader(path):
    """Load an image using torchvision's currently selected image backend."""
    from torchvision import get_image_backend
    if get_image_backend() != 'accimage':
        # Every backend other than accimage goes through Pillow.
        return pil_loader(path)
    return accimage_loader(path)
class VqaImgDataset(Dataset):
    """Image-only dataset used to pre-compute image encodings.

    Items are the ``.jpg`` files of ``image_dir`` in ``os.listdir`` order,
    resized to 448x448 and converted to float tensors.
    """

    def __init__(self, image_dir, name, img_prefix,
                 idx_dir='C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/'):
        """
        Args:
            image_dir: Directory that contains the images.
            name: Split name; the index file is called ``<name>_enc_idx.npy``.
            img_prefix: File-name prefix that precedes the numeric image id.
            idx_dir: Directory the index file is written to.  Defaults to the
                location that used to be hard-coded.

        Side effect: pickles a dict ``{image id: position in this dataset}``
        to ``<idx_dir>/<name>_enc_idx.npy`` (a pickle despite the extension).
        """
        self.image_dir = image_dir
        self.img_names = [f for f in os.listdir(self.image_dir) if '.jpg' in f]
        self.transform = transforms.Compose([transforms.Resize((448, 448)),
                                             transforms.ToTensor()])

        img_ids = {}
        for idx, fname in enumerate(self.img_names):
            # "COCO_val2014_000000000042.jpg" -> 42
            img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
            img_ids[int(img_id)] = idx

        with open(os.path.join(idx_dir, name + '_enc_idx.npy'), 'wb') as f:
            pickle.dump(img_ids, f)

    def __len__(self):
        """Return the number of images."""
        return len(self.img_names)

    def __getitem__(self, idx):
        """Return the ``idx``-th image as a float tensor."""
        img = default_loader(self.image_dir + '/' + self.img_names[idx])
        imgT = self.transform(img)

        return imgT.float()
# Local dataset layout; adjust these paths to your own machine.
tr_image_dir = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/val2014/val2014'
va_image_dir = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/val2014/val2014'
tr_out_dir = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/tr_enc'
va_out_dir = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/va_enc'

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ResNet-18 without its last two children (average pooling and the fully
# connected layer), used as a frozen feature extractor.
model = models.resnet18(pretrained=True)
modules = list(model.children())[:-2]
model = nn.Sequential(*modules)
for params in model.parameters():
    params.requires_grad = False

if DEVICE == 'cuda':
    model = model.cuda()
# Inference mode: without this BatchNorm normalises with per-batch statistics
# and keeps updating its running averages, so the encodings would depend on
# how the images happen to be batched.
model.eval()

tr_img_dataset = VqaImgDataset(image_dir=tr_image_dir, name='train', img_prefix="COCO_val2014_")
tr_img_dataset_loader = DataLoader(tr_img_dataset, batch_size=10, shuffle=False, num_workers=0)

va_img_dataset = VqaImgDataset(image_dir=va_image_dir, name='val', img_prefix="COCO_val2014_")
va_img_dataset_loader = DataLoader(va_img_dataset, batch_size=10, shuffle=False, num_workers=0)


def _dump_encodings(loader, out_dir):
    """Encode every batch of ``loader`` and save it as ``<out_dir>/<batch index>.npz``."""
    os.makedirs(out_dir, exist_ok=True)
    with torch.no_grad():
        for idx, imgT in enumerate(loader):
            imgT = imgT.to(DEVICE)
            out = model(imgT)
            # Flatten the spatial grid: B x C x H x W -> B x C x (H*W).
            out = out.view(out.size(0), out.size(1), -1)
            out = out.cpu().numpy()

            path = out_dir + '/' + str(idx) + '.npz'
            np.savez_compressed(path, out=out)
            print(path)


print('Dumping Training images encodings.')
_dump_encodings(tr_img_dataset_loader, tr_out_dir)

print('Dumping Validation images encodings.')
_dump_encodings(va_img_dataset_loader, va_out_dir)
7.coattention_experiment_runner.py
import torch
import numpy as np
from six.moves import cPickle as pickle
from torch.utils.data import DataLoader
from coatt.coattention_net import CoattentionNet
from coatt.experiment_runner_base import ExperimentRunnerBase
from coatt.vqa_dataset import VqaDataset
#自定义读取数据函数,参数seq_list:序列列表
def collate_lines(seq_list):
    """Collate ``(image, question, target)`` samples into one batch.

    Samples are ordered by question length, longest first (ties keep their
    incoming order).  Images and targets are stacked into tensors; the
    questions are returned as a plain list of tensors.
    """
    ordered = sorted(seq_list, key=lambda sample: len(sample[1]), reverse=True)
    images, questions, targets = zip(*ordered)
    return torch.stack(list(images)), list(questions), torch.stack(list(targets))
class CoattentionNetExperimentRunner(ExperimentRunnerBase):
    """
    Sets up the Co-Attention model for training. This class is specifically responsible for creating the model and optimizing it.
    """

    def __init__(self, train_image_dir, train_question_path, train_annotation_path,
                 test_image_dir, test_question_path, test_annotation_path, batch_size, num_epochs,
                 num_data_loader_workers):
        """Load the cached vocabularies and id lists, then build datasets, loaders and model.

        Args:
            train_image_dir, train_question_path, train_annotation_path:
                Image directory and JSON files of the training split.
            test_image_dir, test_question_path, test_annotation_path:
                Image directory and JSON files of the validation split.
            batch_size: Batch size of both data loaders.
            num_epochs: Number of training epochs.
            num_data_loader_workers: Worker processes per data loader.
        """
        # Read by the base class constructor, so it has to be set first.
        self.method = 'coattention'

        # Local dataset layout; adjust to your own machine.
        data_dir = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集/'

        def load_pickle(name):
            # NOTE: pickle must only be used on trusted files.
            with open(data_dir + name + '.pkl', 'rb') as f:
                return pickle.load(f)

        def load_list(name):
            return np.load(data_dir + name + '.npy', encoding='latin1').tolist()

        print('Loading numpy files. \n')
        q2i = load_pickle('q2i')
        a2i = load_pickle('a2i')
        i2a = load_pickle('i2a')
        a2i_count = load_pickle('a2i_count')
        print(type(a2i))

        tr_img_names = load_list('tr_img_names')
        tr_img_ids = load_list('tr_img_ids')
        tr_ques_ids = load_list('tr_ques_ids')

        va_img_names = load_list('va_img_names')
        va_img_ids = load_list('va_img_ids')
        va_ques_ids = load_list('va_ques_ids')

        print('Creating Datasets.')
        train_dataset = VqaDataset(image_dir=train_image_dir, collate=True,
                                   question_json_file_path=train_question_path,
                                   annotation_json_file_path=train_annotation_path,
                                   image_filename_pattern="COCO_train2014_{}.jpg",
                                   q2i=q2i, a2i=a2i, i2a=i2a, a2i_count=a2i_count,
                                   img_names=tr_img_names, img_ids=tr_img_ids,
                                   ques_ids=tr_ques_ids, method=self.method,
                                   dataset_type="train", enc_dir=data_dir + 'tr_enc')

        val_dataset = VqaDataset(image_dir=test_image_dir, collate=True,
                                 question_json_file_path=test_question_path,
                                 annotation_json_file_path=test_annotation_path,
                                 image_filename_pattern="COCO_val2014_{}.jpg",
                                 q2i=q2i, a2i=a2i, i2a=i2a, a2i_count=a2i_count,
                                 img_names=va_img_names, img_ids=va_img_ids,
                                 ques_ids=va_ques_ids, method=self.method,
                                 dataset_type="validation", enc_dir=data_dir + 'va_enc')

        # collate_lines sorts each batch by question length so that the
        # questions can be packed with rnn.pack_sequence.  The worker count
        # now follows the constructor argument instead of a fixed 5.
        self._train_dataset_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                                num_workers=num_data_loader_workers,
                                                collate_fn=collate_lines)
        self._val_dataset_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                              num_workers=num_data_loader_workers,
                                              collate_fn=collate_lines)

        print('Creating Co Attention Model.')
        # 1000 answer classes, matching the top-1000 answer vocabulary.
        model = CoattentionNet(len(q2i), 1000).float()

        super().__init__(train_dataset, val_dataset, model, batch_size, num_epochs, num_data_loader_workers)

    def _optimize(self, predicted_answers, true_answer_ids):
        """Run one optimisation step and return the loss tensor."""
        self.optimizer.zero_grad()
        loss = self.criterion(predicted_answers, true_answer_ids)
        loss.backward()
        self.optimizer.step()

        return loss
8. main.py
import argparse
from coatt.simple_baseline_experiment_runner import SimpleBaselineExperimentRunner
from coatt.coattention_experiment_runner import CoattentionNetExperimentRunner
if __name__ == "__main__":
    # Feel free to add more args, or change/remove these.
    #
    # argparse recap:
    #   ArgumentParser  - main entry point for command-line parsing
    #   add_argument()  - registers an optional or positional argument
    #   parse_args()    - turns the command line into an object with attributes

    # Local dataset layout used for all path defaults (the same validation
    # files serve as both the "train" and the "test" split here).
    data_root = 'C:/Users/90647/Desktop/计算机视觉/论文/VQA/数据集'
    image_dir = data_root + '/val2014/val2014'
    question_path = data_root + '/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json'
    annotation_path = data_root + '/v2_Annotations_Val_mscoco/v2_mscoco_val2014_annotations.json'

    parser = argparse.ArgumentParser(description='Load VQA.')
    parser.add_argument('--model', type=str, choices=['simple', 'coattention'], default='simple')
    for split in ('train', 'test'):
        parser.add_argument('--' + split + '_image_dir', type=str, default=image_dir)
        parser.add_argument('--' + split + '_question_path', type=str, default=question_path)
        parser.add_argument('--' + split + '_annotation_path', type=str, default=annotation_path)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--num_epochs', type=int, default=0)
    parser.add_argument('--num_data_loader_workers', type=int, default=0)
    args = parser.parse_args()

    # Pick the runner class that belongs to the requested model.
    runner_classes = {
        "simple": SimpleBaselineExperimentRunner,
        "coattention": CoattentionNetExperimentRunner,
    }
    experiment_runner_class = runner_classes.get(args.model)
    if experiment_runner_class is None:
        raise ModuleNotFoundError()

    # Every remaining argument has the same name as a constructor keyword.
    runner_kwargs = {key: value for key, value in vars(args).items() if key != 'model'}
    experiment_runner = experiment_runner_class(**runner_kwargs)
    experiment_runner.train()
9. experiment_runner_base.py
import os
import shutil
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn
from tensorboardX import SummaryWriter
from datetime import datetime
from torchvision import models
class ExperimentRunnerBase(object):
"""
This base class contains the simple train and validation loops for your VQA experiments.
Anything specific to a particular experiment (Simple or Coattention) should go in the corresponding subclass.
"""
def __init__(self, train_dataset, val_dataset, model, batch_size, num_epochs, num_data_loader_workers=10, lr=0.001):
    """Store the training configuration and create optimizer, loss, logger and image encoder.

    Subclasses must set ``self.method`` before calling this constructor,
    because it decides the optimizer and the checkpoint directory.

    Args:
        train_dataset, val_dataset: Accepted for interface compatibility; not used here.
        model: The network to train.
        batch_size: Batch size, used when reporting validation accuracy.
        num_epochs: Number of training epochs.
        num_data_loader_workers: Accepted for interface compatibility; not used here.
        lr: Learning rate of the Adam optimizer (non-'simple' methods).
    """
    self._model = model
    self._num_epochs = num_epochs
    self._log_freq = 10       # Steps
    self._test_freq = 250*4   # Steps
    self._save_freq = 1       # Epochs
    self._print_freq = 50
    self._batch_size = batch_size
    self._lr = lr

    # Use the GPU if it's available.
    self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    if self.DEVICE == "cuda":
        self._model = self._model.cuda()

    if self.method == 'simple':
        # Per-module learning rates for the simple baseline.  (An Adam
        # optimizer used to be created first and was immediately replaced by
        # this one, so it has been dropped.)
        self.optimizer = optim.SGD([{'params': self._model.embed.parameters(), 'lr': 0.8},
                                    {'params': self._model.gnet.parameters(), 'lr': 1e-2},
                                    {'params': self._model.fc.parameters(), 'lr': 1e-2}
                                    ], momentum=0.9)
    else:
        self.optimizer = optim.Adam(self._model.parameters(), lr=self._lr, weight_decay=1e-8)

    self.criterion = nn.CrossEntropyLoss()
    self.initialize_weights()

    # Logger for tensorboard
    self.writer = SummaryWriter()

    # Fixed denominator used by validate() for the final accuracy.
    self.total_validation_questions = 121512.0

    # Directory the model checkpoints are written to.
    if self.method == 'simple':
        self.chk_dir = './chk_simple/'
    else:
        self.chk_dir = './chk_coattention/'
        # Frozen ResNet-18 without its last two children, used to encode
        # images on the fly.
        print('Creating Image Encoder')
        self.img_enc = models.resnet18(pretrained=True)
        modules = list(self.img_enc.children())[:-2]
        self.img_enc = nn.Sequential(*modules)
        for params in self.img_enc.parameters():
            params.requires_grad = False
        if self.DEVICE == "cuda":
            self.img_enc = self.img_enc.cuda()
        self.img_enc.eval()
        print("Fine!")

    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(self.chk_dir, exist_ok=True)
def _optimize(self, predicted_answers, true_answers): #优化函数,在子类中实现
"""
This gets implemented in the subclasses. Don't implement this here.
"""
raise NotImplementedError()
def validate(self):
    """Evaluate the model on the validation loader and return its accuracy.

    The model is switched to evaluation mode once and the whole pass runs
    under ``torch.no_grad()`` so no autograd state is kept. For every method
    other than ``'simple'`` the questions are packed with
    ``rnn.pack_sequence`` and the images are replaced by the flattened
    feature maps produced by ``self.img_enc``.

    The denominator is the number of samples actually evaluated, not the
    hard-coded ``self.total_validation_questions``, so the result stays
    correct when the loader is truncated or has a different size.

    Returns:
        float: fraction of evaluated samples whose arg-max prediction equals
        the ground-truth answer, or ``0.0`` if the loader yields no samples.
    """
    max_batches = 100000  # batch cap carried over from the original loop
    correct = 0
    seen = 0

    self._model.eval()
    with torch.no_grad():
        for batch_id, (imgT, quesT, gT) in enumerate(self._val_dataset_loader):
            if batch_id == max_batches:
                break
            if self.method != 'simple':
                quesT = rnn.pack_sequence(quesT)
                imgT = imgT.to(self.DEVICE)
                imgT = self.img_enc(imgT)
                # Collapse the spatial dimensions: (N, C, ...) -> (N, C, -1).
                imgT = imgT.view(imgT.size(0), imgT.size(1), -1)
            imgT, quesT, gT = imgT.to(self.DEVICE), quesT.to(self.DEVICE), gT.to(self.DEVICE)

            pd_ans = self._model(imgT, quesT)

            # One arg-max per sample over all remaining dimensions, compared
            # against the flattened labels in a single tensor operation.
            # assumes gT holds one class index per sample — TODO confirm
            predicted = pd_ans.flatten(start_dim=1).argmax(dim=1)
            correct += (predicted == gT.reshape(-1)).sum().item()
            seen += gT.shape[0]

            if (batch_id + 1) % self._print_freq == 0:
                print('Validation Accuracy: %f' % (correct / seen))

    if seen == 0:
        return 0.0
    return correct / seen
#训练
def train(self):
    """Run the training loop with periodic validation and checkpointing.

    Per epoch:
      * every third epoch ``adjust_learning_rate`` is called;
      * each batch is moved to ``self.DEVICE`` (for every method other than
        ``'simple'`` the questions are packed and the images are replaced by
        flattened ``self.img_enc`` feature maps), run through the model and
        handed to ``self._optimize``, whose return value is used as the loss;
      * every ``self._log_freq`` steps the loss is printed and written to
        the ``train/loss`` scalar;
      * every ``self._save_freq`` epochs, and on the last epoch, the model
        is validated and a checkpoint is saved under ``self.chk_dir``.

    After the last epoch the scalars are exported to ``tb_summary.json``
    inside a timestamped directory below ``./tb_`` and the writer is closed.
    """
    print('Started Training.\n')
    lr_decay_every = 3  # taken from the original condition; confirm the intended schedule
    max_batches_per_epoch = 10000  # batch cap carried over from the original loop
    tr_iter = 0
    val_iter = 0
    best_prec = 0.0
    num_batches = len(self._train_dataset_loader)

    for epoch in range(self._num_epochs):
        # The original test was ``(epoch + 1) // 3 == 0``, which is true only
        # for the first two epochs, so the rate was cut twice at the start
        # and never again. A modulo gives a periodic decay.
        if (epoch + 1) % lr_decay_every == 0:
            self.adjust_learning_rate(epoch + 1)

        # validate() leaves the model in eval mode at the end of an epoch.
        self._model.train()
        for batch_id, (imgT, quesT, gT) in enumerate(self._train_dataset_loader):
            if batch_id == max_batches_per_epoch:
                break
            current_step = epoch * num_batches + batch_id

            if self.method != 'simple':
                quesT = rnn.pack_sequence(quesT)
                imgT = imgT.to(self.DEVICE)
                imgT = self.img_enc(imgT)
                # Collapse the spatial dimensions: (N, C, ...) -> (N, C, -1).
                imgT = imgT.view(imgT.size(0), imgT.size(1), -1)
            else:
                imgT = imgT.to(self.DEVICE)
            quesT, gT = quesT.to(self.DEVICE), gT.to(self.DEVICE)

            predicted_answer = self._model(imgT, quesT)
            # Flatten the labels to shape (batch,). The previous
            # squeeze()/unsqueeze(0) pair produced (1, batch) for any batch
            # size above one.
            # NOTE(review): presumably _optimize feeds this to
            # self.criterion — confirm against the subclass implementation.
            ground_truth_answer = gT.reshape(-1)

            loss = self._optimize(predicted_answer, ground_truth_answer)
            loss = torch.unsqueeze(loss, 0)

            if (current_step + 1) % self._log_freq == 0:
                print("Epoch: {}, Batch {}/{} has loss {}".format(epoch, batch_id, num_batches, loss))
                self.writer.add_scalar('train/loss', loss.item(), tr_iter)
                tr_iter = tr_iter + 1

        if (epoch + 1) % self._save_freq == 0 or epoch == self._num_epochs - 1:
            val_accuracy = self.validate()
            print("Epoch: {} has val accuracy {}".format(epoch, val_accuracy))
            self.writer.add_scalar('valid/accuracy', val_accuracy, val_iter)
            val_iter = val_iter + 1

            # Remember the best validation accuracy and save a checkpoint.
            is_best = val_accuracy > best_prec
            best_prec = max(val_accuracy, best_prec)
            self.save_checkpoint({'epoch': epoch + 1,
                                  'state_dict': self._model.state_dict(),
                                  'best_prec': best_prec},
                                 is_best, self.chk_dir + 'checkpoint_' + str(epoch + 1) + '.pth.tar')

    # Export the logged scalars into the timestamped directory. Plain string
    # concatenation used to place the file next to that directory.
    logdir = os.path.join('./tb_', datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(logdir, exist_ok=True)
    self.writer.export_scalars_to_json(os.path.join(logdir, 'tb_summary.json'))
    self.writer.close()
#初始化权重
def initialize_weights(self):
    """Xavier-initialise the Conv2d and Linear layers of ``self._model``.

    Every ``nn.Conv2d`` / ``nn.Linear`` submodule gets its weight drawn with
    ``xavier_normal_`` and its bias, when it has one, filled with zeros.
    All other module types are left untouched.
    """
    for layer in self._model.modules():
        if not isinstance(layer, (nn.Conv2d, nn.Linear)):
            continue
        try:
            nn.init.xavier_normal_(layer.weight)
        except (ValueError, RuntimeError):
            # Best effort, as before: a layer whose weight cannot be
            # Xavier-initialised is skipped entirely, bias included.
            continue
        # Layers built with bias=False expose ``bias`` as None.
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0)
# 保存当前轮和最好模型。保存成checkpoint时可用于推理和继续训练
def save_checkpoint(self, state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` and keep a copy of the best one.

    Args:
        state: object handed to ``torch.save``.
        is_best: when true, the file just written is also copied to
            ``model_best.pth.tar`` (relative to the working directory).
        filename: destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    # copyfile creates or overwrites the destination file.
    shutil.copyfile(filename, 'model_best.pth.tar')
#调整学习率
def adjust_learning_rate(self, epoch):
    """Divide the learning rate of every optimizer parameter group by 10.

    Each call applies one decay step; how often that happens is decided by
    the caller.

    Args:
        epoch: accepted for the caller's convenience; not used here.
    """
    for group in self.optimizer.param_groups:
        current_lr = group['lr']
        group['lr'] = current_lr / 10
7. 错误总结
在调试代码时报了如下错误:
- TypeError: can’t pickle dict_keys objects
经过反复调试代码,然后百度,发现如下解决方式:
self.q2i_keys = self.q2i.keys()
改为
self.q2i_keys = list(self.q2i.keys())
- RuntimeError: CUDA out of memory. Tried to allocate 1.53 GiB (GPU 0; 2.00 GiB total capacity; …)
解决方式:将主函数中batch_size的默认值改为10。
- 另一处报错的解决方式:将报错所在那一行末尾的unsqueeze(0)去掉。
以上到此结束,可能因为环境会出现不同的错误,我看的也比较浅显,大佬莫笑。若有疑问,可以留言或百度,自选哈。