Describing Videos by Exploiting Temporal Structure - Data Preparation

Note: the data preparation described in this post also applies to the following two papers.

Attention-based LSTM with Semantic Consistency for Video Captioning

Hierarchical LSTM with Adjusted Temporal Attention for Video Captioning

The authors provide code on GitHub, and following their README to download the corresponding data is enough to run the experiments. However, only preprocessed data for MSVD is provided; to test on other datasets you have to build the data yourself, and the authors provide no code for that. Drawing on my own experiments, this post presents my code for building the data (MSR-VTT here), shared for mutual learning.

1. Converting videos to frames

import os

video_path = '/data/MSRVTTClips/train-video/'
frame_path = '/data/MSRVTTFrames/'

for video in os.listdir(video_path):
    # video0.mp4 ... video9999.mp4 -> vid1 ... vid10000, matching the
    # 1-based "vidN" naming that steps 2 and 3 below expect
    vid_id = int(video.split('video')[-1].split('.mp4')[0]) + 1
    out_dir = os.path.join(frame_path, 'vid' + str(vid_id))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Zero-padded frame-0001.jpg, frame-0002.jpg, ..., matching the
    # frame-XXXX.jpg pattern read back in step 2
    os.system('ffmpeg -i ' + os.path.join(video_path, video) + ' '
              + os.path.join(out_dir, 'frame-%04d.jpg'))
2. Extracting frame features


import os
import sys
import numpy as np

# Make the Caffe python bindings importable
caffe_root = '/home/caffe_cudnn/python/'
sys.path.insert(0, caffe_root)
import caffe

gpu_id = 2
caffe.set_device(gpu_id)
caffe.set_mode_gpu()

layer_num = 152
extract_from_layer = 'pool5'  # 2048-d globally pooled ResNet feature
model_def = "/home/caffe_cudnn/models/resnet/ResNet-"+str(layer_num)+"-deploy.prototxt"
pretrained_model = "/home/caffe_cudnn/models/resnet/ResNet-"+str(layer_num)+"-model.caffemodel"
folder_path = '/data/MSRVTTFrames/'
save_path = '/data/msrvtt/resnet'+str(layer_num)+'/'
mean_file = "/home/caffe_cudnn/models/resnet/ResNet_mean.npy"

if not os.path.exists(save_path):
    os.makedirs(save_path)

net = caffe.Net(model_def, pretrained_model, caffe.TEST)

# Standard Caffe preprocessing: HWC -> CHW, RGB -> BGR, [0,1] -> [0,255],
# then subtract the ResNet mean image
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2,0,1))
transformer.set_channel_swap('data', (2,1,0))
transformer.set_raw_scale('data', 255)
transformer.set_mean('data', np.reshape(np.load(mean_file), (3,224,224)))

# One forward pass per frame; stack the per-frame pool5 vectors and save
# one (num_frames, 2048) matrix per video
for i in range(1, 10001):
    video_path = os.path.join(folder_path, 'vid'+str(i)+'/')
    feature = []
    for idx in range(1, len(os.listdir(video_path))+1):
        frame = caffe.io.load_image(video_path+'frame-'+str(idx).zfill(4)+'.jpg')
        net.blobs['data'].data[0] = transformer.preprocess('data', frame)
        net.forward()
        feat = net.blobs[extract_from_layer].data[0].copy()
        feature.append(np.reshape(feat, (2048,)))
    np.save(save_path+'vid'+str(i)+'.npy', np.asarray(feature))
    print(video_path)
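Each saved .npy file is a (num_frames, 2048) matrix, one pool5 vector per frame. The captioning models consume a fixed number of frame features per video, so at load time the sequence is typically subsampled to a fixed length. A minimal sketch of uniform subsampling; the helper name and the length K = 28 are my own choices here, so set K to whatever your model config expects:

import numpy as np

def sample_features(npy_path, K=28):
    # Uniformly pick K of the per-frame features; if the video has
    # fewer than K frames, pad by repeating the last frame
    feats = np.load(npy_path)  # (num_frames, 2048)
    n = feats.shape[0]
    if n >= K:
        idx = np.linspace(0, n - 1, K).astype(int)
    else:
        idx = np.concatenate([np.arange(n), np.full(K - n, n - 1, dtype=int)])
    return feats[idx]  # (K, 2048)

feat = sample_features('/data/msrvtt/resnet152/vid1.npy')
print(feat.shape)  # (28, 2048)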
3. Building CAP.pkl, worddict.pkl, and the train/valid/test splits

import json
import random
import pickle
import collections
from collections import Counter

import nltk

# Path to the MSR-VTT annotation JSON (set this to your local copy)
anno_json_path = '/data/msrvtt/videodatainfo.json'

with open(anno_json_path, 'r') as f:
    anno_json = json.load(f)
sentences = anno_json['sentences']

# Count word frequencies over all captions
counter = Counter()
ncaptions = len(sentences)
for i, row in enumerate(sentences):
    caption = row['caption']
    # Alternative: split on spaces directly
    # tokens = caption.lower().split(' ')
    # Tokenize with nltk
    tokens = nltk.tokenize.word_tokenize(caption.lower())
    counter.update(tokens)
    if i % 10000 == 0:
        print('[{}/{}] tokenized the captions.'.format(i, ncaptions))

with open('/data/msrvtt/worddict.pkl', 'wb') as f:
    pickle.dump(counter, f)
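One caveat: the worddict.pkl written above stores the raw frequency Counter. Some codebases instead expect worddict.pkl to be a word-to-index mapping, with the lowest indices reserved for special tokens. If yours does, a hedged conversion sketch; the reservation of indices 0 and 1 for the end-of-sentence and unknown tokens is an assumption, so check your codebase:

# Build word -> index from the Counter above, most frequent words first;
# assumes index 0 = <eos> and index 1 = UNK (verify against your codebase)
worddict = {}
for idx, (word, _) in enumerate(counter.most_common()):
    worddict[word] = idx + 2

with open('/data/msrvtt/worddict.pkl', 'wb') as f:
    pickle.dump(worddict, f)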
        
# Group captions by video, mapping MSR-VTT's 0-based "videoN" ids to the
# 1-based "vidN" ids used by the feature files above
temp = {}
for j in range(1, 10001):
    temp['vid' + str(j)] = []
for sent in sentences:
    vid = 'vid' + str(int(sent['video_id'].split('video')[-1]) + 1)
    tmp = {}
    tmp['caption'] = sent['caption']
    tmp['cap_id'] = sent['sen_id']
    tmp['image_id'] = vid
    tmp['tokenized'] = ' '.join(nltk.tokenize.word_tokenize(sent['caption'].lower()))
    temp[vid].append(tmp)

# Re-index each video's caption ids to start from 0 and store them as strings
tp = {}
for k in range(1, 10001):
    caps = temp['vid' + str(k)]
    min_id = min(c['cap_id'] for c in caps)
    for c in caps:
        c['cap_id'] = str(c['cap_id'] - min_id)
    tp['vid' + str(k)] = caps

# Keep videos in vid1..vid10000 order
d = collections.OrderedDict()
for i in range(1, 10001):
    d['vid' + str(i)] = tp['vid' + str(i)]
with open('/data/msrvtt/CAP.pkl', 'wb') as f:
    pickle.dump(d, f)
       
# Standard MSR-VTT split: train vid1-vid6513, valid vid6514-vid7010,
# test vid7011-vid10000; each video has 20 captions, keyed as "vidN_j"
tmp = []
for i in range(1, 6514):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/train.pkl', 'wb') as f:
    pickle.dump(tmp, f)

tmp = []
for i in range(6514, 7011):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/valid.pkl', 'wb') as f:
    pickle.dump(tmp, f)

tmp = []
for i in range(7011, 10001):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/test.pkl', 'wb') as f:
    pickle.dump(tmp, f)
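Finally, a quick round-trip check that the pickles line up: every 'vidN_j' key in a split list should resolve to caption j of video vidN in CAP.pkl. A minimal sketch:

import pickle

with open('/data/msrvtt/CAP.pkl', 'rb') as f:
    cap = pickle.load(f)
with open('/data/msrvtt/train.pkl', 'rb') as f:
    train = pickle.load(f)

print(len(cap), len(train))  # 10000 videos, 6513 * 20 training keys
vid, cap_id = train[0].rsplit('_', 1)
entry = [c for c in cap[vid] if c['cap_id'] == cap_id][0]
print(vid, cap_id, entry['tokenized'])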