创建容器
1. 修改容器启动脚本的挂载选项,去掉只读(readonly)限制
# Start the SwinBERT docker container with the current directory, the
# dataset directory, the model directory and the output directory
# bind-mounted under /videocap.
#
# Usage: <script> DATA_DIR MODEL_DIR OUTPUT [--prepro]
#   DATA_DIR   host directory mounted at /videocap/datasets
#   MODEL_DIR  host directory mounted at /videocap/models
#   OUTPUT     host directory mounted at /videocap/output
# Environment:
#   CUDA_VISIBLE_DEVICES  devices handed to --gpus and to
#                         NVIDIA_VISIBLE_DEVICES (defaults to 'all')
if [ "$#" -lt 3 ]; then
  # Fail early with a readable message instead of an opaque docker
  # "--mount src=" error when a positional argument is missing.
  printf 'Usage: %s DATA_DIR MODEL_DIR OUTPUT [--prepro]\n' "${0##*/}" >&2
  exit 2
fi
DATA_DIR=$1
MODEL_DIR=$2
OUTPUT=$3
if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then
  CUDA_VISIBLE_DEVICES='all'
fi
# RO is still derived from the optional 4th argument, but it is not appended
# to any --mount option below, so all bind mounts are read-write.
if [ "${4:-}" = "--prepro" ]; then
  RO=""
else
  RO=",readonly"
fi
# --gpus receives the device list wrapped in literal double quotes
# ("device=0,1") so that a comma-separated list stays one value.
docker run --gpus "\"device=${CUDA_VISIBLE_DEVICES}\"" --ipc=host --rm -it \
  -p 6666:22 \
  --name swinbert \
  --mount "src=$(pwd),dst=/videocap,type=bind" \
  --mount "src=${DATA_DIR},dst=/videocap/datasets,type=bind" \
  --mount "src=${MODEL_DIR},dst=/videocap/models,type=bind" \
  --mount "src=${OUTPUT},dst=/videocap/output,type=bind" \
  -e "NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" \
  -w /videocap linjieli222/videocap_torch1.7:fairscale \
  bash -c "source /videocap/setup.sh && bash"
数据预处理
2. 提取帧和生成 tsv 文件的代码在 README 里并未详细给出,请依次执行下列命令生成所需文件。
对MSVD预处理
# MSVD preprocessing: for each split (train, val, test, in that order) run
# extract_frames.py on the split's *.img.tsv with --num_frames 32, then
# create_image_frame_tsv.py for the same split with --image_size 256.
for split in train val test; do
  python ./prepro/extract_frames.py \
    --video_root_dir ./datasets/MSVD/videos/ \
    --save_dir ./datasets/MSVD/ \
    --video_info_tsv "./datasets/MSVD/${split}.img.tsv" \
    --num_frames 32
  python ./prepro/create_image_frame_tsv.py \
    --dataset MSVD \
    --split "${split}" \
    --image_size 256 \
    --num_frames 32
done
对MSRVTT预处理
# MSRVTT-v2 preprocessing: for each split (train, val, test, in that order)
# run extract_frames.py on the split's *.img.tsv with --num_frames 32, then
# create_image_frame_tsv.py for the same split with --image_size 256.
for split in train val test; do
  python ./prepro/extract_frames.py \
    --video_root_dir ./datasets/MSRVTT-v2/videos/ \
    --save_dir ./datasets/MSRVTT-v2/ \
    --video_info_tsv "./datasets/MSRVTT-v2/${split}.img.tsv" \
    --num_frames 32
  python ./prepro/create_image_frame_tsv.py \
    --dataset MSRVTT-v2 \
    --split "${split}" \
    --image_size 256 \
    --num_frames 32
done
测试代码
3.执行验证代码
# Assume in the docker container.
# Evaluation only (--do_eval true, --do_train false): loads the model from
# EVAL_DIR and uses the data described by MSVD/val_32frames.yaml.
EVAL_DIR='./models/table1/msvd/best-checkpoint/'
# The CUDA_VISIBLE_DEVICES=0 prefix applies to this single command only.
CUDA_VISIBLE_DEVICES=0 python src/tasks/run_caption_VidSwinBert.py \
  --val_yaml MSVD/val_32frames.yaml \
  --do_eval true \
  --do_train false \
  --eval_model_dir "$EVAL_DIR"
注意:需要修改 val_32frames.yaml,下面是修改后的文件内容:
img: frame_tsv/val_32frames_img_size256.img.tsv # this is the tsv file that was just generated
label: val.label.tsv
caption: val.caption.tsv
caption_coco_format: val.caption_coco_format.json
训练代码
4.多卡训练脚本
MSVD多卡训练脚本
#!/usr/bin/env bash
# Single-node multi-GPU training on MSVD: starts
# src/tasks/run_caption_VidSwinBert.py through torch.distributed.launch
# and writes results to ./output/msvd.
export CUDA_VISIBLE_DEVICES=0,1,2,3
GPU_NUM=4              # passed as --nproc_per_node
WORLD_SIZE=1           # passed as --nnodes
RANK=0                 # passed as --node_rank
MASTER_ADDR=127.0.0.1
# Informational only; not passed to the launcher below.
TOTAL_GPU=$((WORLD_SIZE * GPU_NUM))
# Random port for the launcher. Bash's $RANDOM never exceeds 32767, so the
# modulus (48128) has no effect and the value falls in 1024..33791.
MASTER_PORT=$(( RANDOM % (49151 - 1024 + 1 ) + 1024 ))
# Every continuation needs whitespace before the trailing backslash;
# otherwise the argument is glued to the option on the following line.
python -u -m torch.distributed.launch --nproc_per_node="$GPU_NUM" \
  --master_addr="$MASTER_ADDR" \
  --master_port="$MASTER_PORT" \
  --nnodes="$WORLD_SIZE" \
  --node_rank="$RANK" \
  --use_env \
  src/tasks/run_caption_VidSwinBert.py \
  --config src/configs/VidSwinBert/msvd_8frm_default.json \
  --train_yaml MSVD/train_32frames.yaml \
  --val_yaml MSVD/val_32frames.yaml \
  --per_gpu_train_batch_size 3 \
  --per_gpu_eval_batch_size 3 \
  --num_train_epochs 2 \
  --learning_rate 0.00006 \
  --max_num_frames 32 \
  --pretrained_2d 0 \
  --backbone_coef_lr 0.006 \
  --mask_prob 0.5 \
  --max_masked_token 45 \
  --zero_opt_stage 1 \
  --mixed_precision_method deepspeed \
  --deepspeed_fp16 \
  --gradient_accumulation_steps 1 \
  --learn_mask_enabled \
  --loss_sparse_w 0.5 \
  --output_dir ./output/msvd
MSRVTT多卡训练脚本
#!/usr/bin/env bash
# Single-node multi-GPU training on MSRVTT-v2: starts
# src/tasks/run_caption_VidSwinBert.py through torch.distributed.launch
# and writes results to ./output/msrvtt.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Topology handed to the launcher.
MASTER_ADDR=127.0.0.1
RANK=0
WORLD_SIZE=1
GPU_NUM=4
TOTAL_GPU=$((WORLD_SIZE * GPU_NUM))
MASTER_PORT=$(( 1024 + RANDOM % (49151 - 1024 + 1) ))

# Options consumed by torch.distributed.launch itself.
launcher_opts=(
  --nproc_per_node=$GPU_NUM
  --master_addr=$MASTER_ADDR
  --master_port=$MASTER_PORT
  --nnodes=$WORLD_SIZE
  --node_rank=$RANK
  --use_env
)

# Options forwarded to the training script.
train_opts=(
  --config src/configs/VidSwinBert/msrvtt_8frm_default.json
  --train_yaml MSRVTT-v2/train_32frames.yaml
  --val_yaml MSRVTT-v2/val_32frames.yaml
  --per_gpu_train_batch_size 6
  --per_gpu_eval_batch_size 6
  --num_train_epochs 10
  --learning_rate 0.000075
  --max_num_frames 32
  --pretrained_2d 0
  --backbone_coef_lr 0.0125
  --mask_prob 0.5
  --max_masked_token 45
  --zero_opt_stage 1
  --mixed_precision_method deepspeed
  --deepspeed_fp16
  --gradient_accumulation_steps 4
  --learn_mask_enabled
  --loss_sparse_w 0.5
  --output_dir ./output/msrvtt
)

python -u -m torch.distributed.launch "${launcher_opts[@]}" \
  src/tasks/run_caption_VidSwinBert.py "${train_opts[@]}"