在mxnet上训练Slowfast视频动作识别模型
mxnet的安装(略)
slowfast模型简介(略)
准备自己的数据集
训练文件目录如下所示,每种动作的视频放在单独的文件夹中。视频格式一定要是同一种且mxnet支持的格式。
例如eating文件夹下:
按照官方的文档,还需要创建一个train.txt(可以创建一个空的,后边训练的程序里边会自动填写),包含所有视频的信息。第一列是文件夹和名字,第二列是取出的帧数,第三列是类别编号。
模型训练
我自己写了个train_tool.py来调用opencv的函数封装成mxnet能调用的类来完成训练中的数据处理。
from mxnet.gluon import Block
import cv2 as cv
import numpy as np
class VideoScale(Block):
    """Resize every frame of a video clip to a fixed size.

    Gluon ``Block`` wrapper around ``cv2.resize`` so frame resizing can be
    used as a stage inside a ``transforms.Compose`` pipeline.

    Parameters
    ----------
    size : tuple of int
        Target ``(width, height)`` passed to ``cv2.resize`` as ``dsize``.
    """

    def __init__(self, size):
        super(VideoScale, self).__init__()
        self.size = size  # (width, height) target resolution

    def forward(self, clips):
        """Return a new list with every frame of ``clips`` resized.

        ``INTER_AREA`` interpolation is used, which is the recommended
        OpenCV choice when shrinking images.
        """
        return [
            cv.resize(frame, dsize=self.size, interpolation=cv.INTER_AREA)
            for frame in clips
        ]
class VideoRearrange(Block):
    """Sample frames from a clip for the SlowFast two-pathway input.

    Picks ``fast_frame_num`` evenly spaced frames (fast pathway) followed
    by ``slow_frame_num`` evenly spaced frames (slow pathway) and returns
    them concatenated in that order, as a list of per-frame arrays.

    Parameters
    ----------
    fast_frame_num : int
        Number of frames fed to the fast pathway (e.g. 32).
    slow_frame_num : int
        Number of frames fed to the slow pathway (e.g. 4).
    """

    def __init__(self, fast_frame_num, slow_frame_num):
        super(VideoRearrange, self).__init__()
        self.fast_frame_num = fast_frame_num
        self.slow_frame_num = slow_frame_num

    def forward(self, clips):
        # assumes the clip has at least ``fast_frame_num`` frames — TODO confirm
        clips = np.array(clips)
        frame_num = clips.shape[0]
        # Stride between sampled frames; clamp to 1 so ``range`` never
        # receives a zero step when the clip is short.
        fast_gap = max(1, frame_num // self.fast_frame_num)
        slow_gap = max(1, frame_num // self.slow_frame_num)
        fast_frame_id_list = range(0, fast_gap * self.fast_frame_num, fast_gap)
        slow_frame_id_list = range(0, slow_gap * self.slow_frame_num, slow_gap)
        frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
        # Bug fix: index by the sampled frame id itself.  The original used
        # the enumerate position (``vid``), which simply returned the first
        # len(frame_id_list) frames in order instead of the fast/slow sampling.
        return [clips[fid, :, :, :] for fid in frame_id_list]
训练大致包括以下过程:
1.收集训练集信息
2.数据预处理
3.模型训练
4.保存模型参数为params文件,保存分类信息为npy文件。
5.加载测试
训练代码
from __future__ import division
import argparse, time, logging, os, sys, math
import numpy as np
import mxnet as mx
import gluoncv as gcv
from mxnet import gluon, nd, init, context
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv.data import VideoClsCustom
from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, LRSequential, LRScheduler, split_and_load, TrainingHistory,export_block
import decord
import os
import glob
import train_tool
batch = 5  # NOTE(review): unused here and shadowed inside the training loop below
train_data_dir = r'C:\Users\Visungky\source\repos\ActionRecognition\train_data'
train_list_dir = r'C:\Users\Visungky\source\repos\ActionRecognition\train.txt'

# Scan the dataset root: one sub-folder per action class, each holding the
# videos of that class.  Write the list file expected by VideoClsCustom,
# one line per video: "<label/video> <frame_count> <class_id>".
label_list = os.listdir(train_data_dir)
class_num = 0
classes = []
print('Collecting training data...')
# Context manager guarantees the list file is flushed/closed even if a
# directory listing fails part-way through (the original leaked the handle).
with open(train_list_dir, 'w') as train_list:
    for label in label_list:
        video_label_dir = os.path.join(train_data_dir, label)
        for video_name in os.listdir(video_label_dir):
            path_to_video = os.path.join(label, video_name)
            # 100 is a placeholder frame count; presumably the loader
            # determines the real length when video_loader=True — TODO confirm.
            train_list.write('%s %d %s\n' % (path_to_video, 100, class_num))
        class_num = class_num + 1
        classes.append(label)
print('Collect finish.')
print(classes)
# Load and prepare data
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]

# Per-clip preprocessing pipeline: center-crop -> resize to 224x224 ->
# SlowFast frame sampling (32 fast + 4 slow) -> tensor + ImageNet-style
# mean/std normalization.
transform_fn = transforms.Compose([
    video.VideoCenterCrop(size=(800, 500)),
    train_tool.VideoScale(size=(224, 224)),
    train_tool.VideoRearrange(fast_frame_num=32, slow_frame_num=4),
    video.VideoToTensor(),
    video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

per_device_batch_size = 2
num_workers = 0
batch_size = per_device_batch_size * num_gpus

train_dataset = VideoClsCustom(root=os.path.expanduser(train_data_dir),
                               setting=os.path.expanduser(train_list_dir),
                               train=True,
                               transform=transform_fn,
                               # different models need different frame length
                               new_length=36,
                               video_loader=True,
                               use_decord=True,
                               video_ext='avi')
print('Load %d training samples.' % len(train_dataset))
print('Total %d classes' % class_num)

train_data = gluon.data.DataLoader(train_dataset, batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers)

model_name = 'slowfast_4x16_resnet50_custom'
# Bug fix: build the head with the discovered class count instead of the
# hard-coded 5, so the parameters saved later load cleanly into the net
# that is rebuilt with nclass=class_num at the end of this script.
net = get_model(name=model_name, nclass=class_num, use_kinetics_pretrain=True)
net.collect_params().reset_ctx(ctx)
print('%s model is successfully loaded.' % model_name)
# Learning rate decay factor
lr_decay = 0.1
# Epochs at which the learning rate decays
lr_decay_epoch = [40, 80, 100]
# Stochastic gradient descent
optimizer = 'sgd'
# Optimizer hyper-parameters
optimizer_params = {'learning_rate': 0.001, 'wd': 0.0001, 'momentum': 0.9}
# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-acc'])

epochs = 60
lr_decay_count = 0
print("\n***Training model***\n")
for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Step the learning-rate schedule.  Bounds-check the index so raising
    # ``epochs`` past the last decay epoch cannot trigger an IndexError
    # (the original indexed lr_decay_epoch unconditionally).
    if lr_decay_count < len(lr_decay_epoch) and epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        lr_decay_count += 1

    # Loop through each mini-batch of training data.
    # Renamed from ``batch`` to avoid shadowing the module-level ``batch``.
    for i, data_batch in enumerate(train_data):
        data = split_and_load(data_batch[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(data_batch[1], ctx_list=ctx, batch_axis=0)

        # Forward under autograd so gradients are recorded.
        with ag.record():
            output = []
            for X in data:
                # Collapse the two leading dims into a single batch axis
                # before feeding the network.
                X = X.reshape((-1,) + X.shape[2:])
                output.append(net(X))
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation, then one optimizer step per mini-batch.
        for l in loss:
            l.backward()
        trainer.step(batch_size)

        # Update running loss / accuracy metrics.
        train_loss += sum([l.mean().asscalar() for l in loss])
        train_metric.update(label, output)
        if i == 100:  # cap each epoch at 101 mini-batches
            break

    name, acc = train_metric.get()
    # Update history and print metrics
    train_history.update([acc])
    print('[Epoch %d] train=%f loss=%f time: %f' %
          (epoch, acc, train_loss / (i + 1), time.time() - tic))
print("\n**Training finish**\n")
# Plot the accuracy curve collected during training.
train_history.plot(save_path='acc.jpg')

print("Exporting...")
# export_block('./myTrainedModel', net)  # alternative export path, unused
net.save_parameters('./mypara.params')
classes = np.array(classes)
np.save('classes.npy', classes)
print("Export done:")
for pattern in ('*.params', '*.npy'):
    print(str(glob.glob(pattern)[0]))

# Sanity check: rebuild the architecture and reload the saved weights.
print('Loading saved net')
mynet = get_model(name=model_name, nclass=class_num)
mynet.load_parameters('./mypara.params')
print('Load finished')
训练完成会输出acc曲线