Argparse
# Build the command-line interface: a single --dataset option, read back after parsing.
from argparse import ArgumentParser

parser = ArgumentParser()  # fixed typo: was `Argumentparser()`
parser.add_argument("--dataset", help="name of the dataset to use, e.g. 'Human' or 'Mouse'")  # fixed typo: was `paser.add_argument("--dataset",,,)`
args = parser.parse_args()
## Usage
dataset = args.dataset
config
自定义TrainConfig类, 里面存储训练需要的配置参数,并且init里面还可以写入一些预处理步骤。
class TrainConfig(object):
    """Training Configurations

    Class attributes hold the default hyper-parameters; __init__ stores the
    per-run settings and applies a few dataset-specific adjustments.
    """
    input_window_size = 50 # Input window size during training
    output_window_size = 10 # Output window size during training
    # The bare string below is the note author's placeholder for config fields
    # omitted from this excerpt.
    '''
    省略部分config
    '''
    def __init__(self, dataset, datatype, action, gpu, training, visualize):
        """Store per-run settings and apply dataset-specific corrections.

        Args:
            dataset: dataset name, e.g. 'Mouse' or 'Human'.
            datatype: data representation, e.g. 'lie' (Lie-algebra features).
            action: action/file name, later used as a key into the data dicts.
            gpu: index (or indices) of the GPU(s) used to train the model.
            training: True to train, False to predict.
            visualize: whether to visualize the predicted motion during testing.
        """
        self.device_ids = gpu # index of GPU used to train the model
        self.train_model = training # train or predict
        self.visualize = visualize # visualize the predicted motion during testing
        self.dataset = dataset
        self.datatype = datatype
        self.filename = action
        # number of bones
        if dataset == 'Mouse':
            self.nbones = 4
            # NOTE(review): self.decoder / self.decoder_name are presumably set
            # in the omitted config section above — confirm before relying on
            # this branch; it force-switches an incompatible decoder choice.
            if self.decoder == 'Kinematics_lstm':
                self.decoder = self.decoder_name[0]
                print('You chose Kinematics_lstm as decoder, but lstm decoder is compatible for mouse dataset! Correct it automatically!!')
        elif dataset == 'Human':
            self.nbones = 18
        # Placeholder: the rest of __init__ is omitted from this excerpt.
        '''
        省略init
        '''
调用参数进行训练
# Instantiate the training configuration from the parsed CLI arguments.
# Argument order matches TrainConfig.__init__:
#   (dataset, datatype, action, gpu, training, visualize)
# (fixed: the original notes had an invalid `,,,` placeholder here)
config = config.TrainConfig(args.dataset, args.datatype, args.action,
                            args.gpu, args.training, args.visualize)
## Use the TrainConfig fields to drive training and inference
if config.train_model:  # idiomatic truthy check instead of `is True`
    train(config, checkpoint_dir)
else:
    prediction(config, checkpoint_dir, output_dir)
prediction
选择数据集
from choose_dataset import DatasetChooser
choose = DatasetChooser(config)
## The main code of DatasetChooser is as follows
import load_data as loader
## ...omitted...
# Fragment from inside DatasetChooser: pick the loader for Lie-algebra data.
if self.config.datatype == 'lie':
    if self.dataset == 'Human':
        bone_length_path = None
        data_loader = loader.HumanPredictionDataset(self.config)
        data = data_loader.get_data()
        # input_size = feature dimension (last axis) of the first entry's array.
        # NOTE(review): data[0] appears to be a dict keyed by action name —
        # confirm against load_data.py.
        self.config.input_size = data[0][list(data[0].keys())[0]].shape[2]
上述代码使用自定义的 load_data 模块加载数据:即 load_data.py 中的类 `class HumanPredictionDataset(object):`。这个类里面定义了方法 `get_data()`,调用它即可得到数据。
ST_HRN网络
forward输入encoder_inputs,来自train_dataset,在class DatasetChooser(object):中
import load_data as loader
# HumanDataset subclasses Dataset (see the class header quoted below);
# the train flag selects the train/test split.
data = loader.HumanDataset(self.config, train=train)
详细研究了class HumanDataset(Dataset):
,试图弄清楚lie代数数据是如何得到的,直接读取数据集还是有一个从xyz坐标到李代数的转换过程。结果发现是直接读取的lie数据的mat文件,具体lie代数是如何生成的,在utils.py中有很多方法,在这里面寻找:
- def expmap2rotmat(A): 对应罗德里格斯公式,将轴角向量转化为旋转矩阵。
- def lietomatrix(angle, trans): 对应由旋转矩阵和平移向量得到变换矩阵SE
确定训练和预测时输入输出的帧长度
参数 `input_window_size` 与 `output_window_size` 分别代表输入的帧长度和输出的帧长度:
## Defaults in config.py
input_window_size = 50 # Input window size during training
output_window_size = 10 # Output window size during training
## In train.py's train(): very-long-term prediction outputs 100 frames, long-term outputs 10
if config.longterm is True:
    config.output_window_size = 100
## In train.py's prediction(): for the h3.6m dataset, 100 output frames
if config.dataset == 'Mouse':
    config.output_window_size = 75
else:
    config.output_window_size = 100
## longterm means very-long-term prediction: when True, the number of predicted
## frames equals the length of the test video (set from y_test below).
if config.longterm is False:
    prediction_dataset, bone_length = choose(prediction=True)
    x_test, y_test, dec_in_test = prediction_dataset
    actions = list(x_test.keys())
else:
    # get raw validation data because the test data isn't usable
    train_dataset, bone_length = choose(train=False)
    test_set = train_dataset.data
    x_test = {}
    y_test = {}
    dec_in_test = {}
    test_set = test_set[0]
    # Encoder input: frames [0, input_window_size-1); the last input frame is
    # instead used as the decoder seed (dec_in_test below).
    x_test[config.filename] = np.reshape(test_set[:config.input_window_size-1,:], [1, -1, config.input_size])
    # Ground-truth output: every frame after the input window.
    y_test[config.filename] = np.reshape(test_set[config.input_window_size:, :], [1, -1, config.input_size])
    dec_in_test[config.filename] = np.reshape(test_set[config.input_window_size-1:-1, :], [1, -1, config.input_size])
    ## This line shows the output frame count: the full remainder of the video.
    config.output_window_size = y_test[config.filename].shape[1]
    actions = [config.filename]
即,作者的设定是:
- 如果 `longterm=False`,则训练时输入 50 帧,用于计算 loss 的输出是 10 帧;预测时输入 50 帧,输出是 100 帧
- 如果 `longterm=True`,则训练时输入 50 帧,用于计算 loss 的输出是 100 帧;预测时输入 50 帧,输出帧数是测试集视频除去前 50 帧后的视频总长度
改进实验
训练输入50帧,输出也改为50帧;
预测输入输出也改为50帧;
修改代码:
# config.py
input_window_size = 50 # Input window size during training
output_window_size = 50 # Output window size during training
# train.py: def prediction
if config.dataset == 'Mouse':
    config.output_window_size = 75
else:
    config.output_window_size = 50
# plot_animation.py: class plot_h36m:
def plot(self):
    """Animate the predicted motion and save it as a GIF (interactive show disabled)."""
    # interval=100 ms per frame; repeat=False so the animation plays once.
    ani = FuncAnimation(self.fig, self.update, frames=self.nframes, interval=100, repeat=False)
    plt.title(self.filename, fontsize=16)
    # Saved under GIF/output_50 to keep the 50-frame experiment's results separate.
    ani.save('./GIF/output_50/'+self.filename + '.gif', writer='pillow')
    # plt.show()
确定输入形式
明确怎样读取的输入数据,以及输入数据的形式才能改进
# Build the training DataLoader; DatasetChooser returns the Dataset plus bone lengths.
choose = DatasetChooser(config)
train_dataset, bone_length = choose(train=True)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
训练
# Train
#with torch.autograd.set_detect_anomaly(True):
# Outer loop: training iterations; inner loop: mini-batches from the DataLoader.
for it in range(config.training_size):
    for i, data in enumerate(train_loader, 0):
        # Move the three tensors produced by load_data (see the windowing code
        # below) onto the training device as float.
        encoder_inputs = data['encoder_inputs'].float().to(device)
        decoder_inputs = data['decoder_inputs'].float().to(device)
        decoder_outputs = data['decoder_outputs'].float().to(device)
        prediction = net(encoder_inputs, decoder_inputs, train=True)
        loss = Loss(prediction, decoder_outputs, bone_length, config)
load_data中如何读取关节数据:
# Sample a random window of (input + output) consecutive frames from one video.
total_frames = self.config.input_window_size + self.config.output_window_size
video_frames = sample.shape[0]
idx = np.random.randint(1, video_frames - total_frames)
data_seq = sample[idx:idx + total_frames, :]
encoder_inputs = data_seq[:self.config.input_window_size - 1, :]
# The last input frame is dropped here (it reappears as the first decoder
# input below); this code could be simplified.
# decoder_inputs is the ground truth
if train:
    # Training: full ground-truth decoder sequence, shifted one frame back.
    decoder_inputs = data_seq[self.config.input_window_size - 1:
    self.config.input_window_size - 1 + self.config.output_window_size, :]
else:
    # Testing: only the single seed frame is provided.
    decoder_inputs = data_seq[self.config.input_window_size - 1:self.config.input_window_size, :]
decoder_outputs = data_seq[self.config.input_window_size:, :]
return {'encoder_inputs': encoder_inputs, 'decoder_inputs': decoder_inputs, 'decoder_outputs': decoder_outputs}
改进点
1.空间轴增加不同肢体部位的骨骼的信息交互
2.尝试多尺度的方法, 将一个骨骼链建模为一个LSTM, 而不是每个骨骼对应一个LSTM