'''
This script does all the data preprocessing.
You'll need to install CMU-Multimodal DataSDK
(https://github.com/A2Zadeh/CMU-MultimodalDataSDK) to use this script.
There's a packaged (and more up-to-date) version
of the utils below at https://github.com/Justin1904/tetheras-utils.
Preprocessing multimodal data is really tiring...
'''
from __future__ import print_function

import mmdata
import numpy as np
from torch.utils.data import Dataset
def pad(data, max_len):
    """Pad (or truncate) a segment's features to exactly `max_len` time steps.

    Timestamps are stripped first via `remove_timestamps`. If the segment is
    shorter than `max_len`, zero-rows are prepended (left-padding); if it is
    longer, only the last `max_len` rows are kept.

    Args:
        data: raw SDK segment data — an iterable of (start, end, feature) entries.
        max_len: target number of time steps.

    Returns:
        A (max_len, dim) numpy array.
    """
    data = remove_timestamps(data)
    n_rows = data.shape[0]
    dim = data.shape[1]
    if max_len >= n_rows:
        diff = max_len - n_rows
        # Zero-padding goes in FRONT so the real data sits at the end.
        padding = np.zeros((diff, dim))
        padded = np.concatenate((padding, data))
        return padded
    else:
        # Too long: keep only the most recent `max_len` steps.
        return data[-max_len:]


def remove_timestamps(segment_data):
    """Removes the start and end time stamps in the Multimodal Data SDK.

    Each entry is (start, end, feature); only the feature (index 2) is kept.

    Returns:
        A numpy array of shape (n_steps, feature_dim).
    """
    return np.array([feature[2] for feature in segment_data])


class ProcessedDataset(Dataset):
    """The class object for processed data, pipelined from CMU-MultimodalDataSDK
    through MultimodalDataset.

    Modality tensors are assumed to be laid out (seq_len, n_examples, dim) —
    examples are indexed along axis 1 (see `__len__`/`__getitem__`).
    """

    def __init__(self, audio, visual, text, labels):
        self.audio = audio    # (seq_len, n_examples, audio_dim)
        self.visual = visual  # (seq_len, n_examples, visual_dim)
        self.text = text      # (seq_len, n_examples, text_dim)
        self.labels = labels  # (n_examples, ...)

    def __len__(self):
        """Checks the number of data points are the same across different
        modalities, and return length."""
        assert self.audio.shape[1] == self.visual.shape[1] \
            and self.visual.shape[1] == self.text.shape[1] \
            and self.text.shape[1] == self.labels.shape[0]
        return self.audio.shape[1]

    def __getitem__(self, idx):
        """Returns the target element by index."""
        # NOTE(review): source was truncated mid-return here; the tail is
        # reconstructed symmetrically with the other modalities — confirm
        # against the original repository.
        return [self.audio[:, idx, :], self.visual[:, idx, :],
                self.text[:, idx, :], self.labels[idx]]