pytorch提供了一个数据读取的方法,其由两个类构成:
(1)torch.utils.data.Dataset
(2)torch.utils.data.DataLoader
自定义类的说明
在定义自己的数据类时,需要继承torch.utils.data.Dataset,并且至少要重载两个方法：__len__ 和 __getitem__,其中
(1)__len__返回的是数据集的大小
(2)__getitem__实现索引数据集中的某一个数据
# 一个自定义类示例
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

# NOTE(review): `io.imread` in MyDataset.__getitem__ comes from scikit-image
# (`from skimage import io`) — a third-party dependency; confirm it is
# installed and imported before running this example.
class MyDataset(Dataset):  # subclasses torch.utils.data.Dataset
    """Face-landmarks dataset backed by a CSV annotation file.

    Each CSV row holds an image file name (column 0) followed by the
    landmark coordinates in the remaining columns.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __getitem__(self, idx):
        """Load and return the sample at position ``idx``.

        Samples are read lazily, one at a time, so the whole dataset
        never has to be resident in memory at once.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Image path = root dir joined with the file name in column 0.
        image = io.imread(
            os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]))
        # Columns 1..N are the annotation coordinates; cast to float and
        # reshape into (num_landmarks, 2) pairs.
        landmarks = (np.array([self.landmarks_frame.iloc[idx, 1:]])
                     .astype('float')
                     .reshape(-1, 2))
        sample = {'image': image, 'landmarks': landmarks}
        if self.transform:
            sample = self.transform(sample)
        return sample

    def __len__(self):
        # Dataset size == number of annotation rows in the CSV.
        return len(self.landmarks_frame)
# Demo: iterate over the whole dataset and print each sample's shapes.
# Fix: the class defined above is named MyDataset — the original code
# instantiated an undefined name (FaceLandmarksDataset), which raises
# NameError at runtime.
face_dataset = MyDataset(csv_file='data/faces/face_landmarks.csv', root_dir='data/faces/')
for i in range(len(face_dataset)):
    sample = face_dataset[i]
    print(i, sample['image'].shape, sample['landmarks'].shape)