简介
最近都是看图像里边的语义分割部分内容,比较有趣,同时入门Pytorch。Pytorch的主要特点是基本上所有操作都是用类来进行封装,本身自带很多类,而且你也可以根据官方的类进行修改。
数据导入在 PyTorch 中由好几个类协同实现,分别是 Dataset、DataLoader 以及(内部的)迭代器类等。
Dataset 定义如何读取单个样本(路径、标签与总数),DataLoader 则负责批量、打乱与并行读取。
1. ImageFolder + DataLoader
首先我的数据是存在data_dir里边,每个子文件夹作为一类。
# Minimal ImageFolder pipeline: each sub-directory of data_dir is one class.
# Fix: the snippet used `torch` and `datasets` without importing them.
import torch
from torchvision import datasets, transforms

data_dir = '/Ryoma/data/'

transform = transforms.Compose([
    # you can add other transformations in this list
    transforms.ToTensor()
])

# ImageFolder scans data_dir and maps each sub-folder name to a class index.
train_sets = datasets.ImageFolder(data_dir, transform)
train_loader = torch.utils.data.DataLoader(train_sets, batch_size=10,
                                           shuffle=True, num_workers=4)
print(train_loader)
# Pull one batch: inputs is a (10, C, H, W) tensor, classes a (10,) index tensor.
inputs, classes = next(iter(train_loader))
# Visualize a few images
def imshow(inp, title=None):
    """Display the first image of a batch tensor with matplotlib.

    Args:
        inp: batch tensor, presumably (N, C, H, W) — only inp[0] is shown.
        title: optional plot title.
    """
    # Fix: report the shape of the argument, not the unrelated global `inputs`.
    print(inp.shape)
    inp = inp[0]
    # CHW -> HWC, the layout plt.imshow expects.
    inp = inp.numpy().transpose((1, 2, 0))
    # If the transform pipeline normalized the images, undo it before display:
    # mean = np.array([0.485, 0.456, 0.406])
    # std = np.array([0.229, 0.224, 0.225])
    # inp = std * inp + mean
    plt.imshow(inp)
    if title is not None:
        plt.title(title)

imshow(inputs)
划分数据集
如果需要对数据集进行划分,可以采用以下方法:
# Split one dataset's index range into train/validation and build samplers.
num_train = len(train_dataset)
indices = list(range(num_train))
# valid_size is a fraction in [0, 1]; split = number of validation samples.
split = int(np.floor(valid_size * num_train))

# Fix: the shuffle block had lost its indentation (IndentationError as pasted).
if shuffle:
    # Fixed seed keeps the split reproducible across runs.
    np.random.seed(random_seed)
    np.random.shuffle(indices)

train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# A sampler is mutually exclusive with shuffle=, so shuffle is not passed here.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, sampler=train_sampler,
    num_workers=num_workers, pin_memory=pin_memory,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=batch_size, sampler=valid_sampler,
    num_workers=num_workers, pin_memory=pin_memory,
)
2.Dataset + DataLoader
采用 Dataset + DataLoader 是更加通用灵活的方法。首先编写 Dataset 子类,使其能够读取单个样本;然后利用 DataLoader 进行批量读取。
import torch
from torch.utils import data
class Dataset(data.Dataset):
    """Map-style dataset: one pre-saved tensor per sample ID.

    Args:
        list_IDs: sequence of sample identifiers; sample i is loaded from
            ``<data_dir><ID>.pt``.
        labels: mapping from ID to its label.
        data_dir: directory prefix for the ``.pt`` files. Generalized from
            the previously hard-coded ``'data/'``; the default preserves
            the old behavior.
    """

    def __init__(self, list_IDs, labels, data_dir='data/'):
        self.labels = labels
        self.list_IDs = list_IDs
        self.data_dir = data_dir

    def __len__(self):
        """Denotes the total number of samples."""
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Lazily load one (tensor, label) pair from disk."""
        ID = self.list_IDs[index]
        X = torch.load(self.data_dir + ID + '.pt')
        y = self.labels[ID]
        return X, y
然后,
import torch
from torch.utils import data
from torch.backends import cudnn  # fix: cudnn was used without being imported
from my_classes import Dataset

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
cudnn.benchmark = True

# Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 100

# Datasets: fill these in with your own sample IDs and labels.
# Fix: `partition = # IDs` / `labels = # Labels` were syntax errors.
partition = {'train': [], 'validation': []}
labels = {}

# Generators
training_set = Dataset(partition['train'], labels)
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset(partition['validation'], labels)
validation_generator = data.DataLoader(validation_set, **params)

# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)
        # Model computations go here.
        ...

    # Validation: gradients are not needed while evaluating.
    with torch.no_grad():
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)
            # Model computations go here.
            ...
3. 其他 读取CSV文件
# -*- coding: utf-8 -*-
import csv
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
class FaceLandmarksDataset(data.Dataset):
    """Face Landmarks dataset streamed from a (large) CSV file.

    Args:
        csv_file (string): Path to the csv file with annotations.

    Note: ``iterator=True`` returns a streaming reader, so ``__getitem__``
    ignores ``idx`` and simply returns the next chunk of 128 rows.
    """

    def __init__(self, csv_file):
        self.landmarks_frame = pd.read_csv(csv_file, iterator=True)

    def __len__(self):
        # The true row count is unknown without scanning the whole file;
        # this hard-coded value is an upper bound chosen by the author.
        return 1800000

    def __getitem__(self, idx):
        # Fixes: Python 2 `print idx`, and DataFrame.as_matrix(), which was
        # removed from pandas — use to_numpy() instead.
        print(idx)
        landmarks = self.landmarks_frame.get_chunk(128).to_numpy().astype('float')
        return landmarks
# Example usage: stream the CSV in chunks through a DataLoader.
filename = '/media/czn/e04e3ecf-cf63-416c-afd7-6d737e09968a/zhongkeyuan/dataset/CSV/HGG_pandas.csv'
dataset = FaceLandmarksDataset(filename)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
# Fix: Python 2 print statement; `batch` avoids shadowing the `data` module.
for batch in train_loader:
    print(batch)
或者是利用 pd.read_csv 返回的迭代器(chunksize / iterator 参数)分块读取。
import torch
from torch.utils import data
import pandas as pd
class MyDataset(data.Dataset):
    """Chunked CSV dataset: every item is one chunk of ``chunkSize`` rows.

    The last column of each row is the label; all preceding columns are
    the features.
    """

    def __init__(self, csv_path, chunkSize):
        self.chunksize = chunkSize
        # header=None: the file has no header row; iterator=True streams it.
        self.reader = pd.read_csv(csv_path, sep=',', chunksize=self.chunksize, header=None, iterator=True)

    def __len__(self):
        # NOTE(review): this reports one chunk's row count, not the number
        # of chunks in the file — kept as-is for compatibility.
        return self.chunksize

    def __getitem__(self, index):
        # The streaming reader ignores `index`: each call yields the next chunk.
        chunk = self.reader.get_chunk(self.chunksize)
        tensorData = torch.as_tensor(chunk.values, dtype=torch.float32)
        inputs = tensorData[:, :-1]
        # Fix: the label column was hard-coded as 99 (only correct for files
        # with exactly 100 columns); -1 selects the last column for any width.
        labels = tensorData[:, -1]
        return inputs, labels
def main():
    """Drive MyDataset through a DataLoader and print each batch."""
    batch_size = 100
    kwargs = {}
    custom_data_from_csv = MyDataset('data/mydata.txt', batch_size)
    train_loader = data.DataLoader(dataset=custom_data_from_csv, batch_size=batch_size, shuffle=True, **kwargs)
    # Fix: the original `for inputData, target in enumerate(train_loader)`
    # unpacked (batch_index, batch) — `inputData` was the integer index and
    # `target` the whole (inputs, labels) pair. Iterate the loader directly.
    for inputData, target in train_loader:
        print(inputData)
        print(target)

if __name__ == '__main__':
    main()
4. TensorDataset
class TensorDataset(Dataset[Tuple[Tensor, ...]]):
    r"""Dataset wrapping tensors.

    Each sample is produced by indexing every wrapped tensor along its
    first dimension, so all tensors must agree on that dimension's size.

    Arguments:
        *tensors (Tensor): tensors that have the same size of the first dimension.
    """
    tensors: Tuple[Tensor, ...]

    def __init__(self, *tensors: Tensor) -> None:
        reference_len = tensors[0].size(0)
        assert all(t.size(0) == reference_len for t in tensors)
        self.tensors = tensors

    def __getitem__(self, index):
        # One slice per wrapped tensor, packed into a tuple.
        sample = []
        for t in self.tensors:
            sample.append(t[index])
        return tuple(sample)

    def __len__(self):
        # All tensors share the same first-dimension length.
        return self.tensors[0].size(0)
5. 流式读取文件
import glob
import random

import tensorflow as tf
import torch
class TfRecordDataset(torch.utils.data.IterableDataset):
    """Stream tf.train.Example records from TFRecord files.

    Work is sharded across DataLoader workers: worker ``i`` of ``n`` reads
    every ``n``-th file starting at offset ``i``.

    Args:
        tfrecord_path: glob pattern matching the TFRecord files.
        buffer_size: stored for callers that want it. Fix: the original
            assigned an undefined ``buffer_size`` name (NameError); it is
            now an optional parameter, defaulting to None.
    """

    def __init__(self, tfrecord_path, buffer_size=None):
        self.buffer_size = buffer_size
        self.tfrecord_list = sorted(glob.glob(tfrecord_path))
        super(TfRecordDataset, self).__init__()

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # Single-process loading: this process reads every file.
            tfrecord_list = self.tfrecord_list
        else:
            # Shard the file list so each worker reads a disjoint subset.
            worker_id = worker_info.id
            num_workers = worker_info.num_workers
            tfrecord_list = self.tfrecord_list[worker_id::num_workers]
            # Per-worker seed keeps downstream randomness decorrelated.
            random.seed(worker_info.seed)

        def tf_record_iterator():
            for filename in tfrecord_list:
                record_iterator = tf.python_io.tf_record_iterator(
                    filename, tf.python_io.TFRecordCompressionType.NONE)
                for string_record in record_iterator:
                    example = tf.train.Example()
                    example.ParseFromString(string_record)
                    # NOTE(review): do_stuff_with_example is not defined in
                    # this snippet — it must be supplied by surrounding code.
                    do_stuff_with_example(example)
                    yield example

        return tf_record_iterator()
6. 注意
DataLoader 中的 shuffle=True 只在创建迭代器时生效:每次开始迭代(即每个 epoch 开始)之前,都会重新打乱一遍数据的顺序。
参考: