class DataLoader(object):
r"""
Data loader. Combines a dataset and a sampler, and provides
single- or multi-process iterators over the dataset.
Arguments:
dataset (Dataset): dataset from which to load the data.
batch_size (int, optional): how many samples per batch to load
(default: 1).
shuffle (bool, optional): set to ``True`` to have the data reshuffled
at every epoch (default: False).
sampler (Sampler, optional): defines the strategy to draw samples from
the dataset. If specified, ``shuffle`` must be False.
batch_sampler (Sampler, optional): like sampler, but returns a batch of
indices at a time. Mutually exclusive with batch_size, shuffle,
sampler, and drop_last.
num_workers (int, optional): how many subprocesses to use for data
loading. 0 means that the data will be loaded in the main process.
(default: 0)
collate_fn (callable, optional): merges a list of samples to form a mini-batch.
pin_memory (bool, optional): If ``True``, the data loader will copy tensors
into CUDA pinned memory before returning them.
drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch size, then the last batch
will be smaller. (default: False)
timeout (numeric, optional): if positive, the timeout value for collecting a batch
from workers. Should always be non-negative. (default: 0)
worker_init_fn (callable, optional): If not None, this will be called on each
worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
input, after seeding and before data loading. (default: None)
.. note:: By default, each worker will have its PyTorch seed set to
``base_seed + worker_id``, where ``base_seed`` is a long generated
by main process using its RNG. However, seeds for other libraies
may be duplicated upon initializing workers (w.g., NumPy), causing
each worker to return identical random numbers. (See
:ref:`dataloader-workers-random-seed` section in FAQ.) You may
use ``torch.initial_seed()`` to access the PyTorch seed for each
worker in :attr:`worker_init_fn`, and use it to set other seeds
before data loading.
.. warning:: If ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an
unpicklable object, e.g., a lambda function.
"""
__initialized = False
def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
num_workers=0, collate_fn=default_collate, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None):
self.dataset = dataset
self.batch_size = batch_size
self.num_workers = num_workers
self.collate_fn = collate_fn
self.pin_memory = pin_memory
self.drop_last = drop_last
self.timeout = timeout
self.worker_init_fn = worker_init_fn
if timeout < 0:
raise ValueError('timeout option should be non-negative')
if batch_sampler is not None:
if batch_size > 1 or shuffle or sampler is not None or drop_last:
raise ValueError('batch_sampler option is mutually exclusive '
'with batch_size, shuffle, sampler, and '
'drop_last')
self.batch_size = None
self.drop_last = None
if sampler is not None and shuffle:
raise ValueError('sampler option is mutually exclusive with '
'shuffle')
if self.num_workers < 0:
raise ValueError('num_workers option cannot be negative; '
'use num_workers=0 to disable multiprocessing.')
if batch_sampler is None:
if sampler is None:
if shuffle:
sampler = RandomSampler(dataset)
else:
sampler = SequentialSampler(dataset)
batch_sampler = BatchSampler(sampler, batch_size, drop_last)
self.sampler = sampler
self.batch_sampler = batch_sampler
self.__initialized = True
def __setattr__(self, attr, val):
if self.__initialized and attr in ('batch_size', 'sampler', 'drop_last'):
raise ValueError('{} attribute should not be set after {} is '
'initialized'.format(attr, self.__class__.__name__))
super(DataLoader, self).__setattr__(attr, val)
def __iter__(self):
return _DataLoaderIter(self)
def __len__(self):
return len(self.batch_sampler)
数据加载器,结合了数据集和取样器,并且可以提供多个线程处理数据集。
在训练模型时使用到此函数,用来把训练数据分成多个小组,此函数每次抛出一组数据。直至把所有的数据都抛出。就是做一个数据的初始化。
此函数的参数:
dataset:包含所有数据的数据集
batch_size :每一小组所包含数据的数量
Shuffle : 是否打乱数据位置,当为Ture时打乱数据,全部抛出数据后再次dataloader时重新打乱。
sampler : 自定义从数据集中采样的策略,如果制定了采样策略,shuffle则必须为False.
Batch_sampler:和sampler一样,但是每次返回一组的索引,和batch_size, shuffle, sampler, drop_last 互斥。
num_workers : 使用线程的数量,当为0时数据直接加载到主程序,默认为0。
collate_fn:不太了解
pin_memory:s 是一个布尔类型,为T时将会把数据在返回前先复制到CUDA的固定内存中
drop_last:布尔类型,为T时将会把最后不足batch_size的数据丢掉,为F将会把剩余的数据作为最后一小组。
timeout:默认为0。当为正数的时候,这个数值为时间上限,每次取一个batch超过这个值的时候会报错。此参数必须为正数。
worker_init_fn:和进程有关系,暂时用不到
应用实例:
'''
批训练:把数据分为一小批一小批进行训练
Dataloader就是用来包装使用的数据,
比如说该程序中把数据5个5个的打包,
每一次抛出一组数据进行操作。
'''
import torch
import torch.utils.data as Data
torch.manual_seed(1)
BATCH_SIZE = 5
x = torch.linspace(1,10,10)
y = torch.linspace(10,1,10)
torch_dataset = Data.TensorDataset(x,y) #把数据放在数据库中
loader = Data.DataLoader(
# 从dataset数据库中每次抽出batch_size个数据
dataset=torch_dataset,
batch_size=BATCH_SIZE,
shuffle=True,#将数据打乱
num_workers=2, #使用两个线程
)
def show_batch():
for epoch in range(3): #对全部数据进行3次训练
for step,(batch_x,batch_y) in enumerate(loader): # 每一次挑选出来的size个数据
# training
# 打印出来,观察数据
print('Epoch:',epoch,'|Step:',step,'|batch x:',
batch_x.numpy(),'|batch y:',batch_y.numpy())
if __name__ == '__main__':
show_batch()
结果:
Epoch: 0 |Step: 0 |batch x: [ 5. 7. 10. 3. 4.] |batch y: [6. 4. 1. 8. 7.]
Epoch: 0 |Step: 1 |batch x: [2. 1. 8. 9. 6.] |batch y: [ 9. 10. 3. 2. 5.]
Epoch: 1 |Step: 0 |batch x: [ 4. 6. 7. 10. 8.] |batch y: [7. 5. 4. 1. 3.]
Epoch: 1 |Step: 1 |batch x: [5. 3. 2. 1. 9.] |batch y: [ 6. 8. 9. 10. 2.]
Epoch: 2 |Step: 0 |batch x: [ 4. 2. 5. 6. 10.] |batch y: [7. 9. 6. 5. 1.]
Epoch: 2 |Step: 1 |batch x: [3. 9. 1. 8. 7.] |batch y: [ 8. 2. 10. 3. 4.]
在使用的时候遇到了一些小问题,当num_workers不为0的时候,也就是用多个线程处理数据的时候,直接运行会报错。使用
if __name__ == '__main__': 就可以拯救,虽然不知道具体的机理,但是大概的原因就是因为使用了多线程,需要同时启动多个线程同时工作。