使用 DALI 加速数据加载的前提是 GPU 还没有跑满;如果 GPU 已经满载,DALI 带来的提升就不大。
另外可以配合 apex 做混合精度训练。
5、tf ,mxnet ,caffe, 数据输入,转换成 pytorch 训练
PyTorch 训练时通常直接读取图片;如果数据是 MXNet 的 rec 文件或 Caffe 的 LMDB,就使用 DALI 官方提供的对应 reader。
找这个办法时,在网上很容易先搜到 ExternalSource 自定义输入接口;照着写好之后,速度却根本没有提升。最后找到的是官方文档 "Using PyTorch DALI plugin: using various readers"。
别人写的 DALI_pytorch_demo 速度也不错:
https://github.com/yaysummeriscoming/DALI_pytorch_demo
已知问题:PyTorch + DALI + apex 训练很慢,见 DALI issue #1834 "Training very slow using dali+apex on imagenet with v100"。
例如人脸识别的 rec 数据,可以直接用 DALI 读取 rec 文件训练,不需要先转换成图片或 numpy。
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3' # keep this first: selects which GPU is visible before DALI is imported
import os.path
# MXNet RecordIO
# Directory containing train.rec / train.idx (read by MXNetReaderPipeline).
# os.path.join with a single argument returns that argument unchanged.
db_folder = os.path.join("/nfs-data/xingwg/data/Glint360k/celeb_deepglint/")
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
class CommonPipeline(Pipeline):
    """Reader-agnostic part of the pipeline: decode, random mirror, normalize.

    Subclasses create the reader operator and call ``base_define_graph``
    from their own ``define_graph``.
    """

    def __init__(self, batch_size, num_threads, device_id):
        """Create the shared operators.

        All three arguments are forwarded to ``Pipeline.__init__`` unchanged.
        """
        super(CommonPipeline, self).__init__(batch_size, num_threads, device_id)

        # Decoder for the encoded images coming from the reader, RGB output.
        self.decode = ops.ImageDecoder(output_type=types.RGB, device="mixed")

        # Created but not used by base_define_graph (the resize call there is
        # commented out in the original).
        self.resize = ops.Resize(interp_type=types.INTERP_LINEAR, device="gpu")

        # Per-channel (x - 127.5) / 127.5; no crop size is configured.
        channel_mean = [127.5, 127.5, 127.5]
        channel_std = [127.5, 127.5, 127.5]
        self.cmn = ops.CropMirrorNormalize(
            device="gpu",
            dtype=types.FLOAT,
            mean=channel_mean,
            std=channel_std,
        )

        # Source of the per-sample mirror flag (probability 0.5).
        self.coin = ops.CoinFlip(probability=0.5)

    def base_define_graph(self, inputs, labels):
        """Wire decode -> crop/mirror/normalize and pass ``labels`` through.

        Args:
            inputs: reader output holding the encoded images.
            labels: reader output holding the labels; returned untouched.

        Returns:
            Tuple ``(normalized_images, labels)``.
        """
        mirror_flag = self.coin()
        decoded = self.decode(inputs)
        normalized = self.cmn(decoded, mirror=mirror_flag)
        return (normalized, labels)
class MXNetReaderPipeline(CommonPipeline):
    """CommonPipeline fed by an MXNet RecordIO reader (train.rec / train.idx)."""

    def __init__(self, batch_size, num_threads, device_id, num_gpus):
        """Create the sharded RecordIO reader.

        Args:
            batch_size, num_threads, device_id: forwarded to CommonPipeline.
            num_gpus: total number of shards; ``device_id`` selects the shard.
        """
        super(MXNetReaderPipeline, self).__init__(batch_size, num_threads, device_id)

        # Paths are built by plain concatenation, so the module-level
        # db_folder is expected to end with a path separator.
        rec_file = db_folder + "train.rec"
        idx_file = db_folder + "train.idx"
        self.input = ops.MXNetReader(
            path=[rec_file],
            index_path=[idx_file],
            random_shuffle=True,
            shard_id=device_id,
            num_shards=num_gpus,
        )

    def define_graph(self):
        """Read (image, label) pairs and hand them to the shared graph."""
        # The reader is named "Reader" so callers can query epoch_size("Reader").
        encoded, labels = self.input(name="Reader")
        return self.base_define_graph(encoded, labels)
if __name__ == "__main__":
    # Smoke test / timing loop: run a few batches per epoch through each
    # pipeline type and sanity-check the labels that come out.
    import numpy as np
    import time
    from nvidia.dali.plugin.pytorch import DALIGenericIterator

    N = 1  # number of GPUs
    BATCH_SIZE = 512  # batch size per GPU
    ITERATIONS = 32  # batches consumed per epoch before breaking out
    EPOCHS = 3

    # Each entry: [pipeline class, (min label, max label)].
    pipe_types = [[MXNetReaderPipeline, (0, 999)]]

    for pipe_name, label_range in pipe_types:
        print("RUN: " + pipe_name.__name__)
        pipes = [
            pipe_name(batch_size=BATCH_SIZE, num_threads=8,
                      device_id=device_id, num_gpus=N)
            for device_id in range(N)
        ]
        # epoch_size() is queried on the first pipeline, so build it first.
        pipes[0].build()
        dali_iter = DALIGenericIterator(pipes, ['data', 'label'],
                                        pipes[0].epoch_size("Reader"))

        for epoch in range(EPOCHS):
            t1 = time.time()
            for i, data in enumerate(dali_iter):
                if i >= ITERATIONS:
                    break
                # Per the original author's note the label tensor is (n, 2)
                # with the class id in column 0 -- TODO confirm for your data.
                print(data[0]["label"].shape)
                # Image batch; presumably NCHW -- TODO confirm.
                print(data[0]["data"].shape)

                # Testing correctness of labels (one dict per pipeline/GPU).
                for d in data:
                    label = d["label"]
                    print(label[:, 0])
                    # labels need to be integers
                    assert np.equal(np.mod(label, 1), 0).all()
                    # labels need to be inside label_range
                    assert (label >= label_range[0]).all()
                    assert (label <= label_range[1]).all()
            else:
                # Only reached when the iterator was exhausted (no break):
                # a finished DALI iterator must be reset before it can be
                # iterated again, otherwise later epochs yield nothing.
                dali_iter.reset()
            print("time", time.time() - t1)

        # Original author's measurement: about 0.5 s here versus about 32 s
        # for the plain PyTorch loader under the same conditions.
        print("OK : " + pipe_name.__name__)
1、【Pytorch】nvidia-dali——一种加速数据增强的方法
https://blog.csdn.net/weixin_42028608/article/details/105564060
2、官方自定义数据接口使用介绍
`
import types
import collections
import numpy as np
from random import shuffle
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
# NOTE: this rebinds the name `types`; the stdlib module imported on the first
# line of this snippet is shadowed by nvidia.dali.types from here on.
import nvidia.dali.types as types
# Number of samples the external iterator produces per call.
batch_size = 4
class ExternalInputIterator(object):
    """Endless data source that serves one encoded image file over and over.

    Every call to ``__next__`` returns ``batch_size`` copies of the raw
    (still encoded) bytes of ``image_path`` plus a constant label ``1``.
    It never raises ``StopIteration``.
    """

    # Path used by the original demo; kept as the default for compatibility.
    DEFAULT_IMAGE_PATH = "/mnt/yunle/byebye/coco28/images/train/0.jpg"

    def __init__(self, batch_size, image_path=DEFAULT_IMAGE_PATH):
        """Store the batch size and the file to serve.

        Args:
            batch_size: number of samples returned by each ``__next__`` call.
            image_path: encoded image file whose bytes form every sample.
        """
        self.batch_size = batch_size
        self.image_path = image_path
        # Initialised here as well so __next__ works even without iter().
        self.i = 0
        self.n = 1

    def __iter__(self):
        """Reset the cursor and return ``self``."""
        self.i = 0
        self.n = 1
        return self

    def __next__(self):
        """Return ``(batch, labels)``: two lists of uint8 numpy arrays.

        Raises:
            OSError: if ``image_path`` cannot be opened.
        """
        # Read once per batch and close the handle deterministically.
        with open(self.image_path, 'rb') as f:
            encoded = f.read()

        batch = []
        labels = []
        for _ in range(self.batch_size):
            batch.append(np.frombuffer(encoded, dtype=np.uint8))
            labels.append(np.array([1], dtype=np.uint8))
            self.i = (self.i + 1) % self.n
        return (batch, labels)
# Module-level data source; passed to ExternalSourcePipeline when the demo pipeline is created.
eii = ExternalInputIterator(batch_size)
class ExternalSourcePipeline(Pipeline):
    """Demo pipeline: external source -> image decode -> contrast boost."""

    def __init__(self, batch_size, eii, num_threads, device_id):
        """Create the operators.

        Args:
            batch_size, num_threads, device_id: forwarded to ``Pipeline``.
            eii: iterable yielding ``(encoded_images, labels)`` batches; it is
                handed to ``ops.ExternalSource`` as its ``source``.
        """
        super(ExternalSourcePipeline, self).__init__(
            batch_size, num_threads, device_id, seed=12
        )
        # Two outputs per iteration: encoded images and labels.
        self.source = ops.ExternalSource(num_outputs=2, source=eii)
        self.decode = ops.ImageDecoder(output_type=types.RGB, device="mixed")
        self.enhance = ops.BrightnessContrast(contrast=2, device="gpu")

    def define_graph(self):
        """Return ``(enhanced_images, labels)``."""
        encoded, labels = self.source()
        decoded = self.decode(encoded)
        enhanced = self.enhance(decoded)
        return (enhanced, labels)
# Smoke test: build the pipeline on device 0 and pull exactly one batch.
pipe = ExternalSourcePipeline(
    batch_size=batch_size,
    eii=eii,
    num_threads=2,
    device_id=0,
)
pipe.build()
pipe_out = pipe.run()
print('Success!')
`
或者别人写好的一个数据加载Dali 模块
https://blog.csdn.net/u014365862/article/details/104412294
from __future__ import division
import torch
import types
import joblib
import collections
import numpy as np
import pandas as pd
from random import shuffle
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import nvidia.dali.plugin.pytorch as dalitorch
from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
def grid2x2(img):
    """Split an ``H x W x C`` image into its four quadrants.

    The split point is ``H // 2`` / ``W // 2``, so for odd sizes the bottom
    and right quadrants receive the extra row / column. The results are
    slices (views) of ``img``, not copies.

    Args:
        img: 3-D array-like exposing ``.shape`` and slicing, laid out HWC.

    Returns:
        Tuple ``(left_top, right_top, left_bottom, right_bottom)``.

    Raises:
        ValueError: if ``img.shape`` does not have exactly three entries.
    """
    h, w, _ = img.shape
    mid_h, mid_w = h // 2, w // 2
    left_top = img[0:mid_h, 0:mid_w, :]
    left_bottom = img[mid_h:h, 0:mid_w, :]
    right_top = img[0:mid_h, mid_w:w, :]
    right_bottom = img[mid_h:h, mid_w:w, :]
    # The original returned left_bottom twice and never right_bottom.
    return left_top, right_top, left_bottom, right_bottom
class ExternalInputIterator(object):
    """Sharded source of ``(encoded image bytes, label)`` batches.

    ``txt_path`` lists one sample per line as ``<relative file name>,<label>``.
    The list is cut into ``num_gpus`` contiguous shards and this instance
    serves shard ``device_id``.
    """

    def __init__(self, images_dir, txt_path, batch_size, device_id, num_gpus):
        """Load the file list and select this device's shard.

        Args:
            images_dir: prefix concatenated (plain ``+``) with each file name,
                so it should end with a path separator.
            txt_path: text file with ``name,label`` lines.
            batch_size: number of samples returned per ``__next__`` call.
            device_id: index of the shard to serve.
            num_gpus: total number of shards.
        """
        self.images_dir = images_dir
        self.batch_size = batch_size
        with open(txt_path, 'r') as f:
            # Drop blank / whitespace-only lines. The original `line is not ''`
            # was an identity test that never filtered anything.
            self.files = [line.rstrip() for line in f if line.strip()]
        # whole data set size
        self.data_set_len = len(self.files)
        # based on the device_id and total number of GPUs - world size
        # get proper shard
        shard_begin = self.data_set_len * device_id // num_gpus
        shard_end = self.data_set_len * (device_id + 1) // num_gpus
        self.files = self.files[shard_begin:shard_end]
        self.n = len(self.files)
        # Defined up front so __next__ is usable even before iter() is called.
        self.i = 0

    def __iter__(self):
        """Rewind to the start of the shard and reshuffle it in place."""
        self.i = 0
        shuffle(self.files)
        return self

    def __next__(self):
        """Return ``(batch, labels)``: two lists of uint8 numpy arrays.

        Raises:
            StopIteration: only when the shard is empty. The cursor wraps
                around modulo the shard length, so a non-empty shard is
                served endlessly and a batch may restart from the beginning.
            ValueError: if a line does not split into exactly two fields on
                ``','`` or the label is not an integer.
            OSError: if an image file cannot be opened.
        """
        if self.i >= self.n:
            raise StopIteration

        batch = []
        labels = []
        for _ in range(self.batch_size):
            jpeg_filename, label = self.files[self.i].split(',')
            # `with` closes the handle; the original leaked one per sample.
            with open(self.images_dir + jpeg_filename, 'rb') as f:
                batch.append(np.frombuffer(f.read(), dtype=np.uint8))
            # NOTE(review): uint8 limits labels to 0..255 -- confirm this is
            # enough for the data set in use.
            labels.append(np.array([int(label)], dtype=np.uint8))
            self.i = (self.i + 1) % self.n
        return (batch, labels)

    @property
    def size(self):
        """Length of the whole file list (before sharding)."""
        return self.data_set_len

    # Python 2 style alias kept for callers that invoke ``.next()``.
    next = __next__
class ExternalSourcePipeline(Pipeline):
    """Pipeline fed by a Python iterator: decode, split into a 2x2 grid, resize.

    Produces the full image plus the four outputs of ``grid2x2``, each resized
    to ``resize x resize``, together with the labels.
    """

    def __init__(self, resize, batch_size, num_threads, device_id, external_data):
        """Create the operators and start iterating ``external_data``.

        Args:
            resize: target width and height passed to ``ops.Resize``.
            batch_size, num_threads, device_id: forwarded to ``Pipeline``.
            external_data: iterable yielding ``(images, labels)`` batches.
        """
        super(ExternalSourcePipeline, self).__init__(batch_size,
                                                     num_threads,
                                                     device_id,
                                                     seed=12,
                                                     exec_async=False,
                                                     exec_pipelined=False,
                                                     )
        self.input = ops.ExternalSource()
        self.input_label = ops.ExternalSource()
        self.decode = ops.ImageDecoder(device="cpu", output_type=types.RGB)
        # PythonFunction: exec_async and exec_pipelined need to be False, and
        # its input must be on the CPU (hence the CPU decoder above).
        self.grid = ops.PythonFunction(function=grid2x2, num_outputs=4)
        self.resize = ops.Resize(device="gpu",
                                 resize_x=resize,
                                 resize_y=resize,
                                 interp_type=types.INTERP_LINEAR)
        self.external_data = external_data
        self.iterator = iter(self.external_data)

    def define_graph(self):
        """Return ``(image, quad1, quad2, quad3, quad4, labels)``."""
        # Kept on self because iter_setup feeds data into these two nodes.
        self.jpegs = self.input()
        self.labels = self.input_label()
        images = self.decode(self.jpegs)
        images1, images2, images3, images4 = self.grid(images)
        # Move every CPU output to the GPU before the GPU resize.
        images = self.resize(images.gpu())
        images1 = self.resize(images1.gpu())
        images2 = self.resize(images2.gpu())
        images3 = self.resize(images3.gpu())
        images4 = self.resize(images4.gpu())
        return (images, images1, images2, images3, images4, self.labels)

    def iter_setup(self):
        """Feed the next ``(images, labels)`` batch into the external sources.

        Raises:
            StopIteration: when the data source is exhausted; the iterator is
                re-created first so the following epoch starts from scratch.
        """
        try:
            # Builtin next() works with any iterator; the original called the
            # Python 2 style ``.next()`` method, which plain iterators lack.
            images, labels = next(self.iterator)
        except StopIteration:
            self.iterator = iter(self.external_data)
            raise
        self.feed_input(self.jpegs, images)
        self.feed_input(self.labels, labels)
def create_dataloder(img_dir,
                     txt_path,
                     resize,
                     batch_size,
                     device_id=0,
                     num_gpus=1,
                     num_threads=6):
    """Build a DALI-backed PyTorch iterator over the images listed in ``txt_path``.

    Args:
        img_dir: directory prefix handed to ``ExternalInputIterator``.
        txt_path: text file listing the samples.
        resize: output width/height for every returned image.
        batch_size: samples per batch.
        device_id: GPU to run the pipeline on and shard index to read.
        num_gpus: total number of shards.
        num_threads: CPU worker threads for the pipeline.

    Returns:
        A ``PyTorchIterator`` whose batches expose the keys
        ``data0`` .. ``data4`` and ``label``.
    """
    eii = ExternalInputIterator(img_dir,
                                txt_path,
                                batch_size=batch_size,
                                device_id=device_id,
                                num_gpus=num_gpus)
    pipe = ExternalSourcePipeline(resize=resize,
                                  batch_size=batch_size,
                                  num_threads=num_threads,
                                  # The original hard-coded 0 here, silently
                                  # ignoring the caller's device_id.
                                  device_id=device_id,
                                  external_data=eii)
    # NOTE(review): eii.size is the length of the whole file list, not of the
    # selected shard -- confirm this is the intended epoch size when
    # num_gpus > 1.
    pii = PyTorchIterator(pipe,
                          output_map=["data0", "data1", "data2", "data3", "data4", "label"],
                          size=eii.size,
                          last_batch_padded=True,
                          fill_last_batch=False)
    return pii
if __name__ == '__main__':
    # Demo: iterate the loader for `epochs` epochs and print batch shapes.
    batch_size = 32
    num_gpus = 1
    num_threads = 8
    epochs = 1

    pii = create_dataloder(
        '/home/hanbing/hanbing_data/datasets/deepfake/train_videos/',
        txt_path='./txt/train_5.txt',
        resize=224,
        batch_size=batch_size,
    )

    for e in range(epochs):
        print('tttt', len(pii))
        for i, data in enumerate(pii):
            # data holds one dict per pipeline; only the first is inspected.
            first = data[0]
            imgs = first["data4"]
            labels = first["label"]
            print("epoch: {}, iter {}".format(e, i), imgs.shape, labels.shape)
        # Rewind the DALI iterator so the next epoch can run.
        pii.reset()
3、第 2 个例子以图片路径作为输入:先读取文件的原始字节,再交给 DALI 解码。
如果想直接输入已经解码好的数组(numpy / cupy),可以参考官方示例:
https://docs.nvidia.com/deeplearning/dali/user-guide/docs/examples/general/data_loading/external_input.html#Defining-the-data-source
import cupy as cp
import imageio
class ExternalInputGpuIterator(object):
    """Endless source of already-decoded image batches held in CuPy arrays.

    ``<images_dir>file_list.txt`` lists one sample per line as
    ``<file name> <label>`` (single space separator).
    """

    def __init__(self, batch_size, images_dir="../../data/images/"):
        """Load and shuffle the file list.

        Args:
            batch_size: number of samples returned per ``__next__`` call.
            images_dir: prefix concatenated (plain ``+``) with
                ``file_list.txt`` and with every image name, so it should end
                with a path separator.
        """
        self.images_dir = images_dir
        self.batch_size = batch_size
        with open(self.images_dir + "file_list.txt", 'r') as f:
            # Drop blank / whitespace-only lines. The original `line is not ''`
            # was an identity test that never filtered anything.
            self.files = [line.rstrip() for line in f if line.strip()]
        shuffle(self.files)
        # Defined up front so __next__ is usable even before iter() is called.
        self.i = 0
        self.n = len(self.files)

    def __iter__(self):
        """Rewind the cursor and return ``self``."""
        self.i = 0
        self.n = len(self.files)
        return self

    def __next__(self):
        """Return ``(batch, labels)``: two lists of CuPy arrays.

        The cursor wraps around modulo the list length, so this never raises
        ``StopIteration``.
        """
        batch = []
        labels = []
        for _ in range(self.batch_size):
            jpeg_filename, label = self.files[self.i].split(' ')
            im = cp.asarray(imageio.imread(self.images_dir + jpeg_filename))
            # Scale pixel values by 0.6, then cast back to uint8.
            im = im * 0.6
            batch.append(im.astype(cp.uint8))
            # Convert the label text explicitly instead of relying on the
            # array constructor to parse a string.
            labels.append(cp.array([int(label)], dtype=np.uint8))
            self.i = (self.i + 1) % self.n
        return (batch, labels)
4、补充说明
常用到的数据接口:
自己实现的数据增强函数,可以通过下面的算子包装到 DALI 中:
ops.PythonFunction:输入是 numpy 数组
dalitorch.TorchPythonFunction(来自 nvidia.dali.plugin.pytorch):输入是 torch 张量
ops.ImageDecoder(device="mixed", output_type=types.RGB) # 输入是已读入内存的编码图片字节(不是图片路径),在这里解码,输出为 HWC
ops.NumpyReader:读取保存为 .npy 文件的数组
# Subtract the mean and divide by the standard deviation: (input - mean) / std
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT,
output_layout=types.NCHW,
image_type=types.RGB,
mean=[0.5 * 255, 0.5 * 255, 0.5 * 255],
std=[0.5 * 255, 0.5 * 255, 0.5 * 255])
# Alternative per-channel statistics (kept from the original, disabled):
# mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
# std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
# Per the original note, the output is: out = scale * (in - mean) / stddev + shift
self.normalize = ops.Normalize(mean=0.5, stddev=0.5, device='gpu')