PyTorch DALI 加速：DALI 支持的数据处理列表，以及将 MXNet / TensorFlow / Caffe 格式的数据读取后用于 PyTorch 训练

最新推荐文章于 2024-06-07 09:55:26 发布

贝猫说python

最新推荐文章于 2024-06-07 09:55:26 发布

阅读量1.5k

点赞数 2

本文链接：https://blog.csdn.net/m0_37192554/article/details/111478302

版权

使用dali加速，前提是 gpu没有跑满，不然效果也不大

+apex 混合精度训练

5、tf ,mxnet ,caffe, 数据输入，转换成 pytorch 训练

https://github.com/NVIDIA/DALI/blob/master/docs/examples/frameworks/pytorch/pytorch-various-readers.ipynb

	PyTorch 训练通常直接读取图片；如果数据是 MXNet 的 rec 格式或 Caffe 的 lmdb 格式，就使用 DALI 官方提供的读取器。
	我就是在找这个办法：网上一不小心就会找到 External 自定义接口，写好之后速度却根本没有提升，最后才找到这里——Using PyTorch DALI plugin: using various readers
	
	别人写的 DALI_pytorch_demo， 速度可以
	https://github.com/yaysummeriscoming/DALI_pytorch_demo

问题：pytorch + dali + apex 训练很慢，参见 DALI 的 issue #1834 “Training very slow using dali+apex on imagenet with v100”

例如人脸识别的 rec 数据可以直接用 DALI 读取并训练，不用先转换成图片或者 numpy

import os
# Expose only GPU 3 to this process. The original note says this line must
# come first, i.e. ahead of the DALI imports below.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import os.path
# MXNet RecordIO: directory holding the train.rec / train.idx files that
# MXNetReaderPipeline reads.
db_folder = os.path.join("/nfs-data/xingwg/data/Glint360k/celeb_deepglint/")
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types



class CommonPipeline(Pipeline):
    """Decode / mirror / normalize stage shared by reader-specific pipelines.

    Subclasses create their own reader operator and call
    :meth:`base_define_graph` from ``define_graph``.
    """

    def __init__(self, batch_size, num_threads, device_id):
        """Create the image operators.

        Args:
            batch_size: forwarded to ``Pipeline.__init__``.
            num_threads: forwarded to ``Pipeline.__init__``.
            device_id: forwarded to ``Pipeline.__init__``.
        """
        super(CommonPipeline, self).__init__(batch_size, num_threads, device_id)

        # Image decoder created with device="mixed", producing RGB output.
        self.decode = ops.ImageDecoder(output_type=types.RGB, device="mixed")

        # Linear-interpolation resize operator. It is constructed here but
        # base_define_graph() does not apply it.
        self.resize = ops.Resize(interp_type=types.INTERP_LINEAR, device="gpu")

        # Normalization with mean 127.5 and std 127.5 on each of the three
        # channels, float output. No crop size is configured.
        self.cmn = ops.CropMirrorNormalize(
            device="gpu",
            dtype=types.FLOAT,
            mean=[127.5] * 3,
            std=[127.5] * 3,
        )

        # Per-sample coin flip (p = 0.5) used as the mirror flag.
        self.coin = ops.CoinFlip(probability=0.5)

    def base_define_graph(self, inputs, labels):
        """Build the common part of the graph.

        Args:
            inputs: encoded images coming from a reader operator.
            labels: labels coming from the same reader; returned unchanged.

        Returns:
            ``(normalized_images, labels)``.
        """
        mirror_flag = self.coin()
        decoded = self.decode(inputs)
        normalized = self.cmn(decoded, mirror=mirror_flag)
        return (normalized, labels)

class MXNetReaderPipeline(CommonPipeline):
    """Pipeline that reads an MXNet RecordIO data set from ``db_folder``.

    The reader is sharded: each pipeline instance reads shard ``device_id``
    out of ``num_gpus`` shards, with shuffling enabled.
    """

    def __init__(self, batch_size, num_threads, device_id, num_gpus):
        """Create the RecordIO reader.

        Args:
            batch_size: forwarded to ``CommonPipeline.__init__``.
            num_threads: forwarded to ``CommonPipeline.__init__``.
            device_id: GPU index; also used as the reader's shard id.
            num_gpus: total number of shards.
        """
        super(MXNetReaderPipeline, self).__init__(batch_size, num_threads, device_id)
        # os.path.join instead of string concatenation, so the paths stay
        # valid whether or not db_folder ends with a path separator.
        self.input = ops.MXNetReader(
            path=[os.path.join(db_folder, "train.rec")],
            index_path=[os.path.join(db_folder, "train.idx")],
            random_shuffle=True,
            shard_id=device_id,
            num_shards=num_gpus)

    def define_graph(self):
        """Read (image, label) pairs and run them through the shared graph."""
        # The reader is named "Reader" so callers can query epoch_size("Reader").
        images, labels = self.input(name="Reader")
        return self.base_define_graph(images, labels)


# Benchmark / smoke test: build the RecordIO pipeline, wrap it in a PyTorch
# iterator, time ITERATIONS batches for three epochs and sanity-check labels.
if __name__=="__main__":
    import numpy as np
    import time

    from nvidia.dali.plugin.pytorch import DALIGenericIterator
    N = 1  # number of GPUs
    BATCH_SIZE = 512  # batch size per GPU
    ITERATIONS = 32  # number of batches consumed per timed epoch
    IMAGE_SIZE = 3  # not used anywhere in this script

    # Each entry: [pipeline class, (min label, max label)]
    pipe_types = [[MXNetReaderPipeline, (0, 999)]]

    for pipe_t in pipe_types:
        pipe_name, label_range = pipe_t
        print("RUN: " + pipe_name.__name__)
        # One pipeline per GPU; device_id doubles as the shard id.
        pipes = [pipe_name(batch_size=BATCH_SIZE, num_threads=8, device_id=device_id, num_gpus=N) for device_id in
                 range(N)]
        # Only the first pipeline is built explicitly here, so that
        # epoch_size("Reader") can be queried on it.
        pipes[0].build()
        dali_iter = DALIGenericIterator(pipes, ['data', 'label'], pipes[0].epoch_size("Reader"))

        # NOTE(review): dali_iter is never reset between epochs; each "epoch"
        # just continues from where the previous one stopped.
        for epoch in range(3):
            t1 = time.time()
            for i, data in enumerate(dali_iter):
                if i >= ITERATIONS:
                    break
                # NOTE(review): the two original notes look swapped. The note
                # on the label line read "batch, NCHW" and the note on the data
                # line read "n x 2, first element of each row is the label".
                print(data[0]["label"].shape)
                print(data[0]["data"].shape)


                # Testing correctness of labels
                for d in data:
                    label = d["label"]
                    image = d["data"]
                    print(label[:,0])
                    ## labels need to be integers
                    assert(np.equal(np.mod(label, 1), 0).all())
                    ## labels need to lie within label_range (min, max)
                    assert((label >= label_range[0]).all())
                    assert((label <= label_range[1]).all())


            print("time", time.time() - t1)
        # Author's measurement: about 0.5 s here, versus 32 s for a plain
        # PyTorch loader under the same conditions.
        print("OK : " + pipe_name.__name__)

1、【Pytorch】nvidia-dali——一种加速数据增强的方法

https://blog.csdn.net/weixin_42028608/article/details/105564060

2、官方自定义数据接口使用介绍

https://github.com/NVIDIA/DALI/blob/master/docs/examples/frameworks/pytorch/pytorch-external_input.ipynb

在这里插入图片描述
import types
import collections
import numpy as np
from random import shuffle
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types

# Samples per batch; used for both the external iterator and the pipeline below.
batch_size = 4

class ExternalInputIterator(object):
    """Endless batch source that serves the same encoded image every time.

    Each call to ``next()`` returns ``batch_size`` copies of the raw bytes of
    one image file, each paired with the constant label ``1``. The iterator
    never raises ``StopIteration``.
    """

    def __init__(self, batch_size,
                 image_path="/mnt/yunle/byebye/coco28/images/train/0.jpg"):
        """Store the batch size and the image to serve.

        Args:
            batch_size: number of samples returned by each ``next()`` call.
            image_path: encoded image file to read; defaults to the path the
                original snippet hard-coded.
        """
        self.batch_size = batch_size
        self.image_path = image_path

    def __iter__(self):
        # i / n mirror the cursor and data set size of a real file-list
        # iterator; with a single image n is always 1.
        self.i = 0
        self.n = 1
        return self

    def __next__(self):
        """Return ``(batch, labels)``: two lists of uint8 numpy arrays."""
        # Read the file once per batch and close it deterministically.
        with open(self.image_path, 'rb') as f:
            encoded = f.read()

        batch = []
        labels = []
        for _ in range(self.batch_size):
            batch.append(np.frombuffer(encoded, dtype=np.uint8))
            labels.append(np.array([1], dtype=np.uint8))
            self.i = (self.i + 1) % self.n
        return (batch, labels)

# Module-level data source instance handed to ExternalSourcePipeline below.
eii = ExternalInputIterator(batch_size)

class ExternalSourcePipeline(Pipeline):
    """Pipeline fed by a Python iterator: decode, then adjust contrast."""

    def __init__(self, batch_size, eii, num_threads, device_id):
        """Create the operators.

        Args:
            batch_size: forwarded to ``Pipeline.__init__``.
            eii: iterable yielding ``(batch, labels)`` pairs; used as the
                source of the ``ExternalSource`` operator.
            num_threads: forwarded to ``Pipeline.__init__``.
            device_id: forwarded to ``Pipeline.__init__``.
        """
        super(ExternalSourcePipeline, self).__init__(batch_size,
                                                     num_threads,
                                                     device_id,
                                                     seed=12)

        # These assignments belong inside __init__; at class-body level
        # `self` is undefined and the class cannot even be created.
        self.source = ops.ExternalSource(source = eii, num_outputs = 2)
        self.decode = ops.ImageDecoder(device = "mixed", output_type = types.RGB)
        self.enhance = ops.BrightnessContrast(device = "gpu", contrast = 2)

    def define_graph(self):
        """Return ``(contrast-adjusted images, labels)``."""
        jpegs, labels = self.source()
        images = self.decode(jpegs)
        output = self.enhance(images)
        return (output, labels)



# Smoke test: build the pipeline on GPU 0 and run a single batch through it.
pipe = ExternalSourcePipeline(batch_size=batch_size, eii=eii, num_threads=2, device_id = 0)
pipe.build()
# Outputs of one pipeline run; not inspected further in this snippet.
pipe_out = pipe.run()

print('Success!')

或者别人写好的一个数据加载Dali 模块

https://blog.csdn.net/u014365862/article/details/104412294


from __future__ import division
import torch
import types
import joblib
import collections
import numpy as np
import pandas as pd
from random import shuffle
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import nvidia.dali.plugin.pytorch as dalitorch
from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
 
 
def grid2x2(img):
    """Split an HWC image into its four quadrants.

    Args:
        img: array indexed as ``(height, width, channels)``.

    Returns:
        ``(left_top, right_top, left_bottom, right_bottom)``. For odd
        heights or widths the extra row or column goes to the bottom or
        right quadrants, because the split point is ``h // 2`` / ``w // 2``.
    """
    h, w, c = img.shape
    mid_h, mid_w = h // 2, w // 2
    left_top = img[0:mid_h, 0:mid_w, :]
    left_bottom = img[mid_h:h, 0:mid_w, :]
    right_top = img[0:mid_h, mid_w:w, :]
    right_bottom = img[mid_h:h, mid_w:w, :]
    # The fourth output must be right_bottom; the original returned
    # left_bottom twice and dropped the bottom-right quadrant.
    return left_top, right_top, left_bottom, right_bottom
 
 
 
 
class ExternalInputIterator(object):
    """Sharded, shuffling source of (encoded image, label) batches.

    The file list is a text file with one ``<relative path>,<int label>``
    entry per line. Each instance keeps only the slice of the list that
    belongs to ``device_id`` out of ``num_gpus`` shards.
    """

    def __init__(self, images_dir, txt_path, batch_size, device_id, num_gpus):
        """Load the file list and select this device's shard.

        Args:
            images_dir: prefix prepended verbatim to every file name, so it
                must end with a path separator.
            txt_path: path of the ``name,label`` list file.
            batch_size: number of samples returned by each ``next()`` call.
            device_id: index of this shard.
            num_gpus: total number of shards.
        """
        self.images_dir = images_dir
        self.batch_size = batch_size
        with open(txt_path, 'r') as f:
            # Skip blank lines. The original `line is not ''` was an identity
            # test that never filtered anything.
            self.files = [line.rstrip() for line in f if line.strip()]

        # whole data set size
        self.data_set_len = len(self.files)
        # based on the device_id and total number of GPUs - world size
        # get proper shard
        shard_begin = self.data_set_len * device_id // num_gpus
        shard_end = self.data_set_len * (device_id + 1) // num_gpus
        self.files = self.files[shard_begin:shard_end]
        self.n = len(self.files)

    def __iter__(self):
        """Start a new epoch: rewind the cursor and reshuffle the shard."""
        self.i = 0
        shuffle(self.files)
        return self

    def __next__(self):
        """Return the next ``(batch, labels)`` pair of uint8 numpy arrays.

        Raises:
            StopIteration: once every sample of the shard has been served.
        """
        # self.i counts samples served this epoch and is never wrapped, so
        # this guard is reachable. The original wrapped the cursor with a
        # modulo, which made the iterator endless.
        if self.i >= self.n:
            raise StopIteration

        batch = []
        labels = []
        for _ in range(self.batch_size):
            # Wrap only the lookup index: the last batch of an epoch is
            # padded with samples from the start of the shard.
            entry = self.files[self.i % self.n]
            # Split on the last comma so file names may contain commas.
            jpeg_filename, label = entry.rsplit(',', 1)
            with open(self.images_dir + jpeg_filename, 'rb') as f:
                batch.append(np.frombuffer(f.read(), dtype = np.uint8))
            # NOTE(review): labels are stored as uint8, so values above 255
            # do not fit. Confirm the label range of the data set.
            labels.append(np.array([int(label)], dtype = np.uint8))
            self.i += 1
        return (batch, labels)

    @property
    def size(self,):
        """Length of the whole (unsharded) file list."""
        return self.data_set_len

    # Python 2 style alias; callers may use iterator.next().
    next = __next__
 
 
class ExternalSourcePipeline(Pipeline):
    """Pipeline that decodes externally fed JPEGs and emits five resized views.

    The outputs are the full image plus the four quadrants produced by
    ``grid2x2``, each resized to ``resize`` x ``resize``, followed by the
    labels.
    """

    def __init__(self, resize, batch_size, num_threads, device_id, external_data):
        """Create the operators and the data iterator.

        Args:
            resize: target width and height for every output image.
            batch_size: forwarded to ``Pipeline.__init__``.
            num_threads: forwarded to ``Pipeline.__init__``.
            device_id: forwarded to ``Pipeline.__init__``.
            external_data: iterable yielding ``(images, labels)`` batches.
        """
        super(ExternalSourcePipeline, self).__init__(batch_size,
                                      num_threads,
                                      device_id,
                                      seed=12,
                                      exec_async=False,
                                      exec_pipelined=False,
                                    )
        self.input = ops.ExternalSource()
        self.input_label = ops.ExternalSource()
        self.decode = ops.ImageDecoder(device = "cpu", output_type = types.RGB)
        # PythonFunction: exec_async and exec_pipelined need to be False,
        # and its input must be on the CPU.
        self.grid = ops.PythonFunction(function=grid2x2, num_outputs=4)
        self.resize = ops.Resize(device="gpu",
                                 resize_x=resize,
                                 resize_y=resize,
                                 interp_type=types.INTERP_LINEAR)
        self.external_data = external_data
        self.iterator = iter(self.external_data)

    def define_graph(self):
        """Return ``(images, images1, images2, images3, images4, labels)``."""
        # The external-source outputs are kept on self so iter_setup() can
        # feed them.
        self.jpegs = self.input()
        self.labels = self.input_label()
        images = self.decode(self.jpegs)

        images1, images2, images3, images4 = self.grid(images)
        # Decoding and the grid split run on the CPU; move each output to
        # the GPU for resizing.
        images = self.resize(images.gpu())
        images1 = self.resize(images1.gpu())
        images2 = self.resize(images2.gpu())
        images3 = self.resize(images3.gpu())
        images4 = self.resize(images4.gpu())
        return (images, images1, images2, images3, images4, self.labels)

    def iter_setup(self):
        """Feed the next batch; on epoch end rewind the source and re-raise."""
        try:
            # Builtin next() works with any iterator; the original
            # self.iterator.next() needed a Python 2 style alias on the source.
            images, labels = next(self.iterator)
            self.feed_input(self.jpegs, images)
            self.feed_input(self.labels, labels)
        except StopIteration:
            self.iterator = iter(self.external_data)
            raise StopIteration
 
 
def create_dataloder(img_dir,
                     txt_path,
                     resize,
                     batch_size,
                     device_id=0,
                     num_gpus=1,
                     num_threads=6):
    """Build a DALI-backed PyTorch iterator over a ``name,label`` file list.

    Args:
        img_dir: prefix prepended to every file name in the list.
        txt_path: path of the list file.
        resize: output width and height of every image.
        batch_size: samples per batch.
        device_id: GPU that runs the pipeline; also selects the data shard.
        num_gpus: total number of shards.
        num_threads: CPU worker threads for the pipeline.

    Returns:
        A ``PyTorchIterator`` whose batches expose the keys ``data0`` ..
        ``data4`` and ``label``.
    """
    eii = ExternalInputIterator(img_dir,
                                txt_path,
                                batch_size=batch_size,
                                device_id=device_id,
                                num_gpus=num_gpus)
    # Run the pipeline on the requested GPU. The original hard-coded
    # device_id = 0 here, so every shard ended up on GPU 0.
    pipe = ExternalSourcePipeline(resize=resize,
                                  batch_size=batch_size,
                                  num_threads=num_threads,
                                  device_id=device_id,
                                  external_data=eii)

    # size comes from eii.size, the iterator's reported data set length.
    pii = PyTorchIterator(pipe,
                          output_map=["data0", "data1", "data2", "data3", "data4", "label"],
                          size=eii.size,
                          last_batch_padded=True,
                          fill_last_batch=False)

    return pii
 
 
# Demo: iterate the loader for `epochs` epochs and print the tensor shapes.
if __name__ == '__main__':
    batch_size = 32
    # num_gpus and num_threads are defined here but not passed on, so
    # create_dataloder() runs with its own defaults for them.
    num_gpus = 1
    num_threads = 8
    epochs = 1

    pii = create_dataloder('/home/hanbing/hanbing_data/datasets/deepfake/train_videos/',
                            resize=224,
                            batch_size=batch_size,
                            txt_path='./txt/train_5.txt',
                            )


    for e in range(epochs):
        print('tttt', len(pii))
        for i, data in enumerate(pii):
            # data[0] holds the outputs of the first (only) pipeline;
            # "data4" is the last of the five image outputs.
            imgs = data[0]["data4"]
            labels = data[0]["label"]
            print("epoch: {}, iter {}".format(e, i), imgs.shape, labels.shape)

        # Rewind the DALI iterator before the next epoch.
        pii.reset()

3、第 2 个例子以图片文件（编码后的字节流）作为输入，由 DALI 完成解码。
如果输入的图片已经是解码后的 numpy / cupy 数组，可以参考下面的官方示例：
https://docs.nvidia.com/deeplearning/dali/user-guide/docs/examples/general/data_loading/external_input.html#Defining-the-data-source

import cupy as cp
import imageio

class ExternalInputGpuIterator(object):
    """Endless source of already-decoded image batches held as CuPy arrays.

    The list file ``file_list.txt`` inside ``images_dir`` has one
    ``<file name> <int label>`` entry per line. The list is shuffled once at
    construction, and iteration wraps around without raising
    ``StopIteration``.
    """

    def __init__(self, batch_size, images_dir="../../data/images/"):
        """Load and shuffle the file list.

        Args:
            batch_size: number of samples returned by each ``next()`` call.
            images_dir: directory prefix, prepended verbatim, so it must end
                with a path separator. Defaults to the path the original
                snippet hard-coded.
        """
        self.images_dir = images_dir
        self.batch_size = batch_size
        with open(self.images_dir + "file_list.txt", 'r') as f:
            # Skip blank lines. The original `line is not ''` was an identity
            # test that never filtered anything.
            self.files = [line.rstrip() for line in f if line.strip()]
        shuffle(self.files)

    def __iter__(self):
        self.i = 0
        self.n = len(self.files)
        return self

    def __next__(self):
        """Return ``(batch, labels)`` as two lists of CuPy uint8 arrays."""
        batch = []
        labels = []
        for _ in range(self.batch_size):
            jpeg_filename, label = self.files[self.i].split(' ')
            im = imageio.imread(self.images_dir + jpeg_filename)
            im = cp.asarray(im)
            # Scale the pixel values by 0.6, then convert back to uint8.
            im = im * 0.6
            batch.append(im.astype(cp.uint8))
            # Parse the label explicitly instead of handing a str to cp.array.
            labels.append(cp.array([int(label)], dtype = np.uint8))
            self.i = (self.i + 1) % self.n
        return (batch, labels)

4、补充说明

常用到的数据接口

	自己实现数据增强接口，包装到dali 中，
	ops.PythonFunction  输入是numpy
	ops.TorchPythonFunction  输入是torch
	
	ops.ImageDecoder(device="mixed", output_type=types.RGB)  #输入是编码后的图片字节流（例如读入的 JPEG 文件内容），在这里解码，输出为 HWC
	ops.NumpyReader  输入是图片保存的numpy  

	#减均值、除以标准差的功能   (input - mean) / std
    self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                      output_dtype=types.FLOAT,
                                      output_layout=types.NCHW,
                                      image_type=types.RGB,
                                      mean=[0.5 * 255, 0.5 * 255, 0.5 * 255],
                                      std=[0.5 * 255, 0.5 * 255, 0.5 * 255])
                                      # mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                                      # std=[0.229 * 255, 0.224 * 255, 0.225 * 255])

	# 输出是这样的 out = scale * (in - mean) / stddev + shift
	self.normalize = ops.Normalize(mean=0.5, stddev=0.5, device='gpu')

dali支持的数据处理列表链接
在这里插入图片描述

贝猫说python

关注

2
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
pytorch dali 加速 dali支持的数据处理列表，mxnet tensorflow caff读取数据转换 pytorch训练

使用dali加速，前提是 gpu没有跑满，不然效果也不大+apex 混合精度训练1、【Pytorch】nvidia-dali——一种加速数据增强的方法https://blog.csdn.net/weixin_42028608/article/details/1055640602、官方自定义数据接口使用介绍https://github.com/NVIDIA/DALI/blob/master/docs/examples/frameworks/pytorch/pytorch-external_input
复制链接

扫一扫