TensorRT(3):FP16与分析网络每层的消耗时间

最新推荐文章于 2024-05-21 15:57:46 发布

hjxu2016

最新推荐文章于 2024-05-21 15:57:46 发布

阅读量3.3k

点赞数 3

分类专栏：推理框架|TensorRT 文章标签： 1024程序员节

本文链接：https://blog.csdn.net/hjxu2016/article/details/109258566

版权

推理框架|TensorRT 专栏收录该内容

12 篇文章 55 订阅

订阅专栏

TensorRT系列传送门(不定期更新): 深度框架|TensorRT

文章目录

以解析caffe分类模型为例，学习fp16量化和分析网络每层消耗的时间
参考自
TensorRT(4)-Profiling and 16-bit Inference

目的：
1、分析TRT中网络每一层的运行时间
2、fp16量化学习

FP16更省内存和更节约推理时间。
官方文档3.0上表示，如果只是使用FP16精度代替FP32，实际上性能不会有多大的提升，真正提升性能的是 half2mode。half2mode是TRT的一种执行模式，这种模式下，图片上相邻区域的tensor以16位交叉存储的方式存在，在batchsize大于1的情况下，这种模式的运行速度是最快的。原理不是很懂，大概如下图所示
在这里插入图片描述

一、如何使用half2mode

在调用NvCaffeParser工具解析模型时，使用FP16的数据初始化 network对象，用DataType::kHALF 参数
如下：

    // 首先 使用float 16 精度的数据 来初始化 network 对象，工具解析 caffe模型时，
    // 使用 DataType::kHALF 参数，如下：
    bool useFp16 = builder->platformHasFastFp16();
        // 判断当前的GPU设备是否支持 FP16的精度
    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; // create a 16-bit model if it's natively supported
    if(useFp16)
    {
        std::cout << "当前设备支持 fp16" << std::endl;
    }
    else
    {
        std::cout << "当前设备不支持 fp16" << std::endl;
    }
    const IBlobNameToTensor *blobNameToTensor = parser->parse("./model/caffeProfile/deploy_vgg16_places365.prototxt",
                                        "./model/caffeProfile/vgg_iter_100000.caffemodel",
                                        *network,
                                        modelDataType);

再配置builder使用 half2mode

 builder->setFp16Mode(true);

二、分析网络的每层消耗的时间

分析网络每层消耗的时间，需要创建一个IProfiler接口，并以回调函数的方式添加到 execution context中
创建接口

// profile类，继承自 IProfiler
struct Profiler : public IProfiler
{
    // One entry per layer: (layer name, accumulated time in milliseconds).
    using Record = std::pair<std::string, float>;
    // Entries are kept in the order in which layers were first reported.
    std::vector<Record> mProfile;

    // Callback receiving the time spent in one layer. The same layer is
    // reported again on every timing iteration, so repeated reports are summed
    // into the existing entry instead of creating a new one.
    virtual void reportLayerTime(const char* layerName, float ms)
    {
        const auto sameName = [&](const Record& entry) { return entry.first == layerName; };
        auto hit = std::find_if(mProfile.begin(), mProfile.end(), sameName);
        if (hit != mProfile.end())
        {
            hit->second += ms;
            return;
        }
        mProfile.emplace_back(layerName, ms);
    }

    // Prints the average time per layer (accumulated time divided by
    // TIMING_ITERATIONS) followed by the average total over all layers.
    void printLayerTimes()
    {
        float totalTime = 0;
        for (const Record& entry : mProfile)
        {
            printf("%-40.40s %4.3fms\n", entry.first.c_str(), entry.second / TIMING_ITERATIONS);
            totalTime += entry.second;
        }
        printf("Time over all layers: %4.3f\n", totalTime / TIMING_ITERATIONS);
    }
} gProfiler;

以回调函数的方式加入到context中

// `context` is an IExecutionContext pointer; the profiler is attached through its setter.
context->setProfiler(&gProfiler);

注意，执行的时候，profiling不支持异步的方式，因此需要用TRT的同步执行函数 execute()

    // 因为需要加上gProfilder，需要同步执行
    // 调用 context->execute 同步执行
    for (int i = 0; i < TIMING_ITERATIONS;i++)
    {
         context->execute(1, buffers);
    }

三、整体代码

/*====================================================================
文件 ： sampleCaffeClassf.cc
功能 ： TensorRT学习系列2、以caffe为例，输出fp16量化，输出trt每层消耗的时间 
继承  IProfiler 类
====================================================================*/
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "logger.h"
#include "cuda_runtime_api.h"
#include "common.h"

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;
using namespace plugin;
using namespace nvcaffeparser1;

const int MODEL_HEIGHT = 256;
const int MODEL_WIDTH = 256;
const int MODEL_CHANNEL = 3;
const int MODEL_OUTPUT_SIZE = 5; // 5分类

static const int TIMING_ITERATIONS = 100;
/**********************************
 * @brief Resize the image to the model input size, convert it to 32-bit
 *        float, then subtract a per-channel mean and divide by a per-channel
 *        scale.
 *
 * @param matSrc Input image. It is resized IN PLACE to
 *               MODEL_WIDTH x MODEL_HEIGHT, so the caller's Mat is modified.
 * @param matDst Receives the normalized CV_32FC3 image.
 * @return
 *********************************/
void preData(cv::Mat &matSrc, cv::Mat &matDst)
{
    cv::resize(matSrc, matSrc, cv::Size(MODEL_WIDTH, MODEL_HEIGHT));

    cv::Mat matF32Img;
    matSrc.convertTo(matF32Img, CV_32FC3);

    // Both constant images are sized from the model constants so that they
    // always match the resized input; a size mismatch would make the
    // element-wise arithmetic below fail.
    const cv::Mat matMean(MODEL_HEIGHT, MODEL_WIDTH, CV_32FC3,
                          cv::Scalar(103.53f, 116.28f, 123.675f)); // per-channel mean
    const cv::Mat matStd(MODEL_HEIGHT, MODEL_WIDTH, CV_32FC3,
                         cv::Scalar(1.0f, 1.0f, 1.0f)); // per-channel divisor (all ones here)

    matDst = (matF32Img - matMean) / matStd;
}

// profile类，继承自 IProfiler
struct Profiler : public IProfiler
{
    // One entry per layer: (layer name, accumulated time in milliseconds).
    using Record = std::pair<std::string, float>;
    // Entries are kept in the order in which layers were first reported.
    std::vector<Record> mProfile;

    // Callback receiving the time spent in one layer. The same layer is
    // reported again on every timing iteration, so repeated reports are summed
    // into the existing entry instead of creating a new one.
    virtual void reportLayerTime(const char* layerName, float ms)
    {
        const auto sameName = [&](const Record& entry) { return entry.first == layerName; };
        auto hit = std::find_if(mProfile.begin(), mProfile.end(), sameName);
        if (hit != mProfile.end())
        {
            hit->second += ms;
            return;
        }
        mProfile.emplace_back(layerName, ms);
    }

    // Prints the average time per layer (accumulated time divided by
    // TIMING_ITERATIONS) followed by the average total over all layers.
    void printLayerTimes()
    {
        float totalTime = 0;
        for (const Record& entry : mProfile)
        {
            printf("%-40.40s %4.3fms\n", entry.first.c_str(), entry.second / TIMING_ITERATIONS);
            totalTime += entry.second;
        }
        printf("Time over all layers: %4.3f\n", totalTime / TIMING_ITERATIONS);
    }
} gProfiler;

int main()
{
    std::string strTrtSavedPath = "./savedTrt.trt";
    // gLogger
    // 1、创建一个builder, gLogger是一个日志类，必须要有，但又不是那么重要，可以自己继承
    IBuilder* builder = createInferBuilder(gLogger);
    // 2、创建一个netwok，推荐使用V2，这时候netWork只是一个空架子,因为是解析caffe模型，那后面的必须是0U
    // 别问我为啥，官方这么写的
    INetworkDefinition* network = builder->createNetworkV2(0U);
    // TensorRt提供了一个高级别的API，CaffePaser，用于解析caffe模型
    ICaffeParser *parser = createCaffeParser();
    // 首先 使用float 16 精度的数据 来初始化 network 对象，工具解析 caffe模型时，
    // 使用 DataType::kHALF 参数，如下：
    bool useFp16 = builder->platformHasFastFp16();
        // 判断当前的GPU设备是否支持 FP16的精度
    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; // create a 16-bit model if it's natively supported
    if(useFp16)
    {
        std::cout << "当前设备支持 fp16" << std::endl;
    }
    else
    {
        std::cout << "当前设备不支持 fp16" << std::endl;
    }
    
    const IBlobNameToTensor *blobNameToTensor = parser->parse("./model/caffeProfile/deploy_vgg16_places365.prototxt",
                                        "./model/caffeProfile/vgg_iter_100000.caffemodel",
                                        *network,
                                        modelDataType);
    // 3、标记输入Tensor的节点名
    network->markOutput(*blobNameToTensor->find("prob"));

    //  config是用来填充network的参数
    IBuilderConfig *config = builder->createBuilderConfig();
    // 设置最大batchSize的大小
    builder->setMaxBatchSize(1);
    //配置builder 使用 half2mode ，这个很简单，就一个语句就完成了：
    builder->setFp16Mode(true);
    // 设置工作空间
    config->setMaxWorkspaceSize(8 << 20);
    // 4、建立 engine，进行层之间融合或者进度校准方式
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);

    if (0) // 如果需要离线保存模型
    {
        IHostMemory* trtModelStream{ nullptr };
        trtModelStream = engine->serialize();
        std::ofstream modeStreamoutfile(strTrtSavedPath, std::ofstream::binary);
        assert(!modeStreamoutfile.fail());
        modeStreamoutfile.write((char*)trtModelStream->data(), trtModelStream->size());
        gLogInfo<<"Saving TRT engine " << strTrtSavedPath << "." <<std::endl;
    }
    
    // inference推断过程
    IExecutionContext *context = engine->createExecutionContext();
    context->setProfiler(&gProfiler);
   int nInputIdx = engine->getBindingIndex("data");
    int nOutputIndex = engine->getBindingIndex("prob");
    std::cout << " nINputIdx = " << nInputIdx << std::endl;
    std::cout << " nOutputIdx = " << nOutputIndex << std::endl;
    // 
    std::cout << " n = " << engine->getNbBindings() << std::endl;

    Dims3 inputDims = static_cast<Dims3&&>(engine->getBindingDimensions(nInputIdx));
    Dims3 outputDims = static_cast<Dims3&&>(engine->getBindingDimensions(nOutputIndex));
    size_t inputSize = 1 * inputDims.d[0] * inputDims.d[1] * inputDims.d[2] * sizeof(float);
    size_t outputSize = 1 * outputDims.d[0] * outputDims.d[1] * outputDims.d[2] * sizeof(float);
    //申请GPU显存
    // Allocate GPU memory for Input / Output data
    void* buffers[2] = {NULL, NULL};
    int nBatchSize = 1;
    int nOutputSize = MODEL_OUTPUT_SIZE;
    CHECK(cudaMalloc(&buffers[nInputIdx], nBatchSize * MODEL_CHANNEL * MODEL_HEIGHT * MODEL_WIDTH * sizeof(float)));
    CHECK(cudaMalloc(&buffers[nOutputIndex], nBatchSize * nOutputSize * sizeof(float)));

    // 创建cuda流
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    cudaEvent_t start, end; //calculate run time
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));

    cv::Mat matBgrImg = cv::imread("./data/fram_25.jpg");
    cv::Mat matNormImage;
    preData(matBgrImg, matNormImage); // 减均值除方差


    std::vector<std::vector<cv::Mat>> nChannels;
    std::vector<cv::Mat> rgbChannels(3);
    cv::split(matNormImage, rgbChannels);
    nChannels.push_back(rgbChannels); //  NHWC  转NCHW 

    void *data = malloc(nBatchSize * MODEL_CHANNEL * MODEL_HEIGHT * MODEL_WIDTH *sizeof(float));;
    if (NULL == data)
    {
        printf("malloc error!\n");
        return 0;
    }
    for (int c = 0; c < 3; ++c) 
    {
        cv::Mat cur_imag_plane = nChannels[0][c];
        memcpy(data + c * MODEL_HEIGHT * MODEL_WIDTH * sizeof(float), cur_imag_plane.ptr<unsigned char>(0), 256 *256 * sizeof(float));
    }

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[nInputIdx], data, \
        nBatchSize * MODEL_CHANNEL * MODEL_WIDTH * MODEL_HEIGHT * sizeof(float), cudaMemcpyHostToDevice, stream));

    // 因为需要加上gProfilder，为了计算时间的准确性，需要同步执行
    // 调用 context->execute 同步执行
    for (int i = 0; i < TIMING_ITERATIONS;i++)
    {
         context->execute(1, buffers);
    }
    // 打印时间
    gProfiler.printLayerTimes();
   
    float prob[nBatchSize * nOutputSize];

    CHECK(cudaMemcpyAsync(prob, buffers[nOutputIndex], 1 * 5 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[nInputIdx]));
    CHECK(cudaFree(buffers[nOutputIndex]));

    for(int i=0; i< 5; ++i)
    {
        std::cout << prob[i] << " ";
    }
    std::cout << std::endl;
    
    parser->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    printf("hello world \n");
    return 0;
}

hjxu2016

关注

3
点赞
踩
21

收藏

觉得还不错? 一键收藏
0
评论
TensorRT(3):FP16与分析网络每层的消耗时间

文章目录一、如何使用half2mode二、分析网络的每层消耗的时间三、整体代码以解析caffe分类模型为例，学习fp16量化和分析网路每层消耗的时间参考自TensorRT(4)-Profiling and 16-bit Inference目的：1、分析TRT中网络每一层的运行时间2、fp16量化学习FP16更省内存和更节约推理时间。官方文档3.0上表示，如果只是使用FP16进度代替FP32,实际上性能不会有多达的提升，真正提升性能的是 half2mode。half2mode是TRT的一种执行
复制链接

扫一扫