Optimizing a GoogleNet .caffemodel for Image Classification with TensorRT (C++)
I have recently been working on model optimization, mainly using NVIDIA's TensorRT (https://developer.nvidia.com/tensorrt) to optimize trained models and speed up inference. The code below is a modified version of the sampleMNIST example: it reads a color image with OpenCV, prints the predicted class, records the time spent in each step, serializes the optimized model to disk, and later loads it back into memory. The code is as follows:
Code
Note: since this code uses OpenCV, you need to add CFLAGS_OPENCV = `pkg-config --cflags opencv` and LIBS_OPENCV = `pkg-config --libs opencv` to the TensorRT samples' Makefile, and reference these variables at the appropriate places in the relevant compile and link commands.
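For reference, a minimal sketch of what these additions might look like (the two variable definitions come from the note above; the compile/link lines are hypothetical, since the exact rule names differ between TensorRT versions, so adapt them to your samples' Makefile):

CFLAGS_OPENCV = `pkg-config --cflags opencv`
LIBS_OPENCV = `pkg-config --libs opencv`
# hypothetical rules: append the new variables to the existing compile and link commands
# $(CC) $(CFLAGS) $(CFLAGS_OPENCV) -c $< -o $@
# $(CC) -o $@ $^ $(LIBS) $(LIBS_OPENCV)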
/*
sample code about converting caffemodel to trt
and run some test about time and accuracy
*/
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <stdio.h>
#include <sys/time.h>
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace std; // needed: cout, endl, string, ostringstream are used unqualified below
static const int INPUT_C = 3;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
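// size of the "prob" output blob, i.e. the number of classes this model predicts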
static const int OUTPUT_SIZE = 100;
static Logger gLogger;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
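// blob names must match the input/output layer names in deploy.prototxt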
// model and data path
std::string ROOT_DIR = "/home/wjx/trt_examples/model/bvlc_googlenet";
std::string MODEL_PROTOTXT = ROOT_DIR + "/" + "deploy.prototxt";
std::string CAFFE_MODEL = ROOT_DIR + "/" + "googlenet_train_iter_10000.caffemodel";
std::string GIE_MODEL = ROOT_DIR + "/" + "googlenet_train_iter_10000.caffemodel" + ".gie";
std::string DATA_DIR = "/home/wjx/trt_examples/imgs_baidu/data";
std::string STR_TEST_TXT = DATA_DIR + "/" + "train.txt";
std::string STR_RESULT_TXT_FP32 = "res_googlenet_fp32.txt";
std::string STR_RESULT_TXT_FP16 = "res_googlenet_fp16.txt";
std::string STR_RESULT_TXT_INT8 = "res_googlenet_int8.txt";
std::string STR_RESULT_TXT = "";
int nFpType = 32; // 32 or 16 (INT8 would also require a calibrator, which this sample does not implement)
// std::string IMAGE_MEAN = "/home/wjx/caffe/examples/imgs_baidu/data" + "/" + "imagenet_mean.binaryproto";
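// per-channel BGR mean values (presumably computed over the training set), subtracted during preprocessing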
float fMeanBGR[] = {97.31883386, 104.40468039, 109.49523134};
// convert the caffe model to a GIE (TensorRT) engine
void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor
= parser->parse(deployFile.c_str(), modelFile.c_str(), *network,
nFpType == 32 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kHALF);
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
// build the engine
if(nFpType == 16) builder->setHalf2Mode(true);
builder->setMaxBatchSize(maxBatchSize);
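// 1 MB of scratch workspace for the builder; increase if layer implementations need more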
builder->setMaxWorkspaceSize(1 << 20);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
// read and preprocess image
int getImgData(const std::string &strImgPath, float *fData) {
cv::Mat img = cv::imread(strImgPath);
if(img.empty()) return -1;
cv::resize(img, img, cv::Size(INPUT_W, INPUT_H));
int nr = img.rows, nc = img.cols, nChannels = img.channels();
if(INPUT_C != nChannels) {
cout << "INPUT_C != nChannels" << endl;
cout << "INPUT_C = " << INPUT_C << ", nChannels = " << nChannels << endl;
return -1;
}
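// convert OpenCV's interleaved HWC (BGR) layout to the planar CHW layout the network expects, subtracting the per-channel mean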
switch (nChannels) {
case 1:
break;
case 3:
cv::Vec3b* pr;
for(int i = 0; i < nr; ++i)
{
pr = img.ptr<cv::Vec3b>(i);
for (int j = 0; j < nc; ++j)
{
for(int ic = 0; ic < nChannels; ++ic) {
fData[nr * nc * ic + i * nc + j] = float(pr[j][ic]) - fMeanBGR[ic];
// fData[kkk++] = float(pr[j][ic]) - fMeanBGR[ic];
}
}
}
break;
default:
return -1;
}
return 1;
}
void saveGieToFile(IHostMemory *gieModelStream,
std::string strGiePath) {
// copy the serialized engine into a string, then save it to disk
size_t dataLen = gieModelStream->size();
cout << "dataLen = " << dataLen << endl;
std::string binaryData;
const char* pData = static_cast<const char*>(gieModelStream->data());
binaryData.assign(pData, pData + dataLen);
cout << "binaryData.size = " << binaryData.size() << endl;
cout << int(binaryData[0]) << endl;
// write to file in binary mode, since the serialized engine is raw bytes
std::ofstream out(strGiePath, std::ios::binary);
out << binaryData;
out.close();
cout << "write done, path = " << strGiePath << endl;
}
std::string loadGieModel(std::string strGiePath) {
// load the serialized engine back from disk (binary mode)
std::ifstream in(strGiePath, std::ios::binary);
ostringstream buf;
buf << in.rdbuf();
in.close();
std::string binaryData = buf.str();
cout << "binaryData.size = " << binaryData.size() << endl;
cout << int(binaryData[0]) << endl;
return binaryData;
}
template<typename T>
inline std::string num2string(T t) {
std::ostringstream strs;
strs << t;
std::string str = strs.str();
return str;
}
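// current wall-clock time in milliseconds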
double getCurrentTime()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return double(tv.tv_sec) * 1000 + double(tv.tv_usec) / 1000;
}
// run inference test
void runTest(const std::string& strTestTxtPath, IExecutionContext& context, int batchSize) {
//
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// open the test list: one "image-name label" pair per entry
std::ifstream in(strTestTxtPath);
std::string strImgName, strImgPath, strRes = "";
int label;
int iImg = 0;
int nMax = 1000;
double start, end;
while(in >> strImgName) {
in >> label;
strImgPath = DATA_DIR + "/train" + strImgName;
cout << iImg << "\t" << strImgPath << "\t" << label << endl;
// read image
cout << "read image ..." << endl;
start = getCurrentTime();
float fData[INPUT_W * INPUT_H * INPUT_C];
int nGetImage = getImgData(strImgPath, fData);
if(nGetImage != 1){
cout << "read image error" << endl;
break;
}
end = getCurrentTime();
double dur_read_image = end - start;
cout << "\t" << "time = " << dur_read_image << " ms" << endl;
float prob[OUTPUT_SIZE];
// execute inference
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
cout << "execute inference ..." << endl;
start = getCurrentTime();
CHECK(cudaMemcpyAsync(buffers[inputIndex], fData, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
end = getCurrentTime();
double dur_inference = end - start;
cout << "\t" << "time = " << dur_inference << " ms" << endl;
// get result: find the class with the highest probability (argmax)
float val{0.0f};
int idx{0};
for (int i = 0; i < OUTPUT_SIZE; i++)
{
if(prob[i] > val) {
val = prob[i]; idx = i;
}
}
cout << "pred class = " << idx << ", pred prob = " << val << endl;
std::cout << std::endl;
strRes += num2string(iImg) + "\t" + num2string(label) + "\t" + num2string(idx)
+ "\t" + num2string(dur_read_image) + "\t" + num2string(dur_inference)
+ "\t" + num2string(val) + "\n";
++iImg;
if(iImg == nMax)
break;
}
in.close();
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
// write test result
std::ofstream out(STR_RESULT_TXT);
out << strRes;
out.close();
}
int main(int argc, char const *argv[])
{
//
clock_t start, end;
if(nFpType == 8) STR_RESULT_TXT = STR_RESULT_TXT_INT8;
else if(nFpType == 16) STR_RESULT_TXT = STR_RESULT_TXT_FP16;
else if(nFpType == 32) STR_RESULT_TXT = STR_RESULT_TXT_FP32;
// create a GIE model from the caffe model and serialize it to a stream
cout << "convert caffe model to trt ..." << endl;
start = clock();
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel(MODEL_PROTOTXT, CAFFE_MODEL,
std::vector <std::string> { OUTPUT_BLOB_NAME }, 1, gieModelStream);
end = clock();
double dur_convert = (double)(end - start);
cout << "\t" << "time = " << (dur_convert / CLOCKS_PER_SEC) << " s" << endl;
// save model (serialize to local path)
cout << "write model to local path ..." << endl;
saveGieToFile(gieModelStream, GIE_MODEL);
if (gieModelStream) gieModelStream->destroy();
// load model as string
cout << "load model from local path ..." << endl;
std::string strModel = loadGieModel(GIE_MODEL);
// deserialize the engine
cout << "create runtime and engine..." << endl;
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(strModel.data(), strModel.size(), nullptr);
if(strModel.size() > 0) strModel.clear();
IExecutionContext *context = engine->createExecutionContext();
// run inference test
runTest(STR_TEST_TXT, *context, 1);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
//
cout << endl << "... Done" << endl;
cout << endl << "fp type = " << nFpType << endl << "res path = " << STR_RESULT_TXT << endl;
/* end */
return 0;
}
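For reference, runTest reads train.txt one entry at a time: an image name followed by an integer class label, separated by whitespace. Since the image path is assembled as DATA_DIR + "/train" + strImgName, the names are assumed to start with a slash. Hypothetical contents:

/img_0001.jpg 3
/img_0002.jpg 17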