Optimizing a GoogleNet .caffemodel for Image Classification with TensorRT (C++)
I have recently been working on model optimization, mainly using NVIDIA's TensorRT (https://developer.nvidia.com/tensorrt) to optimize trained models and speed up inference. The code below is a modified version of the sampleMNIST example: it reads a color image with OpenCV, prints the predicted class, records the time spent in each step, serializes the optimized model to disk, and later loads it back into memory. The code is as follows:
Code
Note: since this code uses OpenCV, you need to add CFLAGS_OPENCV = `pkg-config --cflags opencv` and LIBS_OPENCV = `pkg-config --libs opencv` to the TensorRT samples' Makefile, and reference these variables at the appropriate places in the relevant compile and link commands.
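For reference, a minimal sketch of what these additions might look like (the two variable definitions come from the note above; the compile/link lines are hypothetical, since the exact rule names differ between TensorRT versions, so adapt them to your samples' Makefile):

CFLAGS_OPENCV = `pkg-config --cflags opencv`
LIBS_OPENCV = `pkg-config --libs opencv`
# hypothetical rules: append the new variables to the existing compile and link commands
# $(CC) $(CFLAGS) $(CFLAGS_OPENCV) -c $< -o $@
# $(CC) -o $@ $^ $(LIBS) $(LIBS_OPENCV)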
/*
sample code about converting caffemodel to trt
and run some test about time and accuracy
*/
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <stdio.h>
#include <sys/time.h>
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace std; // needed: cout, endl, string, ostringstream are used unqualified below
static const int INPUT_C = 3;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
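// size of the "prob" output blob, i.e. the number of classes this model predicts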
static const int OUTPUT_SIZE = 100;
static Logger gLogger;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
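// blob names must match the input/output layer names in deploy.prototxt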
// model and data path
std::string ROOT_DIR = "/home/wjx/trt_examples/model/bvlc_googlenet";
std::string MODEL_PROTOTXT = ROOT_DIR + "/" + "deploy.prototxt";
std::string CAFFE_MODEL = ROOT_DIR + "/" + "googlenet_train_iter_10000.caffemodel";
std::string GIE_MODEL = ROOT_DIR + "/" + "googlenet_train_iter_10000.caffemodel" + ".gie";
std::string DATA_DIR = "/home/wjx/trt_examples/imgs_baidu/data";
std::string STR_TEST_TXT = DATA_DIR + "/" + "train.txt";
std::string STR_RESULT_TXT_FP32 = "res_googlenet_fp32.txt";
std::string STR_RESULT_TXT_FP16 = "res_googlenet_fp16.txt";
std::string STR_RESULT_TXT_INT8 = "res_googlenet_int8.txt";
std::string STR_RESULT_TXT = "";
int nFpType = 32; // 32 or 16 (INT8 would also require a calibrator, which this sample does not implement)
// std::string IMAGE_MEAN = "/home/wjx/caffe/examples/imgs_baidu/data" + "/" + "imagenet_mean.binaryproto";
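// per-channel BGR mean values (presumably computed over the training set), subtracted during preprocessing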
float fMeanBGR[] = {97.31883386, 104.40468039, 109.49523134};
// convert the caffe model to a GIE (TensorRT) engine
void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor
= parser->parse(deployFile.c_str(), modelFile.c_str(), *network,
nFpType == 32 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kHALF);
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
// build the engine
if(nFpType == 16) builder->setHalf2Mode(true);
builder->setMaxBatchSize(maxBatchSize);
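// 1 MB of scratch workspace for the builder; increase if layer implementations need more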
builder->setMaxWorkspaceSize(1 << 20);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
// read and preprocess image
int getImgData(const std::string &strImgPath, float *fData) {
cv::Mat img = cv::imread(strImgPath);
if(img.empty()) return -1;
cv::resize(img, img, cv::Size(INPUT_W, INPUT_H));
int nr = img.rows, nc = img.cols, nChannels = img.channels();
if(INPUT_C != nChannels) {
cout << "INPUT_C != nChannels" << endl;
cout << "INPUT_C = " << INPUT_C << ", nChannels = " << nChannels << endl;
return -1;
}
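// convert OpenCV's interleaved HWC (BGR) layout to the planar CHW layout the network expects, subtracting the per-channel mean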
switch (nChannels) {
case 1:
break;
case 3:
cv::Vec3b* pr;
for(int i = 0; i < nr; ++i)
{
pr = img.ptr<cv::Vec3b>(i);
for (int j = 0; j < nc; ++j)
{
for(int ic = 0; ic < nChannels; ++ic) {
fData[nr * nc * ic + i * nc + j] = float(pr[j][ic]) - fMeanBGR[ic];
// fData[kkk++] = float(pr[j][ic]) - fMeanBGR[ic];
}
}
}
break;
default:
return -1;
}
return 1;
}
void saveGieToFile(IHostMemory *gieModelStream,
std::string strGiePath) {
// copy the serialized engine into a string, then save it to disk
size_t dataLen = gieModelStream->size();
cout << "dataLen = " << dataLen << endl;
std::string binaryData;
const char* pData = static_cast<const char*>(gieModelStream->data());
binaryData.assign(pData, pData + dataLen);
cout << "binaryData.size = " << binaryData.size() << endl;
cout << int(binaryData[0]) << endl;
// write to file in binary mode, since the serialized engine is raw bytes
std::ofstream out(strGiePath, std::ios::binary);
out << binaryData;
out.close();
cout << "write done, path = " << strGiePath << endl;
}
std::string loadGieModel(std::string strGiePath) {
// load the serialized engine back from disk (binary mode)
std::ifstream in(strGiePath, std::ios::binary);
ostringstream buf;
buf << in.rdbuf();
in.close();
std::string binaryData = buf.str();
cout << "binaryData.size = " << binaryData.size() << endl;
cout << int(binaryData[0]) << endl;
return binaryData;
}
template<typename T>
inline std::string num2string(T t) {
std::ostringstream strs;
strs << t;
std::string str = strs.str();
return str;
}
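// current wall-clock time in milliseconds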
double getCurrentTime()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return double(tv.tv_sec) * 1000 + double(tv.tv_usec) / 1000;
}
// run inference test
void runTest(const std::string& strTestTxtPath, IExecutionContext& context, int batchSize) {
//
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// open the test list: one "image-name label" pair per entry
std::ifstream in(strTestTxtPath);
std::string strImgName, strImgPath, strRes = "";
int label;
int iImg = 0;
int nMax = 1000;
double start, end;
while(in >> strImgName) {
in >> label;
strImgPath = DATA_DIR + "/train" + strImgName;
cout << iImg << "\t" << strImgPath << "\t" << label << endl;
// read image
cout << "read image ..." << endl;
start = getCurrentTime();
float fData[INPUT_W * INPUT_H * INPUT_C];
int nGetImage = getImgData(strImgPath, fData);
if(nGetImage != 1){
cout << "read image error" << endl;
break;
}
end = getCurrentTime();
double dur_read_image = end - start;
cout << "\t" << "time = " << dur_read_image << " ms" << endl;
float prob[OUTPUT_SIZE];
// execute inference
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
cout << "execute inference ..." << endl;
start = getCurrentTime();
CHECK(cudaMemcpyAsync(buffers[inputIndex], fData, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
end = getCurrentTime();
double dur_inference = end - start;
cout << "\t" << "time = " << dur_inference << " ms" << endl;
// get result: find the class with the highest probability (argmax)
float val{0.0f};
int idx{0};
for (int i = 0; i < OUTPUT_SIZE; i++)
{
if(prob[i] > val) {
val = prob[i]; idx = i;
}
}
cout << "pred class = " << idx << ", pred prob = " << val << endl;
std::cout << std::endl;
strRes += num2string(iImg) + "\t" + num2string(label) + "\t" + num2string(idx)
+ "\t" + num2string(dur_read_image) + "\t" + num2string(dur_inference)
+ "\t" + num2string(val) + "\n";
++iImg;
if(iImg == nMax)
break;
}
in.close();
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
// write test result
std::ofstream out(STR_RESULT_TXT);
out << strRes;
out.close();
}
int main(int argc, char const *argv[])
{
//
clock_t start, end;
if(nFpType == 8) STR_RESULT_TXT = STR_RESULT_TXT_INT8;
else if(nFpType == 16) STR_RESULT_TXT = STR_RESULT_TXT_FP16;
else if(nFpType == 32) STR_RESULT_TXT = STR_RESULT_TXT_FP32;
// create a GIE model from the caffe model and serialize it to a stream
cout << "convert caffe model to trt ..." << endl;
start = clock();
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel(MODEL_PROTOTXT, CAFFE_MODEL,
std::vector <std::string> { OUTPUT_BLOB_NAME }, 1, gieModelStream);
end = clock();
double dur_convert = (double)(end - start);
cout << "\t" << "time = " << (dur_convert / CLOCKS_PER_SEC) << " s" << endl;
// save model (serialize to local path)
cout << "write model to local path ..." << endl;
saveGieToFile(gieModelStream, GIE_MODEL);
if (gieModelStream) gieModelStream->destroy();
// load model as string
cout << "load model from local path ..." << endl;
std::string strModel = loadGieModel(GIE_MODEL);
// deserialize the engine
cout << "create runtime and engine..." << endl;
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(strModel.data(), strModel.size(), nullptr);
if(strModel.size() > 0) strModel.clear();
IExecutionContext *context = engine->createExecutionContext();
// run inference test
runTest(STR_TEST_TXT, *context, 1);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
//
cout << endl << "... Done" << endl;
cout << endl << "fp type = " << nFpType << endl << "res path = " << STR_RESULT_TXT << endl;
/* end */
return 0;
}
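For reference, runTest reads train.txt one entry at a time: an image name followed by an integer class label, separated by whitespace. Since the image path is assembled as DATA_DIR + "/train" + strImgName, the names are assumed to start with a slash. Hypothetical contents:

/img_0001.jpg 3
/img_0002.jpg 17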