void performInference() { // once mlp.engine has been exported, inference can be run directly from it
// buffer that will hold the serialized engine read from disk
char *trtModelStream{nullptr}; // used to read the model file
size_t size{0};
// read model from the engine file
std::ifstream file("../mlp.engine", std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
// create a runtime (required for deserialization of model) with NVIDIA's logger
IRuntime *runtime = createInferRuntime(gLogger); // the IRuntime loads and deserializes the engine
assert(runtime != nullptr);
// deserialize engine for using the char-stream
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); // TensorRT 8.x dropped the third (IPluginFactory) argument
assert(engine != nullptr);
delete[] trtModelStream; // the engine keeps its own copy, so the file buffer can be freed
// create execution context -- required for inference executions
IExecutionContext *context = engine->createExecutionContext(); // holds the per-inference state for this engine
assert(context != nullptr);
float out[1]; // array for output
float data[1]; // array for input
for (float &i: data)
i = 12.0; // put any value for input
// time the execution
auto start = std::chrono::system_clock::now();
// do inference using the parameters (the steps are wrapped in doInference, explained below)
doInference(*context, data, out, 1);
// time the execution
auto end = std::chrono::system_clock::now();
std::cout << "\n[INFO]: Time taken by execution: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
// free the captured space
context->destroy();
engine->destroy();
runtime->destroy();
std::cout << "\nInput:\t" << data[0];
std::cout << "\nOutput:\t";
for (float i: out) {
std::cout << i;
}
std::cout << std::endl;
}
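A side note on the file-reading boilerplate above: pairing new[] with a matching delete[] by hand is easy to get wrong. Below is a minimal sketch of the same read using std::vector, so the buffer releases itself; the helper name readEngineFile is ours, not part of the original sample.

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>
// Sketch: read a serialized engine file into a self-managing buffer.
std::vector<char> readEngineFile(const std::string &path) {
    std::ifstream file(path, std::ios::binary | std::ios::ate); // open at the end to learn the size
    if (!file.good()) return {};
    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> buffer(static_cast<std::size_t>(size));
    file.read(buffer.data(), size);
    return buffer;
}
// usage:
//   std::vector<char> blob = readEngineFile("../mlp.engine");
//   ICudaEngine *engine = runtime->deserializeCudaEngine(blob.data(), blob.size());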
void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
/**
* Perform inference using the TensorRT execution context
*
* @param context: context created by engine
* @param input: input from the host
* @param output: output to save on host
* @param batchSize: batch size for TRT model
*/
// Get engine from the context
const ICudaEngine &engine = context.getEngine(); // the context holds a reference to the engine it was created from
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
assert(engine.getNbBindings() == 2);
void *buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex("data"); // the name was fixed at build time: network->addInput("data", DataType::kFLOAT, Dims3{1, 1, 1})
const int outputIndex = engine.getBindingIndex("out"); // here outputIndex is 1 and inputIndex is 0
// Create GPU buffers on device -- allocate memory for input and output
cudaMalloc(&buffers[inputIndex], batchSize * INPUT_SIZE * sizeof(float));
cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float));
// create CUDA stream for simultaneous CUDA operations
cudaStream_t stream;
cudaStreamCreate(&stream);
// copy input from host (CPU) to device (GPU) in stream
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream);
// execute inference using context provided by engine
context.enqueue(batchSize, buffers, stream, nullptr); // enqueue the inference on the stream; results land in buffers
// copy output back from device (GPU) to host (CPU)
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
stream);
// synchronize the stream to prevent issues
// (block CUDA and wait for CUDA operations to be completed)
cudaStreamSynchronize(stream);
// Release stream and buffers (memory)
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
}
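doInference above uses TensorRT's implicit-batch path (enqueue plus a separate batchSize argument), which TensorRT 8.5 still accepts but marks deprecated. For reference, here is a hedged sketch of the explicit-batch equivalent; it assumes the network had been built with the kEXPLICIT_BATCH flag and the batch folded into the input dimensions, and it reuses the INPUT_SIZE/OUTPUT_SIZE constants from the full example below. It is an illustration, not a drop-in replacement for the code above.

// Sketch: explicit-batch inference for TensorRT 8.x.
// Assumes the network was created with
//   builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
// and its input declared with the batch dimension included, e.g. Dims4{1, 1, 1, 1}.
void doInferenceExplicit(IExecutionContext &context, float *input, float *output) {
    const ICudaEngine &engine = context.getEngine();
    void *buffers[2];
    const int inputIndex = engine.getBindingIndex("data");
    const int outputIndex = engine.getBindingIndex("out");
    cudaMalloc(&buffers[inputIndex], INPUT_SIZE * sizeof(float));
    cudaMalloc(&buffers[outputIndex], OUTPUT_SIZE * sizeof(float));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyAsync(buffers[inputIndex], input, INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream);
    context.enqueueV2(buffers, stream, nullptr); // no batchSize argument: the batch lives in the tensor dims
    cudaMemcpyAsync(output, buffers[outputIndex], OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(buffers[inputIndex]);
    cudaFree(buffers[outputIndex]);
}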
ICudaEngine *createMLPEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt) {
/**
* Create Multi-Layer Perceptron using the TRT Builder and Configurations
*
* @param maxBatchSize: batch size for built TRT model
* @param builder: to build engine and networks
* @param config: configuration related to Hardware
* @param dt: datatype for model layers
* @return engine: TRT model
*/
std::cout << "[INFO]: Creating MLP using TensorRT..." << std::endl;
// Load Weights from relevant file
std::map<std::string, Weights> weightMap = loadWeights("../mlp.wts");
// Create an empty network
INetworkDefinition *network = builder->createNetworkV2(0U);
// Create an input with proper *name
ITensor *data = network->addInput("data", DataType::kFLOAT, Dims3{1, 1, 1});
assert(data);
// Add layer for MLP
IFullyConnectedLayer *fc1 = network->addFullyConnected(*data, 1,
weightMap["linear.weight"],
weightMap["linear.bias"]);
assert(fc1);
// set output with *name
fc1->getOutput(0)->setName("out");
// mark the output
network->markOutput(*fc1->getOutput(0));
// Set configurations
builder->setMaxBatchSize(maxBatchSize); // use the maxBatchSize argument rather than a hard-coded 1
// Set workspace size
config->setMaxWorkspaceSize(1 << 20);
// Build CUDA Engine using network and configurations
ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);
// Don't need the network any more
// free captured memory
network->destroy();
// Release host memory
for (auto &mem: weightMap) {
free((void *) (mem.second.values));
}
return engine;
}
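For intuition about what was just built: with a Dims3{1, 1, 1} input and a single output channel, the fully connected layer reduces to one neuron, y = w * x + b. A minimal host-side cross-check is sketched below; it must run while weightMap is still alive (i.e. before the free loop above), and the actual w and b depend on the trained mlp.wts.

// Sketch: compute the expected engine output on the host for input 12.0.
const float *w = reinterpret_cast<const float *>(weightMap["linear.weight"].values);
const float *b = reinterpret_cast<const float *>(weightMap["linear.bias"].values);
float expected = w[0] * 12.0f + b[0]; // should match what ./mlp -d prints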
APIToModel
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream) {
/**
* Create engine using TensorRT APIs
*
* @param maxBatchSize: for the deployed model configs
* @param modelStream: host-side buffer that receives the serialized model
*/
// Create builder with the help of logger
IBuilder *builder = createInferBuilder(gLogger);
// Create hardware configs
IBuilderConfig *config = builder->createBuilderConfig();
// Build an engine
ICudaEngine *engine = createMLPEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
// serialize the engine into binary stream
(*modelStream) = engine->serialize();
// free up the memory
engine->destroy();
builder->destroy();
}
full example
cmake_minimum_required(VERSION 3.14) # lower the version if your CMake is older
project(mlp)
set(CMAKE_CXX_STANDARD 14)
# include and link dirs of TensorRT; adapt them if yours differ
include_directories(/usr/local/TensorRT-8.5.1.7/include)
link_directories(/usr/local/TensorRT-8.5.1.7/lib)
#include_directories(/usr/local/TensorRT-7.2.3.4/targets/x86_64-linux-gnu/include)
#link_directories(/usr/local/TensorRT-7.2.3.4/targets/x86_64-linux-gnu)
#include_directories(/usr/include/x86_64-linux-gnu/)
#link_directories(/usr/lib/x86_64-linux-gnu/)
#include_directories(include ${CUDA_INCLUDE_DIRS})
#find_package(CUDA REQUIRED)
# include and link dirs of cuda for inference
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# define the executable target
add_executable(mlp mlp.cpp)
# perform linking with nvinfer libraries
target_link_libraries(mlp nvinfer)
# link with cuda libraries for Inference
target_link_libraries(mlp cudart)
target_compile_options(mlp PRIVATE -O2 -pthread) # per-target compiler flags (add_definitions is meant for -D macros)
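With this CMakeLists.txt in place, the usual out-of-source build applies: create a build directory next to the sources, run cmake .. and make inside it, then run ./mlp -s to serialize the engine and ./mlp -d for inference. Note that the relative paths in the code (../mlp.wts, ../mlp.engine) resolve against that build directory. The full mlp.cpp follows.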
#include "NvInfer.h" // TensorRT library
#include "iostream" // Standard input/output library
#include "logging.h" // logging file -- by NVIDIA
#include <map> // for weight maps
#include <fstream> // for file-handling
#include <chrono> // for timing the execution
//#include <cuda.h>
//#include <cuda_runtime.h>
// provided by nvidia for using TensorRT APIs
using namespace nvinfer1;
// Logger from TRT API
static Logger gLogger;
const int INPUT_SIZE = 1;
const int OUTPUT_SIZE = 1;
/**
// DEPLOYMENT RELATED //
*/
std::map<std::string, Weights> loadWeights(const std::string &file) {
/**
* Parse the .wts file and store weights in dict format.
*
* @param file path to .wts file
* @return weight_map: dictionary containing weights and their values
*/
std::cout << "[INFO]: Loading weights..." << file << std::endl;
std::map<std::string, Weights> weightMap;
// Open Weight file
std::ifstream input(file);
assert(input.is_open() && "[ERROR]: Unable to load weight file...");
// Read number of weights
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
// Loop over the remaining lines, one per weight/bias tensor
while (count--) {
// TensorRT weights
Weights wt{DataType::kFLOAT, nullptr, 0};
uint32_t size;
// Read name and type of weights
std::string w_name;
input >> w_name >> std::dec >> size;
wt.type = DataType::kFLOAT;
uint32_t *val = reinterpret_cast<uint32_t *>(malloc(sizeof(uint32_t) * size)); // sizeof(uint32_t), not sizeof(val) -- val is a pointer
for (uint32_t x = 0, y = size; x < y; ++x) {
// Values are stored as hex-encoded IEEE-754 bit patterns
input >> std::hex >> val[x];
}
wt.values = val;
wt.count = size;
// Add weight values against its name (key)
weightMap[w_name] = wt;
}
return weightMap;
}
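// For reference, the .wts file parsed above is plain text:
//   line 1: the number of tensors
//   each following line: <name> <element count> <hex-encoded float values...>
// An illustrative (made-up) two-tensor example, where 0x3f800000 is the
// IEEE-754 bit pattern of 1.0f and 0x3fc00000 is 1.5f:
//   2
//   linear.weight 1 3fc00000
//   linear.bias 1 3f800000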
ICudaEngine *createMLPEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt) {
/**
* Create Multi-Layer Perceptron using the TRT Builder and Configurations
*
* @param maxBatchSize: batch size for built TRT model
* @param builder: to build engine and networks
* @param config: configuration related to Hardware
* @param dt: datatype for model layers
* @return engine: TRT model
*/
std::cout << "[INFO]: Creating MLP using TensorRT..." << std::endl;
// Load Weights from relevant file
std::map<std::string, Weights> weightMap = loadWeights("../mlp.wts");
// Create an empty network
INetworkDefinition *network = builder->createNetworkV2(0U);
// Create an input with proper *name
ITensor *data = network->addInput("data", DataType::kFLOAT, Dims3{1, 1, 1});
assert(data);
// Add layer for MLP
IFullyConnectedLayer *fc1 = network->addFullyConnected(*data, 1,
weightMap["linear.weight"],
weightMap["linear.bias"]);
assert(fc1);
// set output with *name
fc1->getOutput(0)->setName("out");
// mark the output
network->markOutput(*fc1->getOutput(0));
// Set configurations
builder->setMaxBatchSize(maxBatchSize); // use the maxBatchSize argument rather than a hard-coded 1
// Set workspace size
config->setMaxWorkspaceSize(1 << 20);
// Build CUDA Engine using network and configurations
ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);
// Don't need the network any more
// free captured memory
network->destroy();
// Release host memory
for (auto &mem: weightMap) {
free((void *) (mem.second.values));
}
return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream) {
/**
* Create engine using TensorRT APIs
*
* @param maxBatchSize: for the deployed model configs
* @param modelStream: host-side buffer that receives the serialized model
*/
// Create builder with the help of logger
IBuilder *builder = createInferBuilder(gLogger);
// Create hardware configs
IBuilderConfig *config = builder->createBuilderConfig();
// Build an engine
ICudaEngine *engine = createMLPEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
// serialize the engine into binary stream
(*modelStream) = engine->serialize();
// free up the memory
engine->destroy();
builder->destroy();
}
void performSerialization() {
/**
* Serialization Function
*/
// Shared memory object
IHostMemory *modelStream{nullptr};
// Write model into stream
APIToModel(1, &modelStream);
assert(modelStream != nullptr);
std::cout << "[INFO]: Writing engine into binary..." << std::endl;
// Open the file and write the contents there in binary format
std::ofstream p("../mlp.engine", std::ios::binary);
if (!p) {
std::cerr << "could not open plan output file" << std::endl;
return;
}
p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
// Release the memory
modelStream->destroy();
std::cout << "[INFO]: Successfully created TensorRT engine..." << std::endl;
std::cout << "\n\tRun inference using `./mlp -d`" << std::endl;
}
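// Note: a serialized engine is specific to the GPU model and the TensorRT
// version it was built with; regenerate mlp.engine after changing either.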
/**
// INFERENCE RELATED //
*/
void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
/**
* Perform inference using the TensorRT execution context
*
* @param context: context created by engine
* @param input: input from the host
* @param output: output to save on host
* @param batchSize: batch size for TRT model
*/
// Get engine from the context
const ICudaEngine &engine = context.getEngine();
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
assert(engine.getNbBindings() == 2);
void *buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex("data");
const int outputIndex = engine.getBindingIndex("out");
// Create GPU buffers on device -- allocate memory for input and output
cudaMalloc(&buffers[inputIndex], batchSize * INPUT_SIZE * sizeof(float));
cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float));
// create CUDA stream for simultaneous CUDA operations
cudaStream_t stream;
cudaStreamCreate(&stream);
// copy input from host (CPU) to device (GPU) in stream
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream);
// execute inference using context provided by engine
context.enqueue(batchSize, buffers, stream, nullptr);
// copy output back from device (GPU) to host (CPU)
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
stream);
// synchronize the stream to prevent issues
// (block CUDA and wait for CUDA operations to be completed)
cudaStreamSynchronize(stream);
// Release stream and buffers (memory)
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
}
void performInference() {
/**
* Run inference with the previously serialized engine
*/
// buffer that will hold the serialized engine read from disk
char *trtModelStream{nullptr};
size_t size{0};
// read model from the engine file
std::ifstream file("../mlp.engine", std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
// create a runtime (required for deserialization of model) with NVIDIA's logger
IRuntime *runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
// deserialize engine for using the char-stream
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); // TensorRT 8.x dropped the third (IPluginFactory) argument
assert(engine != nullptr);
delete[] trtModelStream; // the engine keeps its own copy, so the file buffer can be freed
// create execution context -- required for inference executions
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
float out[1]; // array for output
float data[1]; // array for input
for (float &i: data)
i = 12.0; // put any value for input
// time the execution
auto start = std::chrono::system_clock::now();
// do inference using the parameters
doInference(*context, data, out, 1);
// time the execution
auto end = std::chrono::system_clock::now();
std::cout << "\n[INFO]: Time taken by execution: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
// free the captured space
context->destroy();
engine->destroy();
runtime->destroy();
std::cout << "\nInput:\t" << data[0];
std::cout << "\nOutput:\t";
for (float i: out) {
std::cout << i;
}
std::cout << std::endl;
}
int checkArgs(int argc, char **argv) {
/**
* Parse command line arguments
*
* @param argc: argument count
* @param argv: arguments vector
* @return int: a flag to perform operation
*/
if (argc != 2) {
std::cerr << "[ERROR]: Arguments not right!" << std::endl;
std::cerr << "./mlp -s // serialize model to plan file" << std::endl;
std::cerr << "./mlp -d // deserialize plan file and run inference" << std::endl;
return -1;
}
if (std::string(argv[1]) == "-s") {
return 1;
} else if (std::string(argv[1]) == "-d") {
return 2;
}
return -1;
}
int main(int argc, char **argv) {
int args = checkArgs(argc, argv);
if (args == 1)
performSerialization();
else if (args == 2)
performInference();
// performSerialization();
// performInference();
return 0;
}