TensorRT/samples/common/buffers.h - BufferManager源碼研讀
buffers.h - BufferManager
buffers.h
TensorRT/samples/common/buffers.h 定義了用於緩存管理的 BufferManager。這個類別的核心是 ManagedBuffer 數據結構;ManagedBuffer 由 DeviceBuffer 及 HostBuffer 組成,而這兩者都是 GenericBuffer 的特例。GenericBuffer 則是一個 RAII class,管理著緩存的申請、釋放及查詢。本文接著上文介紹的 GenericBuffer,繼續介紹 BufferManager。
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_BUFFERS_H
#define TENSORRT_BUFFERS_H
#include "NvInfer.h"
#include "common.h"
#include "half.h"
#include <cassert>
#include <cuda_runtime_api.h>
#include <iostream>
#include <iterator>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <vector>
// NOTE: 'using namespace std;' was removed here — a using-directive at namespace scope in a
// header leaks into every translation unit that includes it. Every std name in this file is
// already explicitly qualified (std::vector, std::string, ...), so nothing depends on it.
namespace samplesCommon
{
//GenericBuffer, DeviceAllocator, DeviceFree, HostAllocator, HostFree, DeviceBuffer, HostBuffer and ManagedBuffer are omitted here
//!
//! \brief The BufferManager class handles host and device buffer allocation and deallocation.
//!
//! \details This RAII class handles host and device buffer allocation and deallocation,
//! memcpy between host and device buffers to aid with inference,
//! and debugging dumps to validate inference. The BufferManager class is meant to be
//! used to simplify buffer management and any interactions between buffers and the engine.
//!
// Handles allocation, deallocation and copying of paired host/device buffers,
// one pair per engine binding (network input/output tensor).
class BufferManager
{
public:
    // All-ones bit pattern, i.e. SIZE_MAX (18446744073709551615 on 64-bit platforms).
    // Used by size() as a "no such tensor" sentinel — no real buffer can be this large.
    static const size_t kINVALID_SIZE_VALUE = ~size_t(0);
    //!
    //! \brief Create a BufferManager for handling buffer interactions with engine.
    //!
    //! For each binding (ICudaEngine / IExecutionContext are declared in NvInferRuntime.h)
    //! this queries the binding's data type and element count, allocates a device buffer and a
    //! same-sized host buffer, and records them in mDeviceBindings and mManagedBuffers.
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, const int& batchSize,
        const nvinfer1::IExecutionContext* context = nullptr)
        : mEngine(engine)
        , mBatchSize(batchSize)
    {
        // Create host and device buffers.
        // getNbBindings(): number of tensors marked as network inputs or outputs.
        for (int i = 0; i < mEngine->getNbBindings(); i++)
        {
            // Prefer the context's binding dimensions when a context is supplied,
            // otherwise fall back to the engine's.
            auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i);
            // NOTE(review): the volume starts at 1 when a context is given — presumably the
            // context's binding dimensions already include the batch dimension, whereas the
            // engine's do not (hence the explicit mBatchSize factor) — TODO confirm against
            // the TensorRT implicit/explicit batch documentation.
            size_t vol = context ? 1 : static_cast<size_t>(mBatchSize);
            // Data type of this binding's buffer.
            nvinfer1::DataType type = mEngine->getBindingDataType(i);
            // getBindingVectorizedDim(): index of the dimension that is vectorized,
            // or -1 if the format packs only one scalar per vector.
            int vecDim = mEngine->getBindingVectorizedDim(i);
            if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector
            {
                // getBindingComponentsPerElement(): number of scalar components packed into
                // one element when the binding format is vectorized.
                int scalarsPerVec = mEngine->getBindingComponentsPerElement(i);
                // Shrink the vectorized dimension to a count of whole vectors (rounding up),
                // then multiply the volume by the vector width to restore the scalar count.
                // divUp comes from samples/common/common.h (ceiling integer division).
                dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
                vol *= scalarsPerVec;
            }
            // volume() (samples/common/common.h) multiplies all dimensions together, so
            // vol ends up as the total number of elements in this binding's buffer.
            vol *= samplesCommon::volume(dims);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            // The host buffer mirrors the device buffer: same element count and type.
            manBuf->deviceBuffer = DeviceBuffer(vol, type);
            manBuf->hostBuffer = HostBuffer(vol, type);
            // The raw device pointer is also stored in mDeviceBindings so it can be passed
            // directly to IExecutionContext::execute/enqueue.
            mDeviceBindings.emplace_back(manBuf->deviceBuffer.data());
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
    //!
    //! \brief Returns a vector of device buffers that you can use directly as
    //!        bindings for the execute and enqueue methods of IExecutionContext.
    //!
    std::vector<void*>& getDeviceBindings()
    {
        return mDeviceBindings;
    }
    //!
    //! \brief Returns a vector of device buffers.
    //!
    const std::vector<void*>& getDeviceBindings() const
    {
        return mDeviceBindings;
    }
    //!
    //! \brief Returns the device buffer corresponding to tensorName.
    //!        Returns nullptr if no such tensor can be found.
    //!
    void* getDeviceBuffer(const std::string& tensorName) const
    {
        return getBuffer(false, tensorName);
    }
    //!
    //! \brief Returns the host buffer corresponding to tensorName.
    //!        Returns nullptr if no such tensor can be found.
    //!
    void* getHostBuffer(const std::string& tensorName) const
    {
        return getBuffer(true, tensorName);
    }
    //!
    //! \brief Returns the size of the host and device buffers that correspond to tensorName.
    //!        Returns kINVALID_SIZE_VALUE if no such tensor can be found.
    //!
    // The host buffer's byte count equals the device buffer's, so returning one covers both.
    size_t size(const std::string& tensorName) const
    {
        int index = mEngine->getBindingIndex(tensorName.c_str());
        if (index == -1)
            return kINVALID_SIZE_VALUE;
        return mManagedBuffers[index]->hostBuffer.nbBytes();
    }
    //!
    //! \brief Dump host buffer with specified tensorName to ostream.
    //!        Prints error message to std::ostream if no such tensor can be found.
    //!
    void dumpBuffer(std::ostream& os, const std::string& tensorName)
    {
        int index = mEngine->getBindingIndex(tensorName.c_str());
        if (index == -1)
        {
            os << "Invalid tensor name" << std::endl;
            return;
        }
        void* buf = mManagedBuffers[index]->hostBuffer.data();
        size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes();
        nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index);
        // rowCount: number of elements printed per line — the innermost dimension when the
        // binding has one, otherwise (nbDims == 0) fall back to the batch size.
        size_t rowCount = static_cast<size_t>(bufDims.nbDims >= 1 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize);
        // First print the buffer's shape, with the batch size prepended.
        os << "[" << mBatchSize;
        for (int i = 0; i < bufDims.nbDims; i++)
            os << ", " << bufDims.d[i];
        os << "]" << std::endl;
        switch (mEngine->getBindingDataType(index))
        {
        case nvinfer1::DataType::kINT32: print<int32_t>(os, buf, bufSize, rowCount); break;
        case nvinfer1::DataType::kFLOAT: print<float>(os, buf, bufSize, rowCount); break;
        case nvinfer1::DataType::kHALF: print<half_float::half>(os, buf, bufSize, rowCount); break;
        // assert(0 && "msg") always fails, so hitting these cases aborts with the message.
        case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break;
        case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break;
        }
    }
    //!
    //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream.
    //!        rowCount parameter controls how many elements are on each line.
    //!        A rowCount of 1 means that there is only 1 element on each line.
    //!
    template <typename T>
    void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount)
    {
        // Each row must contain at least one element, and the byte size must be a
        // whole number of T elements.
        assert(rowCount != 0);
        assert(bufSize % sizeof(T) == 0);
        // View the raw buffer as an array of T.
        T* typedBuf = static_cast<T*>(buf);
        // Total number of elements in the buffer.
        size_t numItems = bufSize / sizeof(T);
        for (int i = 0; i < static_cast<int>(numItems); i++)
        {
            // Handle rowCount == 1 case
            if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
                // One element per line, not the last element: newline after it.
                os << typedBuf[i] << std::endl;
            else if (rowCount == 1)
                // One element per line, last element: no trailing newline.
                os << typedBuf[i];
            // Handle rowCount > 1 case
            else if (i % rowCount == 0)
                // First element of a row: no leading space.
                os << typedBuf[i];
            else if (i % rowCount == rowCount - 1)
                // Last element of a row: leading space plus newline.
                os << " " << typedBuf[i] << std::endl;
            else
                // Middle-of-row element: leading space only.
                os << " " << typedBuf[i];
        }
    }
    //!
    //! \brief Copy the contents of input host buffers to input device buffers synchronously.
    //!
    void copyInputToDevice()
    {
        memcpyBuffers(true, false, false);
    }
    //!
    //! \brief Copy the contents of output device buffers to output host buffers synchronously.
    //!
    void copyOutputToHost()
    {
        memcpyBuffers(false, true, false);
    }
    //!
    //! \brief Copy the contents of input host buffers to input device buffers asynchronously.
    //!
    void copyInputToDeviceAsync(const cudaStream_t& stream = 0)
    {
        memcpyBuffers(true, false, true, stream);
    }
    //!
    //! \brief Copy the contents of output device buffers to output host buffers asynchronously.
    //!
    void copyOutputToHostAsync(const cudaStream_t& stream = 0)
    {
        memcpyBuffers(false, true, true, stream);
    }
    // Buffers are owned by RAII members (unique_ptr / GenericBuffer), so the
    // compiler-generated destructor is sufficient.
    ~BufferManager() = default;
private:
    // Looks up tensorName among the engine's bindings; returns nullptr if it is not an
    // input or output, otherwise the host or device pointer selected by isHost.
    void* getBuffer(const bool isHost, const std::string& tensorName) const
    {
        // ICudaEngine::getBindingIndex (NvInferRuntime.h) retrieves the binding index
        // for a named tensor, or -1 if the name is unknown.
        int index = mEngine->getBindingIndex(tensorName.c_str());
        if (index == -1)
            return nullptr;
        return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data());
    }
    // Copies all bindings in one direction:
    //   copyInput == true  -> only input bindings are copied,
    //   copyInput == false -> only output bindings are copied;
    //   deviceToHost selects the copy direction; async selects cudaMemcpyAsync vs cudaMemcpy.
    void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream = 0)
    {
        for (int i = 0; i < mEngine->getNbBindings(); i++)
        {
            // deviceToHost decides which buffer is the destination...
            void* dstPtr
                = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data();
            // ...and which is the source.
            const void* srcPtr
                = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data();
            const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes();
            // cudaMemcpyKind matching the chosen direction.
            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
            // Only copy bindings that match the requested direction: input bindings when
            // copyInput is set, output bindings otherwise.
            // ICudaEngine::bindingIsInput (NvInferRuntime.h) tells whether binding i is an input.
            if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i)))
            {
                if (async)
                    // cudaMemcpyAsync(dst, src, count, kind, stream): asynchronous copy on stream.
                    // CHECK (samples/common/common.h) aborts on a non-success CUDA return code.
                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
                else
                    // cudaMemcpy(dst, src, count, kind): synchronous copy.
                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
            }
        }
    }
    // shared_ptr: the caller typically also holds the engine; BufferManager only needs it
    // to stay alive for the lifetime of the buffers.
    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;              //!< The pointer to the engine
    int mBatchSize;                                              //!< The batch size
    // unique_ptr: BufferManager is the sole owner of each ManagedBuffer.
    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers; //!< The vector of pointers to managed buffers
    std::vector<void*> mDeviceBindings; //!< The vector of device buffers needed for engine execution
};
} // namespace samplesCommon
#endif // TENSORRT_BUFFERS_H
static_cast
在BufferManager
的建構子中,使用static_cast
將int
型別的變數轉為size_t
型別。關於static_cast
,詳見C++ static_cast。
emplace_back
在BufferManager
的建構子內,使用了emplace_back
來將物件放入vector
末尾。至於emplace_back
跟push_back
的區別在哪呢?詳見C++ emplace_back。
unique_ptr,shared_ptr
在定義BufferManager
的私有成員變數mEngine
及mManagedBuffers
時,用到了unique_ptr
及shared_ptr
,詳見C++ smart pointer,unique_ptr,shared_ptr。
constant parameters
在BufferManager
的建構子中,將參數batchSize
及context
加上const
修飾字,根據Reference and Constant Parameters:
When you put "const" in front of a parameter,
it means that it cannot be modified in the function.
這將使得這兩個變數無法在函數裡被修改。
vector::emplace_back(move(unique_ptr))
在BufferManager
的建構子中有著這麼一句:
//std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers;
//std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
mManagedBuffers.emplace_back(std::move(manBuf));
其中的std::move
在這裡發揮的是什麼作用呢?詳見C++ move semantics。
vectorize
在BufferManager
的建構子中有著這麼一段:
int vecDim = mEngine->getBindingVectorizedDim(i);
if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector
{
int scalarsPerVec = mEngine->getBindingComponentsPerElement(i);
dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
vol *= scalarsPerVec;
}
這似乎在說在TensorRT的內部機制中,會將tensor的某一個維度給向量化,但目前並未查到相關資料,此部份將在有新資料佐證後進行更新。
assert(0 && "xxx")
在BufferManager::dumpBuffer
中使用assert(0 && "xxx")
的寫法,這是什麼意思呢?詳見C++ assert(0)。
= default
在定義BufferManager
的destructor時用到了= default
,這代表使用編譯器自動生成的destructor。使用"defaulted special member function"使得BufferManager
仍是一個trivial type。詳見C++ Explicitly defaulted function。
參考連結
CUDA Runtime API - 5.9. Memory Management