一、 TensorRT量化
-
实际操作中,
input[float32], w[int8], bias[float32], output[float32]
-
具体操作如下:
input[int8] = to_int8(input[float32])
y[int16] = input[int8] * w[int8]  # 此处乘法由计算机转换为int16,保证精度
output[float32] = to_float32(y[int16] + bias[float32])
-
整个量化过程只是为了减少float32的乘法数量以实现提速
-
对于to_int8过程,并不是直接线性缩放,而是使用KL散度进行阈值截断,使得量化前后权重的分布差异尽可能小。
二、构建模型的区别
1. config配置setFlag
```cpp
// Enable INT8 precision mode on the builder config; requires a calibrator
// (or explicit dynamic ranges) to be supplied before building the engine.
config->setFlag(nvinfer1::BuilderFlag::kINT8);
```
2. 配置int8标定数据读取工具:用于评估量化前后的分布改变
-
实现int8熵校准器(Int8EntropyCalibrator类),继承自属于nvinfer1的IInt8EntropyCalibrator2类,覆盖其中的一些函数,实现自己的需求
-
getBatchSize:告诉引擎,这次标定的batch大小
-
getBatch:告诉引擎,这次标定的输入数据是什么,把指针赋值给bindings,返回false说明已没有数据了
-
readCalibrationCache:从缓存文件中加载标定信息,可以避免重复读取文件和预处理,若该函数返回空指针表示没有缓存,程序会重新通过getBatch进行计算
-
writeCalibrationCache:标定结束后,调用该函数,储存标定后的结果,多次标定可以使用该缓存实现加速
// int8熵校准器:用于评估量化前后的分布改变 class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator(const vector<string>& imagefiles, nvinfer1::Dims dims, const Int8Process& preprocess) { assert(preprocess != nullptr); this->dims_ = dims; this->allimgs_ = imagefiles; this->preprocess_ = preprocess; this->fromCalibratorData_ = false; files_.resize(dims.d[0]); } // 这个构造函数,是允许从缓存数据中加载标定结果,这样不用重新读取图像处理 Int8EntropyCalibrator(const vector<uint8_t>& entropyCalibratorData, nvinfer1::Dims dims, const Int8Process& preprocess) { assert(preprocess != nullptr); this->dims_ = dims; this->entropyCalibratorData_ = entropyCalibratorData; this->preprocess_ = preprocess; this->fromCalibratorData_ = true; files_.resize(dims.d[0]); } virtual ~Int8EntropyCalibrator(){ if(tensor_host_ != nullptr){ checkRuntime(cudaFreeHost(tensor_host_)); checkRuntime(cudaFree(tensor_device_)); tensor_host_ = nullptr; tensor_device_ = nullptr; } } // 想要按照多少的batch进行标定 int getBatchSize() const noexcept { return dims_.d[0]; } bool next() { int batch_size = dims_.d[0]; if (cursor_ + batch_size > allimgs_.size()) return false; for(int i = 0; i < batch_size; ++i) files_[i] = allimgs_[cursor_++]; if(tensor_host_ == nullptr){ size_t volumn = 1; for(int i = 0; i < dims_.nbDims; ++i) volumn *= dims_.d[i]; bytes_ = volumn * sizeof(float); checkRuntime(cudaMallocHost(&tensor_host_, bytes_)); checkRuntime(cudaMalloc(&tensor_device_, bytes_)); } preprocess_(cursor_, allimgs_.size(), files_, dims_, tensor_host_); checkRuntime(cudaMemcpy(tensor_device_, tensor_host_, bytes_, cudaMemcpyHostToDevice)); return true; } bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept { if (!next()) return false; bindings[0] = tensor_device_; return true; } const vector<uint8_t>& getEntropyCalibratorData() { return entropyCalibratorData_; } const void* readCalibrationCache(size_t& length) noexcept { if (fromCalibratorData_) { length = this->entropyCalibratorData_.size(); return 
this->entropyCalibratorData_.data(); } length = 0; return nullptr; } virtual void writeCalibrationCache(const void* cache, size_t length) noexcept { entropyCalibratorData_.assign((uint8_t*)cache, (uint8_t*)cache + length); } private: Int8Process preprocess_; vector<string> allimgs_; size_t batchCudaSize_ = 0; int cursor_ = 0; size_t bytes_ = 0; nvinfer1::Dims dims_; vector<string> files_; float* tensor_host_ = nullptr; float* tensor_device_ = nullptr; vector<uint8_t> entropyCalibratorData_; bool fromCalibratorData_ = false; };
-
配置int8标定数据读取工具
shared_ptr<Int8EntropyCalibrator> calib(new Int8EntropyCalibrator( {"kej.jpg"}, input_dims, preprocess )); config->setInt8Calibrator(calib.get());
3. Int8EntropyCalibrator作用
- 读取并预处理过的图像数据作为输入,进行量化标定
- 标定过程的理解:对于输入图像A,使用float32进行推理得到P1,用int8推理得到P2,调整量化阈值(缩放系数)使得P1与P2足够接近
- 开发时,使用100张左右图像即可