Introduction to Quantization
Model quantization falls into three categories: dynamic quantization, static quantization, and quantization-aware training.
Dynamic quantization: only the weights are quantized (which can introduce a relatively large accuracy loss).
Static quantization: both the weights and the activations of every quantized op are quantized (quantizing the activations with calibration data reduces the accuracy loss).
Quantization-aware training: quantization is simulated during the training process itself.
This article focuses on static quantization.
Int8 quantization methods
To map float32 data onto int8, simply casting to int is clearly not viable: int8 only covers the range [-128, 127].
There are two mapping schemes: non-saturating and saturating.
Non-saturating mapping
As shown in the figure, ±|max| of the float data is mapped directly to -127 and 127, and the values in between are placed linearly into that interval according to the resulting mapping. This brute-force approach inevitably costs precision.
Saturating mapping
Instead of mapping ±|max| to ±127, this scheme picks a threshold |T| and maps ±|T| to ±127; clearly |T| < |max|.
Values beyond the threshold ±|T| are clamped directly to ±127. For example, the three red points in the figure above all map to -127.
This mapping relationship is called saturating (saturate).
As long as the threshold is chosen well, the scattered, overly large activation values are discarded, and the accuracy loss can be kept small.
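To make the mapping concrete, here is a minimal C++ sketch of saturating int8 quantization (the function name and threshold argument are ours, not MNN's):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Saturating int8 quantization: +-|T| maps to +-127, values beyond +-|T| are clamped.
int8_t quantizeSaturate(float x, float T) {
    float scale = T / 127.0f;                   // float units per int8 step
    float q = std::round(x / scale);            // linear mapping
    q = std::max(-127.0f, std::min(127.0f, q)); // saturate the outliers
    return static_cast<int8_t>(q);
}
// The non-saturating variant is the special case T = |max|: nothing gets clamped.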
How is a suitable threshold chosen? The following procedure is used:
- Collect a histogram of the activation values;
- Generate a quantized distribution for each candidate threshold;
- Compute the relative entropy (KL divergence) between each quantized distribution and the original one, and pick the threshold whose divergence is smallest, i.e. the distribution that most resembles the original.
NVIDIA's experiments show that this scheme brings little benefit when applied to weights, but improves accuracy considerably when applied to activations.
Analysis of the MNN quantization code
Entry point of the quantization tool
int main(int argc, const char* argv[]) {
    if (argc < 4) {
        DLOG(INFO) << "Usage: ./quantized.out src.mnn dst.mnn preTreatConfig.json\n";
        return 0;
    }
    const char* modelFile = argv[1];
    const char* preTreatConfig = argv[3];
    const char* dstFile = argv[2];
    DLOG(INFO) << ">>> modelFile: " << modelFile;
    DLOG(INFO) << ">>> preTreatConfig: " << preTreatConfig;
    DLOG(INFO) << ">>> dstFile: " << dstFile;
    // load the model to be quantized
    std::unique_ptr<MNN::NetT> netT;
    {
        std::ifstream input(modelFile);
        std::ostringstream outputOs;
        outputOs << input.rdbuf();
        netT = MNN::UnPackNet(outputOs.str().c_str());
    }
    // temp build net for inference
    flatbuffers::FlatBufferBuilder builder(1024);
    auto offset = MNN::Net::Pack(builder, netT.get());
    builder.Finish(offset);
    int size = builder.GetSize();
    auto ocontent = builder.GetBufferPointer();
    // model buffer for creating mnn Interpreter
    std::unique_ptr<uint8_t> modelForInference(new uint8_t[size]);
    memcpy(modelForInference.get(), ocontent, size);
    std::unique_ptr<uint8_t> modelOriginal(new uint8_t[size]);
    memcpy(modelOriginal.get(), ocontent, size);
    netT.reset();
    netT = MNN::UnPackNet(modelOriginal.get());
    // quantize model's weight
    DLOG(INFO) << "Calibrate the feature and quantize model...";
    // build the Calibration object, which drives the quantization
    std::shared_ptr<Calibration> calibration(
        new Calibration(netT.get(), modelForInference.get(), size, preTreatConfig));
    // run the quantization and update the parameters to int8
    calibration->runQuantizeModel();
    // dump the quantization parameters to a json file
    calibration->dumpTensorScales(dstFile);
    DLOG(INFO) << "Quantize model done!";
    // save the quantized model
    flatbuffers::FlatBufferBuilder builderOutput(1024);
    builderOutput.ForceDefaults(true);
    auto len = MNN::Net::Pack(builderOutput, netT.get());
    builderOutput.Finish(len);
    {
        std::ofstream output(dstFile);
        output.write((const char*)builderOutput.GetBufferPointer(), builderOutput.GetSize());
    }
    return 0;
}
Building the Calibration object
A json configuration file must be passed in for quantization, as shown below.
{"format":"RGB",
"mean":[127.5,127.5,127.5],
"normal":[0.00784314,0.00784314,0.00784314],
"width":224,
"height":224,
"path":"path/to/images/",
"used_image_num":500,
"feature_quantize_method":"KL",
"weight_quantize_method":"MAX_ABS"
}
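With such a config file on disk, the tool is invoked exactly as the usage string in main indicates (the file names here are placeholders):

./quantized.out src.mnn dst.mnn preTreatConfig.json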
The Calibration constructor
It does three main things:
1. Parse the json configuration file.
2. Initialize the sessions.
3. Initialize the tensor maps.
Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
    : _originaleModel(model) {
    // when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
    int channles = 3;
    // parse the quantization json config file
    rapidjson::Document document;
    {
        std::ifstream fileNames(configPath.c_str());
        std::ostringstream output;
        output << fileNames.rdbuf();
        auto outputStr = output.str();
        document.Parse(outputStr.c_str());
        if (document.HasParseError()) {
            MNN_ERROR("Invalid json\n");
            return;
        }
    }
    auto picObj = document.GetObject();
    ImageProcess::Config config;
    config.filterType = BILINEAR;
    config.destFormat = BGR;
    {
        if (picObj.HasMember("format")) {
            auto format = picObj["format"].GetString();
            static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}, {"RGBA", RGBA}, {"BGRA", BGRA}};
            if (formatMap.find(format) != formatMap.end()) {
                config.destFormat = formatMap.find(format)->second;
            }
        }
    }
    switch (config.destFormat) {
        case GRAY:
            channles = 1;
            break;
        case RGB:
        case BGR:
            channles = 3;
            break;
        case RGBA:
        case BGRA:
            channles = 4;
            break;
        default:
            break;
    }
    // fill config with the parameters from the config file
    config.sourceFormat = RGBA;
    std::string imagePath;
    _imageNum = 0;
    {
        if (picObj.HasMember("mean")) {
            auto mean = picObj["mean"].GetArray();
            int cur = 0;
            for (auto iter = mean.begin(); iter != mean.end(); iter++) {
                config.mean[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("normal")) {
            auto normal = picObj["normal"].GetArray();
            int cur = 0;
            for (auto iter = normal.begin(); iter != normal.end(); iter++) {
                config.normal[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("width")) {
            _width = picObj["width"].GetInt();
        }
        if (picObj.HasMember("height")) {
            _height = picObj["height"].GetInt();
        }
        if (picObj.HasMember("path")) {
            imagePath = picObj["path"].GetString();
        }
        if (picObj.HasMember("used_image_num")) {
            _imageNum = picObj["used_image_num"].GetInt();
        }
        if (picObj.HasMember("feature_quantize_method")) {
            std::string method = picObj["feature_quantize_method"].GetString();
            if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
                _featureQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
                return;
            }
        }
        if (picObj.HasMember("weight_quantize_method")) {
            std::string method = picObj["weight_quantize_method"].GetString();
            if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
                _weightQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
                return;
            }
        }
        DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
        DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
        if (picObj.HasMember("feature_clamp_value")) {
            float value = (int)picObj["feature_clamp_value"].GetFloat();
            if (value < 0.0f || value > 127.0f) {
                MNN_ERROR("feature_clamp_value should be in (0, 127], got: %f\n", value);
                return;
            }
            _featureClampValue = value;
        }
        if (picObj.HasMember("weight_clamp_value")) {
            float value = (int)picObj["weight_clamp_value"].GetFloat();
            if (value < 0.0f || value > 127.0f) {
                MNN_ERROR("weight_clamp_value should be in (0, 127], got: %f\n", value);
                return;
            }
            _weightClampValue = value;
        }
        DLOG(INFO) << "feature_clamp_value: " << _featureClampValue;
        DLOG(INFO) << "weight_clamp_value: " << _weightClampValue;
        if (picObj.HasMember("skip_quant_op_names")) {
            auto skip_quant_op_names = picObj["skip_quant_op_names"].GetArray();
            for (auto iter = skip_quant_op_names.begin(); iter != skip_quant_op_names.end(); iter++) {
                std::string skip_quant_op_name = iter->GetString();
                _skip_quant_ops.emplace_back(skip_quant_op_name);
                DLOG(INFO) << "skip quant op name: " << skip_quant_op_name;
            }
        }
        if (picObj.HasMember("debug")) {
            _debug = picObj["debug"].GetBool();
        }
    }
    std::shared_ptr<ImageProcess> process(ImageProcess::create(config));
    _process = process;
    // read images file names
    Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
    _initMNNSession(modelBuffer, bufferSize, channles);
    _initMaps();
}
- Initializing the MNN sessions
This works the same way as a normal MNN inference setup, so it is not covered in detail here.
Two interpreters and two sessions are created: one pair for the original float model, one for the model with fake-quantized weights.
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
    _interpreterOrigin.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
    MNN::ScheduleConfig config;
    _sessionOrigin = _interpreterOrigin->createSession(config);
    _inputTensorOrigin = _interpreterOrigin->getSessionInput(_sessionOrigin, NULL);
    // fake-quantize the weights
    _fake_quant_weights();
    flatbuffers::FlatBufferBuilder builder(1024);
    auto offset = MNN::Net::Pack(builder, _originaleModel);
    builder.Finish(offset);
    int size = builder.GetSize();
    auto buffer = builder.GetBufferPointer();
    _interpreter.reset(MNN::Interpreter::createFromBuffer(buffer, size));
    _session = _interpreter->createSession(config);
    _inputTensor = _interpreter->getSessionInput(_session, NULL);
    _inputTensorDims.resize(4);
    auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
    if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = _height;
        _inputTensorDims[2] = _width;
        _inputTensorDims[3] = channels;
    } else {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = channels;
        _inputTensorDims[2] = _height;
        _inputTensorDims[3] = _width;
    }
    if (_featureQuantizeMethod == "KL") {
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
        _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims);
        _interpreterOrigin->resizeSession(_sessionOrigin);
    } else if (_featureQuantizeMethod == "ADMM") {
        DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
        _inputTensorDims[0] = _imageNum;
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
        _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims);
        _interpreterOrigin->resizeSession(_sessionOrigin);
    }
}
The _fake_quant_weights() function fake-quantizes the weight parameters of _originaleModel, using the non-saturating (max-abs) mapping.
void Calibration::_fake_quant_weights() {
    // lambda that finds the maximum absolute value among the weights
    auto findAbsMax = [&] (const float* weights, const int size) {
        float absMax = 0;
        for (int i = 0; i < size; i++) {
            if (std::fabs(weights[i]) > absMax) {
                absMax = std::fabs(weights[i]);
            }
        }
        return absMax;
    };
    for (const auto& op : _originaleModel->oplists) {
        // skip ops explicitly excluded from quantization, and everything that is not a convolution
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name);
        if (iter != _skip_quant_ops.end()) {
            continue;
        }
        const auto opType = op->type;
        if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise) {
            continue;
        }
        auto param = op->main.AsConvolution2D();
        const int kernelNum = param->common->outputCount;
        std::vector<float> weights = param->weight;
        const int weightSize = weights.size();
        const int kernelSize = weightSize / kernelNum;
        // quantize each kernel separately
        for (int i = 0; i < kernelNum; i++) {
            const int offset = i * kernelSize;
            float absMax = findAbsMax(weights.data() + offset, kernelSize);
            float scale = absMax / _weightClampValue; // derive the scale factor from the max-abs value
            if (absMax < 1e-6f) {
                scale = absMax;
            }
            for (int j = 0; j < kernelSize; j++) {
                float value = weights[offset + j];
                float quantValue = std::round(value / scale); // the quantized int8 value
                float clampedValue = std::max(std::min(quantValue, _weightClampValue), -_weightClampValue); // anything beyond +-127 is clamped to +-127
                float dequantValue = scale * clampedValue;
                param->weight[offset + j] = dequantValue; // dequantize back to float
            }
        }
    }
}
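A quick worked example with numbers of our own choosing: suppose a kernel's absMax is 0.635 and _weightClampValue is 127, so scale = 0.635 / 127 = 0.005. A weight of 0.0123 becomes round(0.0123 / 0.005) = 2 after quantization, survives the clamp to [-127, 127] unchanged, and is written back as 2 * 0.005 = 0.01. The weights remain floats, but they now only take values exactly representable in int8; that is what makes the quantization "fake".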
- Initializing the tensor maps
void Calibration::_initMaps() {
    _featureInfo.clear();
    _featureInfoOrigin.clear();
    _opInfo.clear(); // records the input/output tensors of every op, gathered while running the session
    _tensorMap.clear();
    // run mnn once, initialize featureMap, opInfo map
    // allocate a TensorStatistic object for every input and output tensor, and build the mappings
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        std::string opName = info->name();
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), opName);
        if (iter != _skip_quant_ops.end()) {
            return false;
        }
        _opInfo[opName].first = nTensors;
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            int i = 0;
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] = std::shared_ptr<TensorStatistic>(
                        new TensorStatistic(t, _featureQuantizeMethod, opName + " input_tensor_" + flatbuffers::NumToString(i), _featureClampValue));
                }
                i++;
            }
        }
        return false;
    };
    MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
                                               const MNN::OperatorInfo* info) {
        std::string opName = info->name();
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), opName);
        if (iter != _skip_quant_ops.end()) {
            return true;
        }
        _opInfo[opName].second = nTensors;
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            int i = 0;
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] =
                        std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, opName + " output_tensor_" + flatbuffers::NumToString(i), _featureClampValue));
                }
                i++;
            }
        }
        return true;
    };
    _interpreter->runSessionWithCallBackInfo(_session, before, after);
    MNN::TensorCallBackWithInfo beforeOrigin = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        std::string opName = info->name();
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), opName);
        if (iter != _skip_quant_ops.end()) {
            return false;
        }
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            int i = 0;
            for (auto t : nTensors) {
                if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) {
                    _featureInfoOrigin[t] = std::shared_ptr<TensorStatistic>(
                        new TensorStatistic(t, _featureQuantizeMethod, opName + " input_tensor_" + flatbuffers::NumToString(i), _featureClampValue));
                }
                i++;
            }
        }
        return false;
    };
    MNN::TensorCallBackWithInfo afterOrigin = [this](const std::vector<MNN::Tensor*>& nTensors,
                                                     const MNN::OperatorInfo* info) {
        std::string opName = info->name();
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), opName);
        if (iter != _skip_quant_ops.end()) {
            return true;
        }
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            int i = 0;
            for (auto t : nTensors) {
                if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) {
                    _featureInfoOrigin[t] =
                        std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, opName + " output_tensor_" + flatbuffers::NumToString(i), _featureClampValue));
                }
                i++;
            }
        }
        return true;
    };
    _interpreterOrigin->runSessionWithCallBackInfo(_sessionOrigin, beforeOrigin, afterOrigin);
    /**
     * _tensorMap maps each tensor index to the corresponding input/output tensor of each op.
     * _opInfo holds the tensors recorded during the session run above.
     **/
    for (auto& op : _originaleModel->oplists) {
        if (_opInfo.find(op->name) == _opInfo.end()) {
            continue;
        }
        for (int i = 0; i < op->inputIndexes.size(); ++i) {
            _tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
        }
        for (int i = 0; i < op->outputIndexes.size(); ++i) {
            _tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
        }
    }
    if (_featureQuantizeMethod == "KL") {
        // set the tensor-statistic method of input tensor as THRESHOLD_MAX
        auto inputTensorStatistic = _featureInfo.find(_inputTensor);
        if (inputTensorStatistic != _featureInfo.end()) {
            inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
        }
    }
}
As the code shows, the fake-quantized session and the original session each run one inference with before- and after-execution callbacks attached, and every activation tensor of the relevant ops is given its own TensorStatistic.
Note that it is _session, not _sessionOrigin, that carries the fake-quantized parameters; the input and output tensors of every op are then filled into _tensorMap, keyed by tensor index.
If the KL method is used, the statistic of the input tensor has its threshold method set to THRESHOLD_MAX.
- TensorStatistic
class TensorStatistic {
public:
    TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, float featureClampValue, int binNumber = 2048, GET_THRESHOLD_METHOD thresholdMethod = THRESHOLD_KL);
    ~TensorStatistic() {
        // Do nothing
    }
    void resetUpdatedDistributionFlag() {
        mUpdatedDistributionFlag = false;
    }
    void resetUpdatedRangeFlags() {
        mUpdatedRangeFlags = false;
    }
    void updateRange();
    void resetDistribution();
    void updateDistribution();
    void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod);
    void setChannelWise(bool mergeChannel);
    std::vector<float> finishAndCompute();
    // only this one for ADMM
    std::vector<float> computeScaleADMM();
    std::string name() {
        return mName;
    }
    bool visited() {
        return mVisited;
    }
    void setVisited(bool visited) {
        mVisited = visited;
    }
    std::pair<std::vector<float>, float> fakeQuantFeature();
    float computeDistance(std::vector<float> fakeQuantedFeature);

private:
    int _computeThreshold(const std::vector<float>& distribution);
    std::vector<std::pair<float, float>> mRangePerChannel;
    std::vector<float> mIntervals;
    std::vector<bool> mValidChannel;
    std::vector<std::vector<float>> mDistribution;
    std::shared_ptr<MNN::Tensor> mHostTensor;
    const MNN::Tensor* mOriginTensor;
    int mBinNumber;
    bool mUpdatedDistributionFlag = false;
    bool mUpdatedRangeFlags = false;
    bool mMergeChannel = true;
    std::string mName;
    GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL;
    bool mVisited = false;
    std::vector<float> mScales;
    float mFeatureClampValue = 127.0f;
};
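Calibration drives a TensorStatistic through a fixed call sequence; schematically (our summary of the calls made in the sections that follow, not MNN code):

// Pass 1 over the calibration images: track per-channel min/max.
//     run session with callbacks -> stat->updateRange();
// Turn the ranges into histogram intervals (2048 bins by default).
//     stat->resetDistribution();
// Pass 2 over the calibration images: fill the histogram.
//     run session with callbacks -> stat->updateDistribution();
// KL-search for the threshold and derive the per-tensor scale.
//     std::vector<float> scales = stat->finishAndCompute();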
This completes the construction of the Calibration object.
Running the quantization
The entry point is runQuantizeModel.
void Calibration::runQuantizeModel() {
    if (_featureQuantizeMethod == "KL") {
        _computeFeatureScaleKL();
    } else if (_featureQuantizeMethod == "ADMM") {
        _computeFeatureScaleADMM();
    }
    if (_debug) {
        _computeQuantError();
    }
    _updateScale();
    // For ops that do not support quantization: dequantize their inputs back to float32,
    // and quantize their outputs again if the following op needs quantized input.
    _insertDequantize();
}
Computing the optimal threshold
Taking the KL method as the example, look at the _computeFeatureScaleKL function.
void Calibration::_computeFeatureScaleKL() {
    // collect the per-channel distribution histograms of the data
    _computeFeatureMapsRange();
    _collectFeatureMapsDistribution();
    _scales.clear();
    for (auto& iter : _featureInfo) {
        AUTOTIME;
        _scales[iter.first] = iter.second->finishAndCompute();
    }
    //_featureInfo.clear();//No need now
}
- Collecting the data distribution
First, the per-channel value range of the activations is collected.
void Calibration::_computeFeatureMapsRange() {
    // feed input data according to input images
    int count = 0;
    for (const auto& img : _imgaes) {
        for (auto& iter : _featureInfo) {
            iter.second->setVisited(false);
        }
        for (auto& iter : _featureInfo) {
            iter.second->resetUpdatedRangeFlags();
        }
        count++;
        Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
        MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                 const MNN::OperatorInfo* info) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    if (_featureInfo[t]->visited() == false) {
                        _featureInfo[t]->updateRange();
                    }
                }
            }
            return true;
        };
        MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                const MNN::OperatorInfo* info) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    if (_featureInfo[t]->visited() == false) {
                        _featureInfo[t]->updateRange();
                    }
                }
            }
            return true;
        };
        _interpreter->runSessionWithCallBackInfo(_session, before, after);
        MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
}
Here it is _session, the one with the fake-quantized weights, that runs; through the callbacks, the minimum and maximum of every input and output tensor are recorded.
void TensorStatistic::updateRange() {
    // from the inference results, obtain the min and max of each feature-map channel
    if (mUpdatedRangeFlags) {
        return;
    }
    mUpdatedRangeFlags = true;
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    int batch = mHostTensor->batch();
    int channel = mHostTensor->channel();
    int width = mHostTensor->width();
    int height = mHostTensor->height();
    auto area = width * height;
    for (int n = 0; n < batch; ++n) {
        auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channel; ++c) {
            int cIndex = c;
            if (mMergeChannel) {
                cIndex = 0;
            }
            auto minValue = mRangePerChannel[cIndex].first;
            auto maxValue = mRangePerChannel[cIndex].second;
            auto dataChannel = dataBatch + c * mHostTensor->stride(1);
            for (int v = 0; v < area; ++v) {
                minValue = std::min(minValue, dataChannel[v]);
                maxValue = std::max(maxValue, dataChannel[v]);
            }
            mRangePerChannel[cIndex].first = minValue;
            mRangePerChannel[cIndex].second = maxValue;
        }
    }
    mVisited = true;
}
A distribution histogram is then built over each tensor's value range.
void Calibration::_collectFeatureMapsDistribution() {
    for (auto& iter : _featureInfo) {
        iter.second->resetDistribution();
    }
    // feed input data according to input images
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                if (_featureInfo[t]->visited() == false) {
                    _featureInfo[t]->updateDistribution();
                }
            }
        }
        return true;
    };
    MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                if (_featureInfo[t]->visited() == false) {
                    _featureInfo[t]->updateDistribution();
                }
            }
        }
        return true;
    };
    int count = 0;
    for (const auto& img : _imgaes) {
        count++;
        for (auto& iter : _featureInfo) {
            iter.second->setVisited(false);
        }
        for (auto& iter : _featureInfo) {
            iter.second->resetUpdatedDistributionFlag();
        }
        Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
        _interpreter->runSessionWithCallBackInfo(_session, before, after);
        MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
}
First, every feature-map tensor is traversed and its distribution histogram is initialized.
Then inference is run over all calibration images; again two callbacks are attached, which gather statistics on the input and output tensors and update their distributions.
Look at the histogram initialization first: from each channel's maximum absolute value, it computes the interval factor that maps values onto the 0~2048 bins.
void TensorStatistic::resetDistribution() {
    for (int i = 0; i < mIntervals.size(); ++i) {
        int cIndex = i;
        if (mMergeChannel) {
            cIndex = 0;
        }
        // maximum absolute value on this channel
        auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
        mValidChannel[cIndex] = maxValue > 0.00001f;
        mIntervals[cIndex] = 0.0f;
        if (mValidChannel[cIndex]) {
            // mIntervals is the number of bins per unit value when the original floats
            // are mapped uniformly onto [0, 2048)
            // e.g. maxValue = 256.f gives intervals = 8: 1.f falls into bin 8, 2.f into bin 16
            mIntervals[cIndex] = (float)mBinNumber / maxValue;
        }
    }
    // the distribution of every channel is initialized to a tiny value
    for (auto& c : mDistribution) {
        std::fill(c.begin(), c.end(), 1.0e-07);
    }
    // MNN_PRINT("==> %s max: %f\n", mName.c_str(),std::max(fabsf(mRangePerChannel[0].second),
    // fabsf(mRangePerChannel[0].first)));
}
While inference runs, the distribution is updated as follows.
void TensorStatistic::updateDistribution() {
    if (mUpdatedDistributionFlag) {
        return;
    }
    mUpdatedDistributionFlag = true;
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    int batch = mHostTensor->batch();
    int channel = mHostTensor->channel();
    int width = mHostTensor->width();
    int height = mHostTensor->height();
    auto area = width * height;
    for (int n = 0; n < batch; ++n) {
        auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channel; ++c) {
            int cIndex = c;
            if (mMergeChannel) {
                cIndex = 0;
            }
            if (!mValidChannel[cIndex]) {
                continue;
            }
            auto multi = mIntervals[cIndex];
            auto target = mDistribution[cIndex].data();                // the distribution histogram
            auto dataChannel = dataBatch + c * mHostTensor->stride(1); // the raw data
            for (int v = 0; v < area; ++v) {
                auto data = dataChannel[v];
                if (data == 0) {
                    continue;
                }
                int index = static_cast<int>(fabs(data) * multi);
                // find which of the 2048 bins this raw value maps to
                index = std::min(index, mBinNumber - 1);
                target[index] += 1.0f; // increment that bin's count
            }
        }
    }
}
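A worked example using the numbers from the comment in resetDistribution: with maxValue = 256, the factor is multi = 2048 / 256 = 8, so a value of 3.2 lands in bin static_cast<int>(3.2 * 8) = 25, while anything of magnitude 256 or larger is clipped into the last bin, 2047.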
- Computing the quantization threshold
std::vector<float> TensorStatistic::finishAndCompute() {
    std::vector<float> scaleValue(mDistribution.size(), 0.0f);
    if (mMergeChannel) {
        if (!mValidChannel[0]) {
            return scaleValue;
        }
        float sum = 0.0f;
        auto& distribution = mDistribution[0];
        std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
        std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; });
        auto threshold = _computeThreshold(distribution);
        auto scale = ((float)threshold + 0.5) / mIntervals[0] / mFeatureClampValue;
        // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], scale * mFeatureClampValue);
        std::fill(scaleValue.begin(), scaleValue.end(), scale);
        mScales = scaleValue;
        return scaleValue;
    }
    for (int c = 0; c < mDistribution.size(); ++c) {
        if (!mValidChannel[c]) {
            continue;
        }
        float sum = 0.0f;
        auto& distribution = mDistribution[c];
        // sum the counts over all bins
        std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
        // divide each bin by the total: each bin now holds a probability
        std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; });
        auto threshold = _computeThreshold(distribution);
        // from the computed threshold, derive the scale for this tensor
        scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / mFeatureClampValue;
    }
    return scaleValue;
}
As the code shows, the histogram is first normalized into a probability distribution, and _computeThreshold then derives the mapping threshold.
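Before diving into _computeThreshold, it helps to read the scale formula above with concrete numbers (ours): with mIntervals = 8 (i.e. maxValue = 256) and a hypothetical returned threshold of 1024 bins, scale = (1024 + 0.5) / 8 / 127 ≈ 1.008. In other words, (threshold + 0.5) / mIntervals ≈ 128.06 is the float-domain clipping threshold |T|, and dividing it by mFeatureClampValue = 127 gives the float value of one int8 step.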
int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
    const int targetBinNums = 128;
    int threshold = targetBinNums;
    if (mThresholdMethod == THRESHOLD_KL) {
        float minKLDivergence = 10000.0f;
        float afterThresholdSum = 0.0f;
        // sum of the probability mass beyond bin 128 (i.e. beyond the threshold)
        std::for_each(distribution.begin() + targetBinNums, distribution.end(),
                      [&](float n) { afterThresholdSum += n; });
        for (int i = targetBinNums; i < mBinNumber; ++i) { // search from 128 to 2048 for the new threshold
            std::vector<float> quantizedDistribution(targetBinNums);
            std::vector<float> candidateDistribution(i);
            std::vector<float> expandedDistribution(i);
            // the distribution over bins 0 ~ i-1; all mass beyond is accumulated
            // onto bin i-1, which is the saturating mapping
            std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
            candidateDistribution[i - 1] += afterThresholdSum;
            afterThresholdSum -= distribution[i];
            // the bin interval when the i bins are re-mapped onto targetBinNums bins
            const float binInterval = (float)i / (float)targetBinNums;
            // merge i bins to target bins
            // the distribution over [0, i) is merged into a space of targetBinNums bins
            // and stored in quantizedDistribution
            for (int j = 0; j < targetBinNums; ++j) {
                // [start, end] is the source range corresponding to bin j of the 128-bin histogram
                const float start = j * binInterval;
                const float end = start + binInterval;
                // data straddling either boundary is accumulated into the current bin,
                // weighted by the fraction that falls inside
                const int leftUpper = static_cast<int>(std::ceil(start));
                if (leftUpper > start) {
                    const float leftScale = leftUpper - start;
                    quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
                }
                const int rightLower = static_cast<int>(std::floor(end));
                if (rightLower < end) {
                    const float rightScale = end - rightLower;
                    quantizedDistribution[j] += rightScale * distribution[rightLower];
                }
                // accumulate the data fully inside the range
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
                              [&](float n) { quantizedDistribution[j] += n; });
            }
            // expand target bins to i bins
            // map the [0, targetBinNums) space back onto the [0, i) space
            for (int j = 0; j < targetBinNums; ++j) {
                const float start = j * binInterval;
                const float end = start + binInterval;
                float count = 0;
                const int leftUpper = static_cast<int>(std::ceil(start));
                float leftScale = 0.0f;
                if (leftUpper > start) {
                    leftScale = leftUpper - start;
                    if (distribution[leftUpper - 1] != 0) {
                        count += leftScale;
                    }
                }
                const int rightLower = static_cast<int>(std::floor(end));
                float rightScale = 0.0f;
                if (rightLower < end) {
                    rightScale = end - rightLower;
                    if (distribution[rightLower] != 0) {
                        count += rightScale;
                    }
                }
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
                    if (n != 0) {
                        count += 1;
                    }
                });
                if (count == 0) {
                    continue;
                }
                const float toExpandValue = quantizedDistribution[j] / count;
                if (leftUpper > start && distribution[leftUpper - 1] != 0) {
                    expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
                }
                if (rightLower < end && distribution[rightLower] != 0) {
                    expandedDistribution[rightLower] += toExpandValue * rightScale;
                }
                for (int k = leftUpper; k < rightLower; ++k) {
                    if (distribution[k] != 0) {
                        expandedDistribution[k] += toExpandValue;
                    }
                }
            }
            // compute the KL divergence between the two distributions and record the minimum
            const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
            // std::cout << "=====> KL: " << i << " ==> " << curKL << std::endl;
            if (curKL < minKLDivergence) {
                minKLDivergence = curKL;
                threshold = i;
            }
        }
    } else if (mThresholdMethod == THRESHOLD_MAX) {
        threshold = mBinNumber - 1;
    } else {
        // TODO, support other method
        MNN_ASSERT(false);
    }
    // the threshold that minimizes the KL divergence is the optimum we are looking for
    return threshold;
}
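The helper _klDivergence is not listed in this walkthrough; a minimal sketch of what it computes, the standard discrete KL divergence (our own implementation, not necessarily MNN's exact code), is:

#include <cmath>
#include <vector>

// KL(p || q) = sum_i p[i] * log(p[i] / q[i]), skipping bins where either side is empty.
static float klDivergence(const std::vector<float>& p, const std::vector<float>& q) {
    float result = 0.0f;
    const int size = static_cast<int>(p.size());
    for (int i = 0; i < size; ++i) {
        if (p[i] > 0.0f && q[i] > 0.0f) {
            result += p[i] * std::log(p[i] / q[i]);
        }
    }
    return result;
}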
Updating the scales
Note: this function updates _originaleModel, i.e. the model whose parameters are being quantized.
void Calibration::_updateScale() {
    for (const auto& op : _originaleModel->oplists) {
        std::vector<std::string>::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name);
        if (iter != _skip_quant_ops.end()) {
            continue;
        }
        const auto opType = op->type;
        if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
            opType != MNN::OpType_Eltwise) {
            continue;
        }
        auto tensorsPair = _opInfo.find(op->name);
        if (tensorsPair == _opInfo.end()) {
            MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
        }
        if (opType == MNN::OpType_Eltwise) {
            auto param = op->main.AsEltwise();
            // Now only support AddInt8
            if (param->type != MNN::EltwiseType_SUM) {
                continue;
            }
            const auto& inputScale0 = _scales[tensorsPair->second.first[0]];
            const auto& inputScale1 = _scales[tensorsPair->second.first[1]];
            const auto& outputScale = _scales[tensorsPair->second.second[0]];
            const int outputScaleSize = outputScale.size();
            std::vector<float> outputInvertScale(outputScaleSize);
            Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
            op->type = MNN::OpType_EltwiseInt8;
            op->main.Reset();
            op->main.type = MNN::OpParameter_EltwiseInt8;
            auto eltwiseInt8Param = new MNN::EltwiseInt8T;
            auto input0ScaleParam = new MNN::QuantizedFloatParamT;
            auto input1ScaleParam = new MNN::QuantizedFloatParamT;
            auto outputScaleParam = new MNN::QuantizedFloatParamT;
            input0ScaleParam->tensorScale = inputScale0;
            input1ScaleParam->tensorScale = inputScale1;
            outputScaleParam->tensorScale = outputInvertScale;
            eltwiseInt8Param->inputQuan0 = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
            eltwiseInt8Param->inputQuan1 = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
            eltwiseInt8Param->outputQuan = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
            op->main.value = eltwiseInt8Param;
            continue;
        }
        // below is Conv/DepthwiseConv
        const auto& inputScale = _scales[tensorsPair->second.first[0]];
        const auto& outputScale = _scales[tensorsPair->second.second[0]];
        auto param = op->main.AsConvolution2D();
        const int channles = param->common->outputCount;
        const int weightSize = param->weight.size();
        param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);
        // quantizedParam is a reference to param->symmetricQuan
        auto& quantizedParam = param->symmetricQuan;
        quantizedParam->scale.resize(channles);
        quantizedParam->weight.resize(weightSize);
        quantizedParam->bias.resize(channles);
        if (opType == MNN::OpType_Convolution) {
            QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
                                   quantizedParam->weight.data(), quantizedParam->bias.data(),
                                   quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue);
            op->type = MNN::OpType_ConvInt8;
        } else if (opType == MNN::OpType_ConvolutionDepthwise) {
            QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
                                  quantizedParam->weight.data(), quantizedParam->bias.data(),
                                  quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue);
            op->type = MNN::OpType_DepthwiseConvInt8;
        }
        if (param->common->relu6) {
            param->common->relu = true;
            param->common->relu6 = false;
        }
        // drop the original float weights and bias
        param->weight.clear();
        param->bias.clear();
    }
}
As the code shows, the weights are now updated using the scales computed above. Let's continue the analysis with QuantizeConvPerChannel as the example.
int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight,
                           int32_t* quantizedBias, float* scale, const std::vector<float>& inputScale,
                           const std::vector<float>& outputScale, std::string method, float weightClampValue, bool mergeChannel) {
    const int inputChannels = inputScale.size();
    const int outputChannels = outputScale.size();
    const int icXoc = inputChannels * outputChannels;
    DCHECK(size % icXoc == 0) << "Input Data Size Error!";
    std::vector<float> quantizedWeightScale(outputChannels);
    float inputScalexWeight = 1.0f;
    if (mergeChannel) {
        if (method == "MAX_ABS"){
            SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue);
        }
        else if (method == "ADMM") {
            QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue);
        }
        inputScalexWeight = inputScale[0];
    } else {
        const int kernelSize = size / icXoc;
        const int ocStride = size / outputChannels;
        // multiply every weight by the input scale of its input channel
        std::vector<float> weightMultiByInputScale(size);
        for (int oc = 0; oc < outputChannels; ++oc) {
            for (int ic = 0; ic < inputChannels; ++ic) {
                for (int i = 0; i < kernelSize; ++i) {
                    const int index = oc * ocStride + ic * kernelSize + i;
                    weightMultiByInputScale[index] = inputScale[ic] * weight[index];
                }
            }
        }
        if (method == "MAX_ABS"){
            SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue);
        }
        else if (method == "ADMM") {
            QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue);
        }
    }
    for (int i = 0; i < outputChannels; ++i) {
        if (fabs(outputScale[i]) <= 1e-6) {
            scale[i] = 0.0f;
        } else {
            scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale[0];
        }
    }
    if (bias) {
        for (int i = 0; i < outputChannels; ++i) {
            if (fabs(inputScalexWeight) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) {
                quantizedBias[i] = 0;
            } else {
                quantizedBias[i] = static_cast<int32_t>(bias[i] / (inputScalexWeight * quantizedWeightScale[i]));
            }
        }
    }
    return 0;
}
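The arithmetic encoded above, written out: per output channel c, the requantization factor is scale[c] = inputScale * quantizedWeightScale[c] / outputScale, so that the int32 accumulator of an int8 convolution, multiplied by scale[c], lands in the output tensor's int8 domain; and the bias is pre-converted into accumulator units as quantizedBias[c] = bias[c] / (inputScale * quantizedWeightScale[c]). SymmetricQuantizeWeight, shown next, is what produces quantizedWeightScale per channel.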
int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantizedWeight, float* scale,
                            const int channels, float weightClampValue) {
    /** quantize the parameters
     * weight is the weight already multiplied by the input scale,
     * quantizedWeight receives the quantized parameters
     * **/
    DCHECK((size % channels) == 0) << "weight size error!";
    const int channelStride = size / channels;
    const int quantizedMaxValue = weightClampValue; // 127
    for (int c = 0; c < channels; ++c) { // quantize each channel separately
        const auto weightChannelStart = weight + c * channelStride;
        auto quantizedWeightChannelStart = quantizedWeight + c * channelStride;
        // min and max within this channel
        auto minmaxValue = std::minmax_element(weightChannelStart, weightChannelStart + channelStride);
        const float dataAbsMax = std::fmax(std::fabs(*minmaxValue.first), std::fabs(*minmaxValue.second));
        float scaleDataToInt8 = 1.0f;
        if (dataAbsMax == 0) {
            scale[c] = 0.0f;
        } else {
            // the scale used later for dequantization
            scale[c] = dataAbsMax / quantizedMaxValue;
            // the scale that maps the data onto the int8 range
            scaleDataToInt8 = quantizedMaxValue / dataAbsMax;
        }
        for (int i = 0; i < channelStride; ++i) {
            // after mapping the weights onto int8 via the scale, anything outside [-127, 127] is clamped to -127 or 127
            const int32_t quantizedInt8Value = static_cast<int32_t>(roundf(weightChannelStart[i] * scaleDataToInt8));
            quantizedWeightChannelStart[i] =
                std::min(quantizedMaxValue, std::max(-quantizedMaxValue, quantizedInt8Value));
        }
    }
    return 0;
}
What remains is to serialize the weights and the other information and save them to the output file. The overall flow is shown in the figure below: