Reference: https://zhuanlan.zhihu.com/p/153562409?from_voters_page=true
Using the MNN Quantization Tool
Build
cd MNN
mkdir build
cd build
cmake -DMNN_BUILD_QUANTOOLS=ON ..
make -j4
Usage
./quantized.out origin.mnn quantized.mnn ModelConfig.json
Alternatively, install MNN via pip and use the Python entry point:
pip install mnn
mnnquant origin.mnn quantized.mnn ModelConfig.json
ModelConfig.json format
{
"format":"GRAY",
"mean":[
0
],
"normal":[
0.00784314
],
"width":28,
"height":28,
"path":"/mldb/dataset/MNIST/test_data/8",
"used_image_num":100,
"feature_quantize_method":"KL",
"weight_quantize_method":"MAX_ABS"
}
format
Images are always read as RGBA and then converted to the format given here. Options: "RGB", "BGR", "RGBA", "GRAY".
mean, normal
The mean and normal values used by the model's preprocessing. Each input value is computed from the raw pixel as: dst = (src - mean) * normal
width, height
The input width and height of the model.
path
Directory of the images used to calibrate the feature quantization scales.
used_image_num
Specifies how many images under that directory are used for calibration; by default all images under path are used.
Note: make sure the data produced by the preprocessing steps above is exactly what the model's input expects.
feature_quantize_method
Specifies how the feature quantization scales are computed. Options:
"KL": calibrate the feature quantization scales with KL divergence; typically needs 100 to 1000 images
"ADMM": calibrate the feature quantization scales with ADMM (Alternating Direction Method of Multipliers); typically needs one batch of data
Default: "KL"
weight_quantize_method
Specifies the weight quantization method. Options:
"MAX_ABS": symmetric quantization using the maximum absolute value of the weights
"ADMM": quantize the weights with ADMM
Default: "MAX_ABS"
The feature and weight quantization methods above can be combined and tested in several ways; keep whichever combination works best.
Source Code Walkthrough
Main function: quantized.cpp
int main(int argc, const char* argv[]) {
if (argc < 4) {
DLOG(INFO) << "Usage: ./quantized.out src.mnn dst.mnn preTreatConfig.json\n";
return 0;
}
const char* modelFile = argv[1];
const char* preTreatConfig = argv[3];
const char* dstFile = argv[2];
DLOG(INFO) << ">>> modelFile: " << modelFile;
DLOG(INFO) << ">>> preTreatConfig: " << preTreatConfig;
DLOG(INFO) << ">>> dstFile: " << dstFile
std::unique_ptr<MNN::NetT> netT;
{// 读取原始的model文件, 借助于flattbuffer生成Net对象
std::ifstream input(modelFile);
std::ostringstream outputOs;
outputOs << input.rdbuf();
netT = MNN::UnPackNet(outputOs.str().c_str()); //获取Net对象
}
// temp build net for inference
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = MNN::Net::Pack(builder, netT.get()); // pack the model into the builder's buffer
builder.Finish(offset);
int size = builder.GetSize();
auto ocontent = builder.GetBufferPointer();
// create two buffers, both holding the model data
std::unique_ptr<uint8_t> modelForInference(new uint8_t[size]);
memcpy(modelForInference.get(), ocontent, size);
std::unique_ptr<uint8_t> modelOriginal(new uint8_t[size]);
memcpy(modelOriginal.get(), ocontent, size);
netT.reset();
netT = MNN::UnPackNet(modelOriginal.get());
// run quantization; the work is driven by the Calibration class
DLOG(INFO) << "Calibrate the feature and quantize model...";
std::shared_ptr<Calibration> calibration(
new Calibration(netT.get(), modelForInference.get(), size, preTreatConfig));
calibration->runQuantizeModel();
DLOG(INFO) << "Quantize model done!";
// write the quantized model into a FlatBufferBuilder
flatbuffers::FlatBufferBuilder builderOutput(1024);
builderOutput.ForceDefaults(true);
auto len = MNN::Net::Pack(builderOutput, netT.get());
builderOutput.Finish(len);
// write the FlatBufferBuilder contents to a file to obtain the quantized model
{
std::ofstream output(dstFile);
output.write((const char*)builderOutput.GetBufferPointer(), builderOutput.GetSize());
}
}
Calibration class
The core class of MNN quantization: it performs both the weight quantization and the feature quantization.
Calibration.hpp
class Calibration {
public:
// arguments: the original model, the model's uint8_t buffer, its size, and the JSON config path
Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath);
void runQuantizeModel();
private:
Calibration();
MNN::NetT* _originaleModel; // the model to be quantized
std::shared_ptr<MNN::CV::ImageProcess> _process; // converts input images into tensors
const int _binNums = 2048;
int _imageNum = 0;
int _width;
int _height;
std::vector<std::string> _imgaes; // images used to calibrate the feature quantization scales
// Tensor and Info
// maps each tensor to its TensorStatistic; a TensorStatistic holds the statistics
// the tensor needs during quantization (explained below)
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
// all tensors
std::map<int, const MNN::Tensor*> _tensorMap;
// Op's name, Inputs, Outputs
// maps an op to its input/output tensors
std::map<std::string, std::pair<std::vector<MNN::Tensor*>, std::vector<MNN::Tensor*>>> _opInfo;
// The scale results
std::map<const MNN::Tensor*, std::vector<float>> _scales;
std::shared_ptr<MNN::Interpreter> _interpreter;
// keep mnn forward information
MNN::Session* _session;
MNN::Tensor* _inputTensor;
std::vector<int> _inputTensorDims;
std::string _featureQuantizeMethod = "KL";
std::string _weightQuantizeMethod = "MAX_ABS";
void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels);
void _initMaps();
void _computeFeatureMapsRange();
void _collectFeatureMapsDistribution();
void _computeFeatureScaleKL();
void _computeFeatureScaleADMM();
void _updateScale();
// insert a dequantization op before each op that does not support int8, and after
// each output op, so that the original float data can be retrieved conveniently
void _insertDequantize();
};
#endif // CALIBRATION_HPP
Calibration constructor
Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
: _originaleModel(model) {
// when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
int channles = 3;
// parse the JSON config
rapidjson::Document document;
{
std::ifstream fileNames(configPath.c_str());
std::ostringstream output;
output << fileNames.rdbuf();
auto outputStr = output.str();
document.Parse(outputStr.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid json\n");
return;
}
}
auto picObj = document.GetObject();
// build the ImageProcess::Config object from the JSON contents
ImageProcess::Config config;
config.filterType = BILINEAR;
config.destFormat = BGR;
{
if (picObj.HasMember("format")) {
auto format = picObj["format"].GetString();
static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
if (formatMap.find(format) != formatMap.end()) {
config.destFormat = formatMap.find(format)->second;
}
}
}
if (config.destFormat == GRAY) {
channles = 1;
}
config.sourceFormat = RGBA;
std::string imagePath;
_imageNum = 0;
{
if (picObj.HasMember("mean")) {
auto mean = picObj["mean"].GetArray();
int cur = 0;
for (auto iter = mean.begin(); iter != mean.end(); iter++) {
config.mean[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("normal")) {
auto normal = picObj["normal"].GetArray();
int cur = 0;
for (auto iter = normal.begin(); iter != normal.end(); iter++) {
config.normal[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("width")) {
_width = picObj["width"].GetInt();
}
if (picObj.HasMember("height")) {
_height = picObj["height"].GetInt();
}
if (picObj.HasMember("path")) {
imagePath = picObj["path"].GetString();
}
if (picObj.HasMember("used_image_num")) {
_imageNum = picObj["used_image_num"].GetInt();
}
if (picObj.HasMember("feature_quantize_method")) {
std::string method = picObj["feature_quantize_method"].GetString();
if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
_featureQuantizeMethod = method;
} else {
MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
return;
}
}
if (picObj.HasMember("weight_quantize_method")) {
std::string method = picObj["weight_quantize_method"].GetString();
if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
_weightQuantizeMethod = method;
} else {
MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
return;
}
}
DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
}
std::shared_ptr<ImageProcess> process(ImageProcess::create(config)); // create the ImageProcess object
_process = process;
// read the image file names
Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
_initMNNSession(modelBuffer, bufferSize, channles);
_initMaps();
}
_initMNNSession function
Initializes the interpreter and session so the model is ready for inference.
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
_interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
MNN::ScheduleConfig config;
_session = _interpreter->createSession(config);
_inputTensor = _interpreter->getSessionInput(_session, NULL);
_inputTensorDims.resize(4);
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = _height;
_inputTensorDims[2] = _width;
_inputTensorDims[3] = channels;
} else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = channels;
_inputTensorDims[2] = _height;
_inputTensorDims[3] = _width;
} else {
DLOG(ERROR) << "Input Data Format ERROR!";
}
if (_featureQuantizeMethod == "KL") {
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
} else if (_featureQuantizeMethod == "ADMM") {
DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
_inputTensorDims[0] = _imageNum;
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
}
_interpreter->releaseModel();
}
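As a concrete example of the resize logic above: with the MNIST config shown earlier (28×28 GRAY input, so channels = 1), an NHWC input tensor is resized to [1, 28, 28, 1] and an NC4HW4 (caffe-layout) input to [1, 1, 28, 28]. Under "ADMM" the batch dimension is then replaced by used_image_num, since all calibration images are fed in a single batch; that is what the INT_MAX check above guards against.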
_initMaps function
Defines the per-op tensor callbacks, which create a TensorStatistic object for each input and output tensor; then walks the ops and records their inputs and outputs into the tensor map.
void Calibration::_initMaps() {
_featureInfo.clear();
_opInfo.clear();
_tensorMap.clear();
// run mnn once, initialize featureMap, opInfo map
// MNN provides per-op compute callbacks, one invoked before and one after each op runs
// the "before" callback creates TensorStatistic objects for the op's input tensors and records the op -> input tensors mapping in _opInfo
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
_opInfo[info->name()].first = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] = std::shared_ptr<TensorStatistic>(
new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
}
}
}
return false;
};
// the "after" callback creates TensorStatistic objects for the op's output tensors and records the op -> output tensors mapping in _opInfo
MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
_opInfo[info->name()].second = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] =
std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
// walk the ops and record <input/output index, input/output tensor> into _tensorMap
for (auto& op : _originaleModel->oplists) {
if (_opInfo.find(op->name) == _opInfo.end()) {
continue;
}
for (int i = 0; i < op->inputIndexes.size(); ++i) {
_tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
}
for (int i = 0; i < op->outputIndexes.size(); ++i) {
_tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
}
}
if (_featureQuantizeMethod == "KL") {
// set the tensor-statistic method of input tensor as THRESHOLD_MAX
auto inputTensorStatistic = _featureInfo.find(_inputTensor);
if (inputTensorStatistic != _featureInfo.end()) {
inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
}
}
}
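As a concrete example of the maps built here: if an op named "conv1" (a hypothetical name) has inputIndexes = [0] and outputIndexes = [1], the callback pass fills _opInfo["conv1"] with its live input/output tensor pointers, and the loop above then sets _tensorMap[0] and _tensorMap[1] to those pointers. _tensorMap therefore maps every tensor index in the model to the corresponding Tensor* of the inference session.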
TensorStatistic class
The Calibration instance holds the map
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
so every tensor has a TensorStatistic object associated with it; the statistics gathered for that tensor during quantization are stored there.
TensorStatistic.hpp
class TensorStatistic {
public:
TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, int binNumber = 2048, GET_THRESHOLD_METHOD thresholdMethod = THRESHOLD_KL);
. . .
void updateRange();
void resetDistribution();
void updateDistribution();
void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod);
void setChannelWise(bool mergeChannel);
std::vector<float> finishAndCompute();
// only this one for ADMM
std::vector<float> computeScaleADMM();
private:
int _computeThreshold(const std::vector<float>& distribution);
std::vector<std::pair<float, float>> mRangePerChannel; // per-channel (min, max) of this tensor
// let maxValue denote max(|min|, |max|) on a channel
std::vector<float> mIntervals; // per-channel interval = mBinNumber / maxValue
std::vector<bool> mValidChannel; // true if the channel's maxValue > 0.00001f
std::vector<std::vector<float>> mDistribution; // per channel: the float data is uniformly mapped into [0, mBinNumber) and a histogram of bin counts is kept
std::shared_ptr<MNN::Tensor> mHostTensor; // host (CPU) copy of the tensor
const MNN::Tensor* mOriginTensor; // the original tensor
int mBinNumber; // 2048 by default
bool mUpdatedDistributionFlag = false;
bool mUpdatedRangeFlags = false;
bool mMergeChannel = true;
std::string mName;
GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL;
};
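To make the interval/bin bookkeeping concrete, here is a small standalone sketch of the mapping (plain C++, not MNN code; the constants are made up for illustration). It mirrors the arithmetic that resetDistribution and updateDistribution perform below:
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int binNumber = 2048;                  // mBinNumber
    const float maxValue = 6.3f;                 // max(|min|, |max|) seen on one channel
    const float interval = binNumber / maxValue; // mIntervals[c] = 2048 / 6.3 ≈ 325.1
    // a feature value x falls into histogram bin |x| * interval, clamped to the last bin
    for (float x : {0.001f, 3.15f, 6.3f, 100.0f}) {
        int index = std::min(static_cast<int>(std::fabs(x) * interval), binNumber - 1);
        std::printf("x = %7.3f -> bin %d\n", x, index);
    }
    return 0;
}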
runQuantizeModel function
Dispatches to one of the two feature quantization methods, KL or ADMM, then updates the scales and inserts the dequantization ops.
void Calibration::runQuantizeModel() {
if (_featureQuantizeMethod == "KL") { // 如果配置文件里是 KL散度做
_computeFeatureScaleKL();
} else if (_featureQuantizeMethod == "ADMM") { // ADMM selected in the config
_computeFeatureScaleADMM();
}
_updateScale();
_insertDequantize();
}
_computeFeatureScaleKL function
void Calibration::_computeFeatureScaleKL() {
_computeFeatureMapsRange(); // compute the data range of each feature map
_collectFeatureMapsDistribution(); // compute the data distribution of each feature map
_scales.clear();
for (auto& iter : _featureInfo) {
AUTOTIME;
_scales[iter.first] = iter.second->finishAndCompute(); // the scale factors
}
//_featureInfo.clear();//No need now
}
_computeFeatureMapsRange function
Runs inference on the specified image set; before and after each op executes, the per-channel minimum and maximum of each feature map are updated in the tensor's associated TensorStatistic instance. updateRange itself is implemented in TensorStatistic.cpp.
void Calibration::_computeFeatureMapsRange() {
// feed input data according to input images
int count = 0;
for (const auto& img : _imgaes) { // for each image file
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedRangeFlags();
}
count++;
// read the image into the input tensor
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
// set callbacks that run before and after each op during inference
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange(); // track the min/max of the input tensor
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange(); // track the min/max of the output tensor
}
}
return true;
};
// run one inference pass
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
updateRange function
Tracks the per-channel minimum and maximum of the feature data.
void TensorStatistic::updateRange() {
if (mUpdatedRangeFlags) {
return;
}
mUpdatedRangeFlags = true;
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n) {
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
for (int c = 0; c < channel; ++c) {
int cIndex = c;
if (mMergeChannel) {
cIndex = 0;
}
auto minValue = mRangePerChannel[cIndex].first;
auto maxValue = mRangePerChannel[cIndex].second;
auto dataChannel = dataBatch + c * mHostTensor->stride(1);
for (int v = 0; v < area; ++v) {
minValue = std::min(minValue, dataChannel[v]);
maxValue = std::max(maxValue, dataChannel[v]);
}
mRangePerChannel[cIndex].first = minValue;
mRangePerChannel[cIndex].second = maxValue;
}
}
}
_collectFeatureMapsDistribution function
Computes the data distribution of every tensor; resetDistribution and updateDistribution are implemented in TensorStatistic.cpp.
void Calibration::_collectFeatureMapsDistribution() {
for (auto& iter : _featureInfo) {
iter.second->resetDistribution(); // clear the distribution bookkeeping in each TensorStatistic instance
}
// define two callbacks, run before and after each op executes
// feed input data according to input images
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution(); // update the distribution for this input tensor's TensorStatistic
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution(); // update the distribution for this output tensor's TensorStatistic
}
}
return true;
};
int count = 0;
for (const auto& img : _imgaes) { // run one inference pass for every image
count++;
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedDistributionFlag();
}
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); // read the image into the input tensor
_interpreter->runSessionWithCallBackInfo(_session, before, after); // run inference
MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
resetDistribution and updateDistribution functions
void TensorStatistic::resetDistribution() { // reset the distribution
for (int i = 0; i < mIntervals.size(); ++i) { // process each channel
int cIndex = i;
if (mMergeChannel) {
cIndex = 0;
}
// maxValue is the largest absolute value of the data seen on this channel
auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
mValidChannel[cIndex] = maxValue > 0.00001f; // the channel counts only if maxValue is large enough
mIntervals[cIndex] = 0.0f;
if (mValidChannel[cIndex]) {
// mIntervals is the scale that uniformly maps the original floats onto [0, mBinNumber)
mIntervals[cIndex] = (float)mBinNumber / maxValue;
}
}
for (auto& c : mDistribution) {
std::fill(c.begin(), c.end(), 1.0e-07); // initialize mDistribution to a tiny value close to zero
}
}
void TensorStatistic::updateDistribution() {
if (mUpdatedDistributionFlag) {
return;
}
mUpdatedDistributionFlag = true;
// fetch the tensor's data and shape
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n) {
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0); // data of this batch
for (int c = 0; c < channel; ++c) { // process each channel
int cIndex = c;
if (mMergeChannel) {
cIndex = 0;
}
if (!mValidChannel[cIndex]) {
continue;
}
auto multi = mIntervals[cIndex]; // the interval (scale) for this channel
auto target = mDistribution[cIndex].data(); // this channel's histogram
auto dataChannel = dataBatch + c * mHostTensor->stride(1); // this channel's feature map
for (int v = 0; v < area; ++v) {
auto data = dataChannel[v]; // data is one value of the feature map
if (data == 0) {
continue;
}
int index = static_cast<int>(fabs(data) * multi); // uniformly map the value to an integer bin index
index = std::min(index, mBinNumber - 1); // clamp to at most mBinNumber - 1
target[index] += 1.0f; // accumulate into mDistribution
}
}
}
}
finishAndCompute function
Computes the scale parameters that map float32 to int8.
std::vector<float> TensorStatistic::finishAndCompute() {
std::vector<float> scaleValue(mDistribution.size(), 0.0f);
. . .
for (int c = 0; c < mDistribution.size(); ++c) { // for each channel
if (!mValidChannel[c]) {
continue;
}
float sum = 0.0f;
auto& distribution = mDistribution[c];
std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); // sum
std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); // divide by the sum: normalize to a probability distribution
auto threshold = _computeThreshold(distribution); // compute the clipping threshold
scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / 127.0; // compute the scale factor
}
return scaleValue;
}
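A quick worked example of the last line: if maxValue on a channel is 6.3, the interval is 2048 / 6.3 ≈ 325.1. If _computeThreshold returns 1024, then scale = (1024 + 0.5) / 325.1 / 127 ≈ 0.0248, so int8 code 127 represents the float value 127 × 0.0248 ≈ 3.15; everything beyond that threshold (here roughly half of the observed range) is clipped during quantization.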
_computeThreshold function
Searches for a threshold between targetBinNums and mBinNumber that minimizes the KL divergence.
int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
const int targetBinNums = 128;
int threshold = targetBinNums; // defaults to 128
if (mThresholdMethod == THRESHOLD_KL) { // KL-divergence-based search
float minKLDivergence = 10000.0f;
float afterThresholdSum = 0.0f;
//targetBinNums=128, mBinNumber=2048
std::for_each(distribution.begin() + targetBinNums, distribution.end(),
[&](float n) { afterThresholdSum += n; });
for (int i = targetBinNums; i < mBinNumber; ++i) { // scan candidate thresholds i from 128 to 2047
std::vector<float> quantizedDistribution(targetBinNums);
std::vector<float> candidateDistribution(i);
std::vector<float> expandedDistribution(i);
// candidateDistribution keeps bins [0, i) and folds the mass of [i, mBinNumber) into the last bin i-1
std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
candidateDistribution[i - 1] += afterThresholdSum;
afterThresholdSum -= distribution[i];
// the bin width when a space of size i is shrunk down to targetBinNums bins
const float binInterval = (float)i / (float)targetBinNums;
// merge i bins into targetBinNums bins (j runs from 0 to 127)
// map the distribution over [0, i) into the smaller space [0, targetBinNums); the (lossy) result goes into quantizedDistribution
for (int j = 0; j < targetBinNums; ++j) {
// bin [j, j+1) in the targetBinNums space
// [start, end] is the corresponding range in the i-bin space
const float start = j * binInterval;
const float end = start + binInterval;
// round up
const int leftUpper = static_cast<int>(std::ceil(start));
if (leftUpper > start) {
const float leftScale = leftUpper - start;
quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
} // round down
const int rightLower = static_cast<int>(std::floor(end));
if (rightLower < end) {
const float rightScale = end - rightLower;
quantizedDistribution[j] += rightScale * distribution[rightLower];
}
// add the fully covered bins, completing the distribution over [0, targetBinNums)
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
[&](float n) { quantizedDistribution[j] += n; });
}
// expand the targetBinNums bins back onto the i-bin space
// the result goes into expandedDistribution
for (int j = 0; j < targetBinNums; ++j) {
const float start = j * binInterval;
const float end = start + binInterval;
float count = 0;
const int leftUpper = static_cast<int>(std::ceil(start));
float leftScale = 0.0f;
if (leftUpper > start) {
leftScale = leftUpper - start;
if (distribution[leftUpper - 1] != 0) {
count += leftScale;
}
}
const int rightLower = static_cast<int>(std::floor(end));
float rightScale = 0.0f;
if (rightLower < end) {
rightScale = end - rightLower;
if (distribution[rightLower] != 0) {
count += rightScale;
}
}
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
if (n != 0) {
count += 1;
}
});
if (count == 0) {
continue;
}
const float toExpandValue = quantizedDistribution[j] / count;
if (leftUpper > start && distribution[leftUpper - 1] != 0) {
expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
}
if (rightLower < end && distribution[rightLower] != 0) {
expandedDistribution[rightLower] += toExpandValue * rightScale;
}
for (int k = leftUpper; k < rightLower; ++k) {
if (distribution[k] != 0) {
expandedDistribution[k] += toExpandValue;
}
}
}
// KL divergence formula: Sum(P[i] * log(P[i] / Q[i]))
const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
if (curKL < minKLDivergence) { // keep the smallest KL divergence and the threshold that achieves it
minKLDivergence = curKL;
threshold = i;
}
}
} else if (mThresholdMethod == THRESHOLD_MAX) {
threshold = mBinNumber - 1;
} else {
// TODO, support other method
MNN_ASSERT(false);
}
return threshold;
}
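_klDivergence itself is not shown in this excerpt; going only by the formula quoted in the comment above, a minimal sketch could look like the following (the zero-handling details of the real implementation may differ):
#include <cmath>
#include <vector>

static float _klDivergence(const std::vector<float>& candidate, const std::vector<float>& expanded) {
    // Sum(P[i] * log(P[i] / Q[i])): P is the candidate (reference) distribution,
    // Q the merged-then-expanded one; bins where either side is zero are skipped
    float result = 0.0f;
    const int size = static_cast<int>(candidate.size());
    for (int i = 0; i < size; ++i) {
        if (candidate[i] != 0.0f && expanded[i] != 0.0f) {
            result += candidate[i] * std::log(candidate[i] / expanded[i]);
        }
    }
    return result;
}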
Weight quantization: the _updateScale function
Quantizes the weights and rewrites each quantizable op into its int8 form.
void Calibration::_updateScale() {
for (const auto& op : _originaleModel->oplists) {
const auto opType = op->type;
// only Convolution, ConvolutionDepthwise and Eltwise ops are handled
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Eltwise) {
continue;
}
auto tensorsPair = _opInfo.find(op->name);
if (tensorsPair == _opInfo.end()) {
MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
}
if (opType == MNN::OpType_Eltwise) { // Eltwise op
auto param = op->main.AsEltwise();
// Now only support AddInt8
if (param->type != MNN::EltwiseType_SUM) {
continue;
}
// fetch the scales computed earlier
const auto& inputScale0 = _scales[tensorsPair->second.first[0]];
const auto& inputScale1 = _scales[tensorsPair->second.first[1]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
const int outputScaleSize = outputScale.size();
std::vector<float> outputInvertScale(outputScaleSize);
Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
op->type = MNN::OpType_EltwiseInt8; // switch the op type to the quantized version
op->main.Reset(); // rebuild the op's parameters from scratch
op->main.type = MNN::OpParameter_EltwiseInt8;
auto eltwiseInt8Param = new MNN::EltwiseInt8T;
auto input0ScaleParam = new MNN::QuantizedFloatParamT;
auto input1ScaleParam = new MNN::QuantizedFloatParamT;
auto outputScaleParam = new MNN::QuantizedFloatParamT;
input0ScaleParam->tensorScale = inputScale0;
input1ScaleParam->tensorScale = inputScale1;
outputScaleParam->tensorScale = outputInvertScale;
// the scale parameters needed to recover float32 from int8
eltwiseInt8Param->inputQuan0 = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
eltwiseInt8Param->inputQuan1 = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
eltwiseInt8Param->outputQuan = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
op->main.value = eltwiseInt8Param;
continue;
}
// below is Conv/DepthwiseConv
const auto& inputScale = _scales[tensorsPair->second.first[0]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
auto param = op->main.AsConvolution2D();
param->common->inputCount = tensorsPair->second.first[0]->channel();
const int channles = param->common->outputCount;
const int weightSize = param->weight.size();
param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); // reset the parameters
auto& quantizedParam = param->symmetricQuan; // then build the int8 parameters
quantizedParam->scale.resize(channles);
quantizedParam->weight.resize(weightSize);
quantizedParam->bias.resize(channles);
// weight and bias are quantized separately for conv and depthwise conv
if (opType == MNN::OpType_Convolution) {
QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_ConvInt8;
} else if (opType == MNN::OpType_ConvolutionDepthwise) {
QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_DepthwiseConvInt8;
}
if (param->common->relu6) {
param->common->relu = true;
param->common->relu6 = false;
} // with the int8 parameters in place, clear the original float parameters
param->weight.clear();
param->bias.clear();
}
}
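Helper::invertData, used above for the Eltwise output scale and again in _insertDequantize below, is not listed in this excerpt; judging from its call sites it inverts a scale vector element-wise, roughly like this sketch (the real helper may differ in details):
// dst[i] = 1 / src[i]; zero scales are kept as zero to avoid dividing by zero
static void invertData(float* dst, const float* src, int size) {
    for (int i = 0; i < size; ++i) {
        dst[i] = (src[i] == 0.0f) ? 0.0f : 1.0f / src[i];
    }
}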
QuantizeConvPerChannel function
int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight,
int32_t* quantizedBias, float* scale, const std::vector<float>& inputScale,
const std::vector<float>& outputScale, std::string method, bool mergeChannel) {
const int inputChannels = inputScale.size();
const int outputChannels = outputScale.size();
const int icXoc = inputChannels * outputChannels;
DCHECK(size % icXoc == 0) << "Input Data Size Error!";
std::vector<float> quantizedWeightScale(outputChannels);
float inputScalexWeight = 1.0f;
if (mergeChannel) {
if (method == "MAX_ABS"){
SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
inputScalexWeight = inputScale[0];
} else {
const int kernelSize = size / icXoc;
const int ocStride = size / outputChannels;
std::vector<float> weightMultiByInputScale(size);
for (int oc = 0; oc < outputChannels; ++oc) {
for (int ic = 0; ic < inputChannels; ++ic) {
for (int i = 0; i < kernelSize; ++i) {
const int index = oc * ocStride + ic * kernelSize + i;
weightMultiByInputScale[index] = inputScale[ic] * weight[index];
}
}
}
if (method == "MAX_ABS"){
SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
}
for (int i = 0; i < outputChannels; ++i) {
if (outputScale[i] == 0) {
scale[i] = 0.0f;
} else {
scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale[0];
}
}
if (bias) {
for (int i = 0; i < outputChannels; ++i) {
if (inputScalexWeight == 0 || quantizedWeightScale[i] == 0) {
quantizedBias[i] = 0;
} else {
quantizedBias[i] = static_cast<int32_t>(bias[i] / (inputScalexWeight * quantizedWeightScale[i]));
}
}
}
return 0;
}
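SymmetricQuantizeWeight, called above, is the "MAX_ABS" path described in the config section: per output channel, find the largest absolute weight, derive a scale so that it maps to the int8 bound 127, and round. A minimal sketch of that idea (not the actual MNN implementation, which may differ in details):
#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric per-channel weight quantization: `weight` holds `channels` equal-sized
// blocks; within each block, scale so that max|w| maps to 127.
static void symmetricQuantizeWeightSketch(const float* weight, int size, int8_t* quantizedWeight,
                                          float* scale, int channels) {
    const int stride = size / channels;
    for (int c = 0; c < channels; ++c) {
        const float* w = weight + c * stride;
        float maxAbs = 0.0f;
        for (int i = 0; i < stride; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(w[i]));
        }
        scale[c] = maxAbs / 127.0f; // float value represented by int8 code 1
        for (int i = 0; i < stride; ++i) {
            const float q = (scale[c] == 0.0f) ? 0.0f : std::roundf(w[i] / scale[c]);
            quantizedWeight[c * stride + i] = static_cast<int8_t>(std::fmin(127.0f, std::fmax(-127.0f, q)));
        }
    }
}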
QuantizeDepthwiseConv function
int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight,
int32_t* quantizedBias, float* scale, const std::vector<float>& inputScale,
const std::vector<float>& outputScale, std::string method) {
const int inputChannels = inputScale.size();
const int outputChannels = outputScale.size();
DCHECK(inputChannels == outputChannels) << "Input Data Size Error!";
std::vector<float> quantizedWeightScale(inputChannels);
if (method == "MAX_ABS") {
SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels);
}
for (int c = 0; c < inputChannels; ++c) {
const int index = c;
if (outputScale[c] == 0) {
scale[index] = 0.0f;
} else {
scale[index] = inputScale[c] * quantizedWeightScale[c] / outputScale[c];
}
}
if (bias) {
for (int i = 0; i < outputChannels; ++i) {
if (inputScale[i] == 0 || quantizedWeightScale[i] == 0) {
quantizedBias[i] = 0;
} else {
quantizedBias[i] = static_cast<int32_t>(bias[i] / (inputScale[i] * quantizedWeightScale[i]));
}
}
}
return 0;
}
Adding dequantization: the _insertDequantize function
Inserts the conversion ops: ops that do not support int8 need conversion layers around them. A quantized op outputs integers, but a float op that consumes that output needs float input, so a dequantization step must sit in between.
Roughly, the procedure is:
1. Walk the ops and collect the input/output tensors of every quantized op.
2. Walk the ops again and, for every op that cannot be quantized:
if one of its input tensors is produced by a quantizable op, insert a dequantization (Int8ToFloat) op before that input;
if one of its output tensors feeds a quantizable op, insert a quantization (FloatToInt8) op after that output.
3. Append a dequantization op after each of the graph's output tensors, so that the final results are float.
void Calibration::_insertDequantize() {
// Search All Int Tensors
std::set<int> int8Tensors;
std::set<int> int8Outputs;
for (auto& op : _originaleModel->oplists) { // walk all ops
if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) { // if the op is int8-quantized, record its inputs and outputs
for (auto index : op->inputIndexes) { // record its inputs
int8Tensors.insert(index);
}
for (auto index : op->outputIndexes) { // record its outputs
int8Tensors.insert(index);
int8Outputs.insert(index);
}
}
}
for (auto& op : _originaleModel->oplists) { // drop every tensor that some op consumes, leaving only true graph outputs
for (auto index : op->inputIndexes) {
auto iter = int8Outputs.find(index);
if (iter != int8Outputs.end()) {
int8Outputs.erase(iter);
}
}
}
// Insert converts for ops that do not support int8: conversion layers go before/after such ops
for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
auto op = iter->get();
const auto opType = op->type;
const auto name = op->name;
// check whether is output op
// if yes, insert a dequantization op after this op; ops that do support int8 are skipped
if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
// this is quantized op
iter++;
continue;
}
auto& inputIndexes = op->inputIndexes;
const int inputSize = inputIndexes.size();
// insert dequantization op before this op
for (int i = 0; i < inputSize; ++i) { // for each input tensor of this op
const auto curInputIndex = inputIndexes[i];
if (int8Tensors.find(curInputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
continue;
}
auto input = _tensorMap[curInputIndex];
auto inputOpScale = _scales[input];
// construct a new op that converts int8 back to float
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);
// fill in the parameters
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = inputOpScale;
dequantizationOp->inputIndexes.push_back(curInputIndex);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
_originaleModel->tensorName.push_back(dequantizationOp->name);
// reset current op's input index at i: wire the new Int8ToFloat op's output into this op's input
inputIndexes[i] = dequantizationOp->outputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
iter++;
}
iter++;
// LOG(INFO) << "insert quantization op after this op if neccessary";
// insert quantization op after this op if neccessary
// for each output tensor of this op
for (int i = 0; i < op->outputIndexes.size(); ++i) {
const auto outputIndex = op->outputIndexes[i];
if (int8Tensors.find(outputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
continue;
}
auto output = _tensorMap[outputIndex];
auto curScale = _scales[output];
// construct one quantization op(FloatToInt8)
// create the quantization (FloatToInt8) op
auto quantizationOp = new MNN::OpT;
quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
quantizationOp->type = MNN::OpType_FloatToInt8;
auto quantizationParam = new MNN::QuantizedFloatParamT;
quantizationOp->main.value = quantizationParam;
// fill in the parameters
const int channels = curScale.size();
std::vector<float> quantizationScale(channels);
Helper::invertData(quantizationScale.data(), curScale.data(), channels);
quantizationParam->tensorScale = quantizationScale;
// wire the quantization op in after this op
quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
quantizationOp->outputIndexes.push_back(outputIndex);
_originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
_originaleModel->tensorName[outputIndex] = quantizationOp->name;
op->outputIndexes[i] = quantizationOp->inputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
iter++;
}
}
// Insert Turn float Op for output
for (auto index : int8Outputs) { // append dequantization after each graph output tensor: the final outputs must be float
// construct new op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = _scales[_tensorMap[index]];
dequantizationOp->inputIndexes.push_back(index);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
auto originTensorName = _originaleModel->tensorName[index];
_originaleModel->tensorName[index] = dequantizationOp->name;
_originaleModel->tensorName.emplace_back(originTensorName);
_originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
}
}
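Schematically, suppose a (hypothetical) graph conv → softmax, where conv was converted to ConvInt8 but softmax stays float. The loop above rewires it to:
conv (ConvInt8) → ___Int8ToFloat___For_softmax0 → softmax (float)
and if softmax in turn fed another quantized op, a FloatToInt8 op would be spliced onto its output the same way. Finally, every int8 tensor that is also a graph output gets a trailing ___Int8ToFloat___ op so callers receive floats.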
_computeFeatureScaleADMM function
The theory behind ADMM is explained here: https://zhuanlan.zhihu.com/p/81243626
Computes the feature scales with ADMM; computeScaleADMM is implemented in TensorStatistic.cpp.
void Calibration::_computeFeatureScaleADMM() {
// feed input data according to input images
int count = 0;
std::vector<int> oneImageTensorDims = _inputTensorDims;
oneImageTensorDims[0] = 1;
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
auto dimType = MNN::Tensor::CAFFE_C4;
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
dimType = MNN::Tensor::TENSORFLOW;
}
for (const auto& img : _imgaes) {
auto curPtr = _inputTensor->host<float>() + count * _inputTensor->stride(0);
std::shared_ptr<MNN::Tensor> tensorWarp(
MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType));
Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get());
count++;
MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
_scales.clear();
const int totalLayers = _featureInfo.size();
count = 0;
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\n");
}
computeScaleADMM function
std::vector<float> TensorStatistic::computeScaleADMM() {
std::vector<float> scaleValue(mOriginTensor->channel(), 0.0f);
const int count = mOriginTensor->elementSize();
float max = 0;
const float bound = 127;
const float* originData = mOriginTensor->host<float>();
for (int i = 0; i < count; i++) {
float absData = std::fabs(originData[i]);
if (absData > max) {
max = absData;
}
}
float alpha = max / (bound * 2.5);
// DLOG(INFO) << "alpha init: " << alpha;
const int maxStep = 300;
float sum1 = 0;
float sum2 = 0;
float invAlpha;
for (int i = 0; i < maxStep; i++) {
sum1 = 0;
sum2 = 0;
invAlpha = 1 / alpha;
for (int i = 0; i < count; i++) {
auto origin = originData[i];
auto dataQuant = std::roundf(origin * invAlpha);
dataQuant = std::fmin(bound, std::fmax(-bound, dataQuant));
sum1 += (dataQuant * origin);
sum2 += (dataQuant * dataQuant);
}
alpha = sum1 / sum2;
}
// DLOG(INFO) << "alpha final: " << alpha;
std::fill(scaleValue.begin(), scaleValue.end(), alpha);
return scaleValue;
}
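The loop above is a simple alternating minimization of ||w - α·q||² over the int8 grid: with α fixed, the best codes are q_i = clip(round(w_i / α), -127, 127); with q fixed, the least-squares optimal scale is α = Σ q_i·w_i / Σ q_i², which is exactly what sum1 / sum2 computes. After 300 iterations the resulting α is used as a single scale shared by all channels (scaleValue is filled with the same value everywhere).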