Reference: https://zhuanlan.zhihu.com/p/153562409?from_voters_page=true
Using the MNN Quantization Tool
Build
cd MNN
mkdir build
cd build
cmake -DMNN_BUILD_QUANTOOLS=ON ..
make -j4
Usage
./quantized.out origin.mnn quantized.mnn ModelConfig.json
Alternatively, install MNN via pip and use the Python entry point:
pip install mnn
mnnquant origin.mnn quantized.mnn ModelConfig.json
ModelConfig.json format
{
"format":"GRAY",
"mean":[
0
],
"normal":[
0.00784314
],
"width":28,
"height":28,
"path":"/mldb/dataset/MNIST/test_data/8",
"used_image_num":100,
"feature_quantize_method":"KL",
"weight_quantize_method":"MAX_ABS"
}
format
Images are always read as RGBA and then converted to the format given here. Options: "RGB", "BGR", "RGBA", "GRAY".
mean, normal
The mean and normal values used by the model's preprocessing. Each input value is computed from the raw pixel as: dst = (src - mean) * normal
width, height
The input width and height of the model.
path
Directory of the images used to calibrate the feature quantization scales.
used_image_num
Specifies how many images under that directory are used for calibration; by default all images under path are used.
Note: make sure the data produced by the preprocessing steps above is exactly what the model's input expects.
feature_quantize_method
Specifies how the feature quantization scales are computed. Options:
"KL": calibrate the feature quantization scales with KL divergence; typically needs 100 to 1000 images
"ADMM": calibrate the feature quantization scales with ADMM (Alternating Direction Method of Multipliers); typically needs one batch of data
Default: "KL"
weight_quantize_method
Specifies the weight quantization method. Options:
"MAX_ABS": symmetric quantization using the maximum absolute value of the weights
"ADMM": quantize the weights with ADMM
Default: "MAX_ABS"
The feature and weight quantization methods above can be combined and tested in several ways; keep whichever combination works best.
Source Code Walkthrough
Main function: quantized.cpp
int main(int argc, const char* argv[]) {
if (argc < 4) {
DLOG(INFO) << "Usage: ./quantized.out src.mnn dst.mnn preTreatConfig.json\n";
return 0;
}
const char* modelFile = argv[1];
const char* preTreatConfig = argv[3];
const char* dstFile = argv[2];
DLOG(INFO) << ">>> modelFile: " << modelFile;
DLOG(INFO) << ">>> preTreatConfig: " << preTreatConfig;
DLOG(INFO) << ">>> dstFile: " << dstFile
std::unique_ptr<MNN::NetT> netT;
{// 读取原始的model文件, 借助于flattbuffer生成Net对象
std::ifstream input(modelFile);
std::ostringstream outputOs;
outputOs << input.rdbuf();
netT = MNN::UnPackNet(outputOs.str().c_str()); //获取Net对象
}
// temp build net for inference
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = MNN::Net::Pack(builder, netT.get()); // pack the model into the builder's buffer
builder.Finish(offset);
int size = builder.GetSize();
auto ocontent = builder.GetBufferPointer();
// create two buffers, both holding the model data
std::unique_ptr<uint8_t> modelForInference(new uint8_t[size]);
memcpy(modelForInference.get(), ocontent, size);
std::unique_ptr<uint8_t> modelOriginal(new uint8_t[size]);
memcpy(modelOriginal.get(), ocontent, size);
netT.reset();
netT = MNN::UnPackNet(modelOriginal.get());
// run quantization; the work is driven by the Calibration class
DLOG(INFO) << "Calibrate the feature and quantize model...";
std::shared_ptr<Calibration> calibration(
new Calibration(netT.get(), modelForInference.get(), size, preTreatConfig));
calibration->runQuantizeModel();
DLOG(INFO) << "Quantize model done!";
// write the quantized model into a FlatBufferBuilder
flatbuffers::FlatBufferBuilder builderOutput(1024);
builderOutput.ForceDefaults(true);
auto len = MNN::Net::Pack(builderOutput, netT.get());
builderOutput.Finish(len);
// write the FlatBufferBuilder contents to a file to obtain the quantized model
{
std::ofstream output(dstFile);
output.write((const char*)builderOutput.GetBufferPointer(), builderOutput.GetSize());
}
}
Calibration class
The core class of MNN quantization: it performs both the weight quantization and the feature quantization.
Calibration.hpp
class Calibration {
public:
// arguments: the original model, the model's uint8_t buffer, its size, and the JSON config path
Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath);
void runQuantizeModel();
private:
Calibration();
MNN::NetT* _originaleModel; // the model to be quantized
std::shared_ptr<MNN::CV::ImageProcess> _process; // converts input images into tensors
const int _binNums = 2048;
int _imageNum = 0;
int _width;
int _height;
std::vector<std::string> _imgaes; // images used to calibrate the feature quantization scales
// Tensor and Info
// maps each tensor to its TensorStatistic; a TensorStatistic holds the statistics
// the tensor needs during quantization (explained below)
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
// all tensors
std::map<int, const MNN::Tensor*> _tensorMap;
// Op's name, Inputs, Outputs
// maps an op to its input/output tensors
std::map<std::string, std::pair<std::vector<MNN::Tensor*>, std::vector<MNN::Tensor*>>> _opInfo;
// The scale results
std::map<const MNN::Tensor*, std::vector<float>> _scales;
std::shared_ptr<MNN::Interpreter> _interpreter;
// keep mnn forward information
MNN::Session* _session;
MNN::Tensor* _inputTensor;
std::vector<int> _inputTensorDims;
std::string _featureQuantizeMethod = "KL";
std::string _weightQuantizeMethod = "MAX_ABS";
void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels);
void _initMaps();
void _computeFeatureMapsRange();
void _collectFeatureMapsDistribution();
void _computeFeatureScaleKL();
void _computeFeatureScaleADMM();
void _updateScale();
// insert a dequantization op before each op that does not support int8, and after
// each output op, so that the original float data can be retrieved conveniently
void _insertDequantize();
};
#endif // CALIBRATION_HPP
Calibration constructor
Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
: _originaleModel(model) {
// when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
int channles = 3;
// parse the JSON config
rapidjson::Document document;
{
std::ifstream fileNames(configPath.c_str());
std::ostringstream output;
output << fileNames.rdbuf();
auto outputStr = output.str();
document.Parse(outputStr.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid json\n");
return;
}
}
auto picObj = document.GetObject();
// build the ImageProcess::Config object from the JSON contents
ImageProcess::Config config;
config.filterType = BILINEAR;
config.destFormat = BGR;
{
if (picObj.HasMember("format")) {
auto format = picObj["format"].GetString();
static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
if (formatMap.find(format) != formatMap.end()) {
config.destFormat = formatMap.find(format)->second;
}
}
}
if (config.destFormat == GRAY) {
channles = 1;
}
config.sourceFormat = RGBA;
std::string imagePath;
_imageNum = 0;
{
if (picObj.HasMember("mean")) {
auto mean = picObj["mean"].GetArray();
int cur = 0;
for (auto iter = mean.begin(); iter != mean.end(); iter++) {
config.mean[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("normal")) {
auto normal = picObj["normal"].GetArray();
int cur = 0;
for (auto iter = normal.begin(); iter != normal.end(); iter++) {
config.normal[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("width")) {
_width = picObj["width"].GetInt();
}
if (picObj.HasMember("height")) {
_height = picObj["height"].GetInt();
}
if (picObj.HasMember("path")) {
imagePath = picObj["path"].GetString();
}
if (picObj.HasMember("used_image_num")) {
_imageNum = picObj["used_image_num"].GetInt();
}
if (picObj.HasMember("feature_quantize_method")) {
std::string method = picObj["feature_quantize_method"].GetString();
if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
_featureQuantizeMethod = method;
} else {
MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
return;
}
}
if (picObj.HasMember("weight_quantize_method")) {
std::string method = picObj["weight_quantize_method"].GetString();
if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
_weightQuantizeMethod = method;
} else {
MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
return;
}
}
DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
}
std::shared_ptr<ImageProcess> process(ImageProcess::create(config)); // create the ImageProcess object
_process = process;
// read the image file names
Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
_initMNNSession(modelBuffer, bufferSize, channles);
_initMaps();
}
_initMNNSession function
Initializes the interpreter and session so the model is ready for inference.
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
_interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
MNN::ScheduleConfig config;
_session = _interpreter->createSession(config);
_inputTensor = _interpreter->getSessionInput(_session, NULL);
_inputTensorDims.resize(4);
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = _height;
_inputTensorDims[2] = _width;
_inputTensorDims[3] = channels;
} else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = channels;
_inputTensorDims[2] = _height;
_inputTensorDims[3] = _width;
} else {
DLOG(ERROR) << "Input Data Format ERROR!";
}
if (_featureQuantizeMethod == "KL") {
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
} else if (_featureQuantizeMethod == "ADMM") {
DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
_inputTensorDims[0] = _imageNum;
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
}
_interpreter->releaseModel();
}
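As a concrete example of the resize logic above: with the MNIST config shown earlier (28×28 GRAY input, so channels = 1), an NHWC input tensor is resized to [1, 28, 28, 1] and an NC4HW4 (caffe-layout) input to [1, 1, 28, 28]. Under "ADMM" the batch dimension is then replaced by used_image_num, since all calibration images are fed in a single batch; that is what the INT_MAX check above guards against.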
_initMaps function
Defines the per-op tensor callbacks, which create a TensorStatistic object for each input and output tensor; then walks the ops and records their inputs and outputs into the tensor map.
void Calibration::_initMaps() {
_featureInfo.clear();
_opInfo.clear();
_tensorMap.clear();
// run mnn once, initialize featureMap, opInfo map
// MNN provides per-op compute callbacks, one invoked before and one after each op runs
// the "before" callback creates TensorStatistic objects for the op's input tensors and records the op -> input tensors mapping in _opInfo
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
_opInfo[info->name()].first = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] = std::shared_ptr<TensorStatistic>(
new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
}
}
}
return false;
};
// the "after" callback creates TensorStatistic objects for the op's output tensors and records the op -> output tensors mapping in _opInfo
MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
_opInfo[info->name()].second = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] =
std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
// walk the ops and record <input/output index, input/output tensor> into _tensorMap
for (auto& op : _originaleModel->oplists) {
if (_opInfo.find(op->name) == _opInfo.end()) {
continue;
}
for (int i = 0; i < op->inputIndexes.size(); ++i) {
_tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
}
for (int i = 0; i < op->outputIndexes.size(); ++i) {
_tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
}
}
if (_featureQuantizeMethod == "KL") {
// set the tensor-statistic method of input tensor as THRESHOLD_MAX
auto inputTensorStatistic = _featureInfo.find(_inputTensor);
if (inputTensorStatistic != _featureInfo.end()) {
inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
}
}
}
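As a concrete example of the maps built here: if an op named "conv1" (a hypothetical name) has inputIndexes = [0] and outputIndexes = [1], the callback pass fills _opInfo["conv1"] with its live input/output tensor pointers, and the loop above then sets _tensorMap[0] and _tensorMap[1] to those pointers. _tensorMap therefore maps every tensor index in the model to the corresponding Tensor* of the inference session.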
TensorStatistic class
The Calibration instance holds the map
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
so every tensor has a TensorStatistic object associated with it; the statistics gathered for that tensor during quantization are stored there.
TensorStatistic.hpp
class TensorStatistic {
public:
TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, int binNumber = 2048, GET_THRESHOLD_METHOD thresholdMethod = THRESHOLD_KL);
. . .
void updateRange();
void resetDistribution();
void updateDistribution();
void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod);
void setChannelWise(bool mergeChannel);
std::vector<float> finishAndCompute();
// only this one for ADMM
std::vector<float> computeScaleADMM();
private:
int _computeThreshold(const std::vector<float>& distribution);
std::vector<std::pair<float, float>> mRangePerChannel; // per-channel (min, max) of this tensor
// let maxValue denote max(|min|, |max|) on a channel
std::vector<float> mIntervals; // per-channel interval = mBinNumber / maxValue
std::vector<bool> mValidChannel; // true if the channel's maxValue > 0.00001f
std::vector<std::vector<float>> mDistribution; // per channel: the float data is uniformly mapped into [0, mBinNumber) and a histogram of bin counts is kept
std::shared_ptr<MNN::Tensor> mHostTensor; // host (CPU) copy of the tensor
const MNN::Tensor* mOriginTensor; // the original tensor
int mBinNumber; // 2048 by default
bool mUpdatedDistributionFlag = false;
bool mUpdatedRangeFlags = false;
bool mMergeChannel = true;
std::string mName;
GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL;
};
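To make the interval/bin bookkeeping concrete, here is a small standalone sketch of the mapping (plain C++, not MNN code; the constants are made up for illustration). It mirrors the arithmetic that resetDistribution and updateDistribution perform below:
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int binNumber = 2048;                  // mBinNumber
    const float maxValue = 6.3f;                 // max(|min|, |max|) seen on one channel
    const float interval = binNumber / maxValue; // mIntervals[c] = 2048 / 6.3 ≈ 325.1
    // a feature value x falls into histogram bin |x| * interval, clamped to the last bin
    for (float x : {0.001f, 3.15f, 6.3f, 100.0f}) {
        int index = std::min(static_cast<int>(std::fabs(x) * interval), binNumber - 1);
        std::printf("x = %7.3f -> bin %d\n", x, index);
    }
    return 0;
}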
runQuantizeModel function
Dispatches to one of the two feature quantization methods, KL or ADMM, then updates the scales and inserts the dequantization ops.
void Calibration::runQuantizeModel() {
if (_featureQuantizeMethod == "KL") { // 如果配置文件里是 KL散度做
_computeFeatureScaleKL();
} else if (_featureQuantizeMethod == "ADMM") { // ADMM selected in the config
_computeFeatureScaleADMM();
}
_updateScale();
_insertDequantize();
}
_computeFeatureScaleKL function
void Calibration::_computeFeatureScaleKL() {
_computeFeatureMapsRange(); // compute the data range of each feature map
_collectFeatureMapsDistribution(); // compute the data distribution of each feature map
_scales.clear();
for (auto& iter : _featureInfo) {
AUTOTIME;
_scales[iter.first] = iter.second->finishAndCompute(); // the scale factors
}
//_featureInfo.clear();//No need now
}
_computeFeatureMapsRange function
Runs inference on the specified image set; before and after each op executes, the per-channel minimum and maximum of each feature map are updated in the tensor's associated TensorStatistic instance. updateRange itself is implemented in TensorStatistic.cpp.
void Calibration::_computeFeatureMapsRange() {
// feed input data according to input images
int count = 0;
for (const auto& img : _imgaes) { // for each image file
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedRangeFlags();
}
count++;
// read the image into the input tensor
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
// set callbacks that run before and after each op during inference
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange(); // track the min/max of the input tensor
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange(); // track the min/max of the output tensor
}
}
return true;
};
// run one inference pass
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
updateRange function
Tracks the per-channel minimum and maximum of the feature data.
void TensorStatistic::updateRange() {
if (mUpdatedRangeFlags) {
return;
}
mUpdatedRangeFlags = true;
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n) {
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
for (int c = 0; c < channel; ++c) {
int cIndex = c;
if (mMergeChannel) {
cIndex = 0;
}
auto minValue = mRangePerChannel[cIndex].first;
auto maxValue = mRangePerChannel[cIndex].second;
auto dataChannel = dataBatch + c * mHostTensor->stride(1);
for (int v = 0; v < area; ++v) {
minValue = std::min(minValue, dataChannel[v]);
maxValue = std::max(maxValue, dataChannel[v]);
}
mRangePerChannel[cIndex].first = minValue;
mRangePerChannel[cIndex].second = maxValue;
}
}
}
_collectFeatureMapsDistribution function
Computes the data distribution of every tensor; resetDistribution and updateDistribution are implemented in TensorStatistic.cpp.
void Calibration::_collectFeatureMapsDistribution() {
for (auto& iter : _featureInfo) {
iter.second->resetDistribution(); // clear the distribution bookkeeping in each TensorStatistic instance
}
// define two callbacks, run before and after each op executes
// feed input data according to input images
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution(); // update the distribution for this input tensor's TensorStatistic
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution(); // update the distribution for this output tensor's TensorStatistic
}
}
return true;
};
int count = 0;
for (const auto& img : _imgaes) { // run one inference pass for every image
count++;
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedDistributionFlag();
}
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); // read the image into the input tensor
_interpreter->runSessionWithCallBackInfo(_session, before, after); // run inference
MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
resetDistribution and updateDistribution functions
void TensorStatistic::resetDistribution() { // reset the distribution
for (int i = 0; i < mIntervals.size(); ++i) { // process each channel
int cIndex = i;
if (mMergeChannel) {
cIndex = 0;
}
// maxValue is the largest absolute value of the data seen on this channel
auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
mValidChannel[cIndex] = maxValue > 0.00001f; // the channel counts only if maxValue is large enough
mIntervals[cIndex] = 0.0f;
if (mValidChannel[cIndex]) {
// mIntervals is the scale that uniformly maps the original floats onto [0, mBinNumber)
mIntervals[cIndex] = (float)mBinNumber / maxValue;
}
}
for (auto& c : mDistribution) {
std::fill(c.begin(), c.end(), 1.0e-07); // initialize mDistribution to a tiny value close to zero
}
}
void TensorStatistic::updateDistribution() {
if (mUpdatedDistributionFlag) {
return;
}
mUpdatedDistributionFlag = true;
// fetch the tensor's data and shape
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n) {
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0); // data of this batch
for (int c = 0; c < channel; ++c) { // process each channel
int cIndex = c;
if (mMergeChannel) {
cIndex = 0;
}
if (!mValidChannel[cIndex]) {
continue;
}
auto multi = mIntervals[cIndex]; // the interval (scale) for this channel
auto target = mDistribution[cIndex].data(); // this channel's histogram
auto dataChannel = dataBatch + c * mHostTensor->stride(1); // this channel's feature map
for (int v = 0; v < area; ++v) {
auto data = dataChannel[v]; // data is one value of the feature map
if (data == 0) {
continue;
}
int index = static_cast<int>(fabs(data) * multi); // uniformly map the value to an integer bin index
index = std::min(index, mBinNumber - 1); // clamp to at most mBinNumber - 1
target[index] += 1.0f; // accumulate into mDistribution
}
}
}
}
finishAndCompute function
Computes the scale parameters that map float32 to int8.
std::vector<float> TensorStatistic::finishAndCompute() {
std::vector<float> scaleValue(mDistribution.size(), 0.0f);
. . .
for (int c = 0; c < mDistribution.size(); ++c) { // for each channel
if (!mValidChannel[c]) {
continue;
}
float sum = 0.0f;
auto& distribution = mDistribution[c];
std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); // sum
std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); // divide by the sum: normalize to a probability distribution
auto threshold = _computeThreshold(distribution); // compute the clipping threshold
scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / 127.0; // compute the scale factor
}
return scaleValue;
}
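A quick worked example of the last line: if maxValue on a channel is 6.3, the interval is 2048 / 6.3 ≈ 325.1. If _computeThreshold returns 1024, then scale = (1024 + 0.5) / 325.1 / 127 ≈ 0.0248, so int8 code 127 represents the float value 127 × 0.0248 ≈ 3.15; everything beyond that threshold (here roughly half of the observed range) is clipped during quantization.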
_computeThreshold function
Searches for a threshold between targetBinNums and mBinNumber that minimizes the KL divergence.
int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
const int targetBinNums = 128;
int threshold = targetBinNums; // defaults to 128
if (mThresholdMethod == THRESHOLD_KL) { // KL-divergence-based search
float minKLDivergence = 10000.0f;
float afterThresholdSum = 0.0f;
//targetBinNums=128, mBinNumber=2048
std::for_each(distribution.begin() + targetBinNums, distribution.end(),
[&](float n) { afterThresholdSum += n; });
for (int i = targetBinNums; i < mBinNumber; ++i) { // scan candidate thresholds i from 128 to 2047
std::vector<float> quantizedDistribution(targetBinNums);
std::vector<float> candidateDistribution(i);
std::vector<float> expandedDistribution(i);
// candidateDistribution keeps bins [0, i) and folds the mass of [i, mBinNumber) into the last bin i-1
std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
candidateDistribution[i - 1] += afterThresholdSum;
afterThresholdSum -= distribution[i];
// the bin width when a space of size i is shrunk down to targetBinNums bins
const float binInterval = (float)i / (float)targetBinNums;
// merge i bins into targetBinNums bins (j runs from 0 to 127)
// map the distribution over [0, i) into the smaller space [0, targetBinNums); the (lossy) result goes into quantizedDistribution
for (int j = 0; j < targetBinNums; ++j) {
// bin [j, j+1) in the targetBinNums space
// [start, end] is the corresponding range in the i-bin space
const float start = j * binInterval;
const float end = start + binInterval;
// round up
const int leftUpper = static_cast<int>(std::ceil(start));
if (leftUpper > start) {
const float leftScale = leftUpper - start;
quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
} // round down
const int rightLower = static_cast<int>(std::floor(end));
if (rightLower < end) {
const float rightScale = end - rightLower;
quantizedDistribution[j] += rightScale * distribution[rightLower];
}
// add the fully covered bins, completing the distribution over [0, targetBinNums)
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
[&](float n) { quantizedDistribution[j] += n; });
}
// expand the targetBinNums bins back onto the i-bin space
// the result goes into expandedDistribution
for (int j = 0; j < targetBinNums; ++j) {
const float start = j * binInterval;
const float end = start + binInterval;
float count = 0;
const int leftUpper = static_cast<int>(std::ceil(start));
float leftScale = 0.0f;
if (leftUpper > start) {
leftScale = leftUpper - start;
if (distribution[leftUpper - 1] != 0) {
count += leftScale;
}
}
const int rightLower = static_cast<int>(std::floor(end));
float rightScale = 0.0f;
if (rightLower < end) {
rightScale = end - rightLower;
if (distribution[rightLower] != 0) {
count += rightScale;
}
}
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
if (n != 0) {
count += 1;
}
});
if (count == 0) {
continue;
}
const float toExpandValue = quantizedDistribution[j] / count;
if (leftUpper > start && distribution[leftUpper - 1] != 0) {
expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
}
if (rightLower < end && distribution[rightLower] != 0) {
expandedDistribution[rightLower] += toExpandValue * rightScale;
}
for (int k = leftUpper; k < rightLower; ++k) {
if (distribution[k] != 0) {
expandedDistribution[k] += toExpandValue;
}
}
}
// KL divergence formula: Sum(P[i] * log(P[i] / Q[i]))
const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
if (curKL < minKLDivergence) { // keep the smallest KL divergence and the threshold that achieves it
minKLDivergence = curKL;
threshold = i;
}
}
} else if (mThresholdMethod == THRESHOLD_MAX) {
threshold = mBinNumber - 1;
} else {
// TODO, support other method
MNN_ASSERT(false);
}
return threshold;
}
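_klDivergence itself is not shown in this excerpt; going only by the formula quoted in the comment above, a minimal sketch could look like the following (the zero-handling details of the real implementation may differ):
#include <cmath>
#include <vector>

static float _klDivergence(const std::vector<float>& candidate, const std::vector<float>& expanded) {
    // Sum(P[i] * log(P[i] / Q[i])): P is the candidate (reference) distribution,
    // Q the merged-then-expanded one; bins where either side is zero are skipped
    float result = 0.0f;
    const int size = static_cast<int>(candidate.size());
    for (int i = 0; i < size; ++i) {
        if (candidate[i] != 0.0f && expanded[i] != 0.0f) {
            result += candidate[i] * std::log(candidate[i] / expanded[i]);
        }
    }
    return result;
}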
Weight quantization: the _updateScale function
Quantizes the weights and rewrites each quantizable op into its int8 form.
void Calibration::_updateScale() {
for (const auto& op : _originaleModel->oplists) {
const auto opType = op->type;
// only Convolution, ConvolutionDepthwise and Eltwise ops are handled
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Eltwise) {
continue;
}
auto tensorsPair = _opInfo.find(op->name);
if (tensorsPair == _opInfo.end()) {
MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
}
if (opType == MNN::OpType_Eltwise) { // Eltwise op
auto param = op->main.AsEltwise();
// Now only support AddInt8
if (param->type != MNN::EltwiseType_SUM) {
continue;
}
// fetch the scales computed earlier
const auto& inputScale0 = _scales[tensorsPair->second.first[0]];
const auto& inputScale1 = _scales[tensorsPair->second.first[1]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
const int outputScaleSize = outputScale.size();
std::vector<float> outputInvertScale(outputScaleSize);
Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
op->type = MNN::OpType_EltwiseInt8; // switch the op type to the quantized version
op->main.Reset(); // rebuild the op's parameters from scratch
op->main.type = MNN::OpParameter_EltwiseInt8;
auto eltwiseInt8Param = new MNN::EltwiseInt8T;
auto input0ScaleParam = new MNN::QuantizedFloatParamT;
auto input1ScaleParam = new MNN::QuantizedFloatParamT;
auto outputScaleParam = new MNN::QuantizedFloatParamT;
input0ScaleParam->tensorScale = inputScale0;
input1ScaleParam->tensorScale = inputScale1;
outputScaleParam->tensorScale = outputInvertScale;
// the scale parameters needed to recover float32 from int8
eltwiseInt8Param->inputQuan0 = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
eltwiseInt8Param->inputQuan1 = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
eltwiseInt8Param->outputQuan = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
op->main.value = eltwiseInt8Param;
continue;
}
// below is Conv/DepthwiseConv
const auto& inputScale = _scales[tensorsPair->second.first[0]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
auto param = op->main.AsConvolution2D();
param->common->inputCount = tensorsPair->second.first[0]->channel();
const int channles = param->common->outputCount;
const int weightSize = param->weight.size();
param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); // reset the parameters
auto& quantizedParam = param->symmetricQuan; // then build the int8 parameters
quantizedParam->scale.resize(channles);
quantizedParam->weight.resize(weightSize);
quantizedParam->bias.resize(channles);
// weight and bias are quantized separately for conv and depthwise conv
if (opType == MNN::OpType_Convolution) {
QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_ConvInt8;
} else if (opType == MNN::OpType_ConvolutionDepthwise) {
QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_DepthwiseConvInt8;
}
if (param->common->relu6) {
param->common->relu = true;
param->common->relu6 = false;
} // with the int8 parameters in place, clear the original float parameters
param->weight.clear();
param->bias.clear();
}
}
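Helper::invertData, used above for the Eltwise output scale and again in _insertDequantize below, is not listed in this excerpt; judging from its call sites it inverts a scale vector element-wise, roughly like this sketch (the real helper may differ in details):
// dst[i] = 1 / src[i]; zero scales are kept as zero to avoid dividing by zero
static void invertData(float* dst, const float* src, int size) {
    for (int i = 0; i < size; ++i) {
        dst[i] = (src[i] == 0.0f) ? 0.0f : 1.0f / src[i];
    }
}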
QuantizeConvPerChannel function
int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight,
int32_t* quantizedBias, float* scale, const std::vector<float>& inputScale,
const std::vector<float>& outputScale, std::string method, bool mergeChannel) {
const int inputChannels = inputScale.size();
const int outputChannels = outputScale.size();
const int icXoc = inputChannels * outputChannels;
DCHECK(size % icXoc == 0) << "Input Data Size Error!";
std::vector<float> quantizedWeightScale(outputChannels);
float inputScalexWeight = 1.0f;
if (mergeChannel) {
if (method == "MAX_ABS"){
SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
inputScalexWeight = inputScale[0];
} else {
const int kernelSize = size / icXoc;
const int ocStride = size / outputChannels;
std::vector<float> weightMultiByInputScale(size);
for (int oc = 0; oc < outputChannels; ++oc) {
for (int ic = 0; ic < inputChannels; ++ic) {
for (int i = 0; i < kernelSize; ++i) {
const int index = oc * ocStride + ic * kernelSize + i;
weightMultiByInputScale[index] = inputScale[ic] * weight[index];
}
}
}
if (method == "MAX_ABS"){
SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels);
}
}
for (int i = 0; i < outputChannels; ++i) {
if (outputScale[i] == 0) {
scale[i] = 0.0f;
} else {
scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale[0];
}
}
if (bias) {
for (int i = 0; i < outputChannels; ++i) {
if (inputScalexWeight == 0 || quantizedWeightScale[i] == 0) {
quantizedBias[i] = 0;
} else {
quantizedBias[i] = static_cast<int32_t>(bias[i] / (inputScalexWeight * quantizedWeightScale[i]));
}
}
}
return 0;
}
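SymmetricQuantizeWeight, called above, is the "MAX_ABS" path described in the config section: per output channel, find the largest absolute weight, derive a scale so that it maps to the int8 bound 127, and round. A minimal sketch of that idea (not the actual MNN implementation, which may differ in details):
#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric per-channel weight quantization: `weight` holds `channels` equal-sized
// blocks; within each block, scale so that max|w| maps to 127.
static void symmetricQuantizeWeightSketch(const float* weight, int size, int8_t* quantizedWeight,
                                          float* scale, int channels) {
    const int stride = size / channels;
    for (int c = 0; c < channels; ++c) {
        const float* w = weight + c * stride;
        float maxAbs = 0.0f;
        for (int i = 0; i < stride; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(w[i]));
        }
        scale[c] = maxAbs / 127.0f; // float value represented by int8 code 1
        for (int i = 0; i < stride; ++i) {
            const float q = (scale[c] == 0.0f) ? 0.0f : std::roundf(w[i] / scale[c]);
            quantizedWeight[c * stride + i] = static_cast<int8_t>(std::fmin(127.0f, std::fmax(-127.0f, q)));
        }
    }
}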
QuantizeDepthwiseConv function
int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight,
int32_t* quantizedBias, float* scale, const std::vector<float>& inputScale,
const std::vector<float>& outputScale, std::string method) {
const int inputChannels = inputScale.size();
const int outputChannels = outputScale.size();
DCHECK(inputChannels == outputChannels) << "Input Data Size Error!";
std::vector<float> quantizedWeightScale(inputChannels);
if (method == "MAX_ABS") {
SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels);
}
else if (method == "ADMM") {
QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels);
}
for (int c = 0; c < inputChannels; ++c) {
const int index = c;
if (outputScale[c] == 0) {
scale[index] = 0.0f;
} else {
scale[index] = inputScale[c] * quantizedWeightScale[c] / outputScale[c];
}
}
if (bias) {
for (int i = 0; i < outputChannels; ++i) {
if (inputScale[i] == 0 || quantizedWeightScale[i] == 0) {
quantizedBias[i] = 0;
} else {
quantizedBias[i] = static_cast<int32_t>(bias[i] / (inputScale[i] * quantizedWeightScale[i]));
}
}
}
return 0;
}
Adding dequantization: the _insertDequantize function
Inserts the conversion ops: ops that do not support int8 need conversion layers around them. A quantized op outputs integers, but a float op that consumes that output needs float input, so a dequantization step must sit in between.
Roughly, the procedure is:
1. Walk the ops and collect the input/output tensors of every quantized op.
2. Walk the ops again and, for every op that cannot be quantized:
if one of its input tensors is produced by a quantizable op, insert a dequantization (Int8ToFloat) op before that input;
if one of its output tensors feeds a quantizable op, insert a quantization (FloatToInt8) op after that output.
3. Append a dequantization op after each of the graph's output tensors, so that the final results are float.
void Calibration::_insertDequantize() {
// Search All Int Tensors
std::set<int> int8Tensors;
std::set<int> int8Outputs;
for (auto& op : _originaleModel->oplists) { // walk all ops
if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) { // if the op is int8-quantized, record its inputs and outputs
for (auto index : op->inputIndexes) { // record its inputs
int8Tensors.insert(index);
}
for (auto index : op->outputIndexes) { // record its outputs
int8Tensors.insert(index);
int8Outputs.insert(index);
}
}
}
for (auto& op : _originaleModel->oplists) { // drop every tensor that some op consumes, leaving only true graph outputs
for (auto index : op->inputIndexes) {
auto iter = int8Outputs.find(index);
if (iter != int8Outputs.end()) {
int8Outputs.erase(iter);
}
}
}
// Insert converts for ops that do not support int8: conversion layers go before/after such ops
for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
auto op = iter->get();
const auto opType = op->type;
const auto name = op->name;
// check whether is output op
// if yes, insert a dequantization op after this op; ops that do support int8 are skipped
if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
// this is quantized op
iter++;
continue;
}
auto& inputIndexes = op->inputIndexes;
const int inputSize = inputIndexes.size();
// insert dequantization op before this op
for (int i = 0; i < inputSize; ++i) { // for each input tensor of this op
const auto curInputIndex = inputIndexes[i];
if (int8Tensors.find(curInputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
continue;
}
auto input = _tensorMap[curInputIndex];
auto inputOpScale = _scales[input];
// construct a new op that converts int8 back to float
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);
// fill in the parameters
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = inputOpScale;
dequantizationOp->inputIndexes.push_back(curInputIndex);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
_originaleModel->tensorName.push_back(dequantizationOp->name);
// reset current op's input index at i: wire the new Int8ToFloat op's output into this op's input
inputIndexes[i] = dequantizationOp->outputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
iter++;
}
iter++;
// LOG(INFO) << "insert quantization op after this op if neccessary";
// insert quantization op after this op if neccessary
// for each output tensor of this op
for (int i = 0; i < op->outputIndexes.size(); ++i) {
const auto outputIndex = op->outputIndexes[i];
if (int8Tensors.find(outputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
continue;
}
auto output = _tensorMap[outputIndex];
auto curScale = _scales[output];
// construct one quantization op(FloatToInt8)
// create the quantization (FloatToInt8) op
auto quantizationOp = new MNN::OpT;
quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
quantizationOp->type = MNN::OpType_FloatToInt8;
auto quantizationParam = new MNN::QuantizedFloatParamT;
quantizationOp->main.value = quantizationParam;
// fill in the parameters
const int channels = curScale.size();
std::vector<float> quantizationScale(channels);
Helper::invertData(quantizationScale.data(), curScale.data(), channels);
quantizationParam->tensorScale = quantizationScale;
// wire the quantization op in after this op
quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
quantizationOp->outputIndexes.push_back(outputIndex);
_originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
_originaleModel->tensorName[outputIndex] = quantizationOp->name;
op->outputIndexes[i] = quantizationOp->inputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
iter++;
}
}
// Insert Turn float Op for output
for (auto index : int8Outputs) { // append dequantization after each graph output tensor: the final outputs must be float
// construct new op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = _scales[_tensorMap[index]];
dequantizationOp->inputIndexes.push_back(index);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
auto originTensorName = _originaleModel->tensorName[index];
_originaleModel->tensorName[index] = dequantizationOp->name;
_originaleModel->tensorName.emplace_back(originTensorName);
_originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
}
}
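Schematically, suppose a (hypothetical) graph conv → softmax, where conv was converted to ConvInt8 but softmax stays float. The loop above rewires it to:
conv (ConvInt8) → ___Int8ToFloat___For_softmax0 → softmax (float)
and if softmax in turn fed another quantized op, a FloatToInt8 op would be spliced onto its output the same way. Finally, every int8 tensor that is also a graph output gets a trailing ___Int8ToFloat___ op so callers receive floats.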
_computeFeatureScaleADMM function
The theory behind ADMM is explained here: https://zhuanlan.zhihu.com/p/81243626
Computes the feature scales with ADMM; computeScaleADMM is implemented in TensorStatistic.cpp.
void Calibration::_computeFeatureScaleADMM() {
// feed input data according to input images
int count = 0;
std::vector<int> oneImageTensorDims = _inputTensorDims;
oneImageTensorDims[0] = 1;
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
auto dimType = MNN::Tensor::CAFFE_C4;
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
dimType = MNN::Tensor::TENSORFLOW;
}
for (const auto& img : _imgaes) {
auto curPtr = _inputTensor->host<float>() + count * _inputTensor->stride(0);
std::shared_ptr<MNN::Tensor> tensorWarp(
MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType));
Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get());
count++;
MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
_scales.clear();
const int totalLayers = _featureInfo.size();
count = 0;
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\n");
}
computeScaleADMM function
std::vector<float> TensorStatistic::computeScaleADMM() {
std::vector<float> scaleValue(mOriginTensor->channel(), 0.0f);
const int count = mOriginTensor->elementSize();
float max = 0;
const float bound = 127;
const float* originData = mOriginTensor->host<float>();
for (int i = 0; i < count; i++) {
float absData = std::fabs(originData[i]);
if (absData > max) {
max = absData;
}
}
float alpha = max / (bound * 2.5);
// DLOG(INFO) << "alpha init: " << alpha;
const int maxStep = 300;
float sum1 = 0;
float sum2 = 0;
float invAlpha;
for (int i = 0; i < maxStep; i++) {
sum1 = 0;
sum2 = 0;
invAlpha = 1 / alpha;
for (int i = 0; i < count; i++) {
auto origin = originData[i];
auto dataQuant = std::roundf(origin * invAlpha);
dataQuant = std::fmin(bound, std::fmax(-bound, dataQuant));
sum1 += (dataQuant * origin);
sum2 += (dataQuant * dataQuant);
}
alpha = sum1 / sum2;
}
// DLOG(INFO) << "alpha final: " << alpha;
std::fill(scaleValue.begin(), scaleValue.end(), alpha);
return scaleValue;
}
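The loop above is a simple alternating minimization of ||w - α·q||² over the int8 grid: with α fixed, the best codes are q_i = clip(round(w_i / α), -127, 127); with q fixed, the least-squares optimal scale is α = Σ q_i·w_i / Σ q_i², which is exactly what sum1 / sum2 computes. After 300 iterations the resulting α is used as a single scale shared by all channels (scaleValue is filled with the same value everywhere).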