# 浮点数量化为 2^n 定点数

Target:

Code:

例如：量化为 3 bit、最小非零幅值为 0.125 时，可表示的取值为 {-1, -0.5, -0.25, -0.125, 0, 0.125, 0.25, 0.5}，即区间为 [-1, 0.5]。

#include <cmath>
#include <cstddef>
#include <iostream>
using namespace std;

/**
 * Returns the number of elements in a built-in array.
 *
 * The parameter binds only to a reference to an array of known bound, so
 * passing a pointer or a container is rejected at compile time instead of
 * silently yielding sizeof(pointer) / sizeof(element).
 *
 * @param array  A built-in array; only its type is inspected.
 * @return The array's element count, converted to int.
 */
template <typename Dtype, std::size_t Len>
int getArrayLen(Dtype (&array)[Len])
{
    (void)array;  // the length comes from the deduced type, not the value
    return static_cast<int>(Len);
}

/**
 * Quantizes a floating-point value to the nearest level of an N-bit signed
 * power-of-two format.
 *
 * With E = 2^(N-1) - 2, the representable levels are 0,
 * +abs_min_2_pow * 2^k for k = 0..E, and -abs_min_2_pow * 2^k for k = 0..E+1.
 * For N = 3 and abs_min_2_pow = 0.125 that is
 * {-1, -0.5, -0.25, -0.125, 0, 0.125, 0.25, 0.5}.
 *
 * @param data           Value to quantize.
 * @param N              Bit width of the target format; expected to be >= 2.
 * @param abs_min_2_pow  Smallest non-zero magnitude; expected to be a positive
 *                       power of two.
 * @return The level nearest to data. A value exactly halfway between two
 *         neighbouring levels goes to the one with the larger magnitude.
 *         Values at or beyond the largest positive / most negative level
 *         saturate to that level. NaN is returned unchanged.
 */
template <typename Dtype>
Dtype quantization(const Dtype data, const int N, const Dtype abs_min_2_pow){
    if (data == Dtype(0))
        return Dtype(0);
    // NaN is the only value that compares unequal to itself; pass it through
    // rather than feeding it to the integer exponent computation below.
    if (data != data)
        return data;

    // ldexp scales by a power of two in floating point, so a large exponent
    // cannot overflow an int the way (1 << exponent) would.
    const int exponent = (1 << (N - 1)) - 2;
    const Dtype maxValue = std::ldexp(abs_min_2_pow, exponent);
    const Dtype minValue = -std::ldexp(abs_min_2_pow, exponent + 1);
    if (data <= minValue)
        return minValue;
    if (data >= maxValue)
        return maxValue;

    const Dtype sign_index = (data > 0) ? Dtype(1) : Dtype(-1);
    const Dtype fabs_data = std::fabs(data);

    // frexp writes ratio as m * 2^e with m in [0.5, 1), so e - 1 is exactly
    // floor(log2(ratio)). A ratio that underflowed to zero gives e == 0,
    // which lands in the mul < 0 branch below.
    int binary_exp = 0;
    std::frexp(fabs_data / abs_min_2_pow, &binary_exp);
    const int mul = binary_exp - 1;

    if (mul < 0) {
        // Below the smallest step: round to zero or to +/- abs_min_2_pow.
        return (fabs_data < abs_min_2_pow / 2) ? Dtype(0) : sign_index * abs_min_2_pow;
    }

    // The power of two is formed only once mul is known to be non-negative.
    const Dtype lowerbound = std::ldexp(abs_min_2_pow, mul);
    const Dtype upperbound = std::ldexp(abs_min_2_pow, mul + 1);
    const Dtype diff = (upperbound - lowerbound) / 2;
    return (fabs_data - lowerbound >= diff) ? sign_index * upperbound : sign_index * lowerbound;
}

// Demo driver: prints each sample value next to its 3-bit quantization
// (smallest step 0.125), one tab-separated pair per line.
int main(int argc, char *argv[])
{
    // Samples sit just below, on, and just above the rounding thresholds,
    // plus four values outside the representable range.
    float samples[] = {0, -0.00001, -0.0624999, -0.0625, -0.0626, -0.124, -0.125, -0.126, -0.1874,
        -0.1875, -0.1876, -0.249, -0.25, -0.251, -0.499, -0.5, -0.501, -0.749, -0.75, -0.751,
        1.23, 2.4, -1.23, -2.4};

    const int sampleCount = getArrayLen(samples);
    const float smallestStep = (float)0.125;

    int idx = 0;
    while (idx < sampleCount) {
        const float quantized = quantization(samples[idx], 3, smallestStep);
        std::cout << samples[idx] << "\t" << quantized << std::endl;
        ++idx;
    }
    return 0;
}
Output:

0       0
-1e-05  0
-0.0624999      0
-0.0625 -0.125
-0.0626 -0.125
-0.124  -0.125
-0.125  -0.125
-0.126  -0.125
-0.1874 -0.125
-0.1875 -0.25
-0.1876 -0.25
-0.249  -0.25
-0.25   -0.25
-0.251  -0.25
-0.499  -0.5
-0.5    -0.5
-0.501  -0.5
-0.749  -0.5
-0.75   -1
-0.751  -1
1.23    0.5
2.4     0.5
-1.23   -1
-2.4    -1