浮点数量化2^n定点数

版权声明:可以参考 亦可转载 请注明出处 https://blog.csdn.net/baidu_24281959/article/details/53138595

Target:

将浮点数量化到最近的2^n定点数

Code:

例如:量化为 3 bit,最小量化步长(绝对值最小的非零量化值)为 0.125,量化区间为 [-1, 0.5]

#include <iostream>
#include <cmath>
using namespace std;

// Returns the element count of `array`, computed as the total byte size of
// the referenced object divided by the byte size of its first element.
// The result is only an element count when `array` is a true built-in array
// passed by reference (not a pointer or a container).
template <typename Dtype>
int getArrayLen(Dtype& array)
{
	const std::size_t total_bytes = sizeof(array);
	const std::size_t element_bytes = sizeof(array[0]);
	return static_cast<int>(total_bytes / element_bytes);
}

// Quantizes `data` to the nearest representable signed power-of-two level.
//
// The representable levels are:
//   0,
//   +abs_min_2_pow * 2^k   for k = 0 .. exponent,
//   -abs_min_2_pow * 2^k   for k = 0 .. exponent + 1,
// where exponent = 2^(N-1) - 2. For N = 3 and abs_min_2_pow = 0.125 this is
// { -1, -0.5, -0.25, -0.125, 0, 0.125, 0.25, 0.5 }.
//
// Parameters:
//   data          - value to quantize.
//   N             - bit width; expected to be >= 2 and small enough that
//                   1 << (N-1) fits in an int.
//   abs_min_2_pow - magnitude of the smallest non-zero level; expected to be
//                   positive.
//
// Returns:
//   - 0 when `data` is 0 or its magnitude is below abs_min_2_pow / 2;
//   - the most negative / most positive level when `data` lies at or beyond
//     the corresponding end of the range (saturation);
//   - otherwise the closer of the two neighbouring levels, with exact
//     midpoints rounded away from zero;
//   - `data` itself when it is NaN.
template <typename Dtype>
Dtype quantization(const Dtype data, const int N, const Dtype abs_min_2_pow){
	if (data == Dtype(0))
		return Dtype(0);
	// NaN fails every comparison below and cannot be converted to an integer
	// exponent, so hand it back unchanged instead of invoking undefined behaviour.
	if (std::isnan(data))
		return data;

	const int exponent = (1 << (N - 1)) - 2;
	// Scale with ldexp rather than an integer shift: `1 << exponent` overflows
	// int as soon as N >= 7 (exponent == 62).
	const Dtype maxValue = std::ldexp(abs_min_2_pow, exponent);
	const Dtype minValue = -maxValue * Dtype(2);
	if (data <= minValue)
		return minValue;
	if (data >= maxValue)
		return maxValue;

	const Dtype sign_index = (data > 0) ? Dtype(1) : Dtype(-1);
	const Dtype fabs_data = std::fabs(data);
	// ilogb yields floor(log2(x)) exactly, avoiding the rounding error of
	// log(x) / log(2.0).
	const int mul = std::ilogb(fabs_data / abs_min_2_pow);
	if (mul < 0) {
		// Magnitude is below the smallest level: pick between 0 and that level.
		return (fabs_data < abs_min_2_pow / 2) ? Dtype(0) : sign_index * abs_min_2_pow;
	}

	// The power of two is only formed once mul is known to be non-negative,
	// and via ldexp so that large mul cannot overflow an int shift.
	const Dtype lowerbound = std::ldexp(abs_min_2_pow, mul);
	const Dtype upperbound = lowerbound * 2;
	const Dtype half_gap = (upperbound - lowerbound) / 2;
	return (fabs_data - lowerbound >= half_gap) ? sign_index * upperbound
	                                            : sign_index * lowerbound;
}


// Demo driver: quantizes a fixed list of sample values with a 3-bit width and
// a smallest step of 0.125, printing "input<TAB>quantized" for each sample.
int main(int argc, char *argv[])
{
	float samples[] = {0, -0.00001, -0.0624999, -0.0625, -0.0626, -0.124, -0.125, -0.126, -0.1874,
					-0.1875, -0.1876, -0.249, -0.25, -0.251, -0.499, -0.5, -0.501, -0.749, -0.75, -0.751,
					1.23, 2.4, -1.23, -2.4};

	const int bit_width = 3;
	const float smallest_step = 0.125f;
	const int sample_count = getArrayLen(samples);

	for (int idx = 0; idx != sample_count; ++idx) {
		const float value = samples[idx];
		std::cout << value << "\t" << quantization(value, bit_width, smallest_step) << std::endl;
	}
	return 0;
}
Output:

0       0
-1e-05  0
-0.0624999      0
-0.0625 -0.125
-0.0626 -0.125
-0.124  -0.125
-0.125  -0.125
-0.126  -0.125
-0.1874 -0.125
-0.1875 -0.25
-0.1876 -0.25
-0.249  -0.25
-0.25   -0.25
-0.251  -0.25
-0.499  -0.5
-0.5    -0.5
-0.501  -0.5
-0.749  -0.5
-0.75   -1
-0.751  -1
1.23    0.5
2.4     0.5
-1.23   -1
-2.4    -1




没有更多推荐了,返回首页