Float32转Float16

最新推荐文章于 2024-10-08 23:14:43 发布

123456木木

最新推荐文章于 2024-10-08 23:14:43 发布

阅读量958

点赞数 3

文章标签：开发语言 typescript c#

本文链接：https://blog.csdn.net/xunixianshi123/article/details/136566168

版权

C++ 内置数据类型

C#内置数据类型

但是float是32位，而在很多对精度要求不高的场景下，使用16位就完全足够了，因此half（半精度）数据类型应运而生：

半精度浮点数Half

精度比较：

16位Float精度：

数值计算公式为（规格化数）：(-1)^signbit * 2^(e) * (1+significantbits)，其中 e 为 5 位指数位的值减去偏移量 15，significantbits 为 10 位尾数所表示的小数（尾数位的值 / 2^10）

最大值为：0 11110 1111111111=(-1)^0 * 2^15 * (1+1-2^-10)=65504

最小正规格化值为：0 00001 0000000000=2^-14≈6.10 * 10^-5

//转载：
 /// <summary>
 /// Converts a single-precision float to a half-precision value and returns its two bytes.
 /// </summary>
 /// <param name="f">The 32-bit float to convert.</param>
 /// <returns>
 /// A 2-byte array: index 0 holds the low byte and index 1 the high byte of the 16-bit
 /// result (sign in bit 15, followed by expSize exponent bits and mSize mantissa bits).
 /// </returns>
 /// <remarks>
 /// Relies on expSize and mSize, which are declared outside this method
 /// (5 and 10 for the IEEE 754 binary16 layout).
 /// Rounding keeps the original rule: round up when the highest discarded bit is 1.
 /// Infinity and NaN are preserved, finite values too large for the target become
 /// infinity, and values too small for a normalized result become subnormals or zero.
 /// </remarks>
 static byte[] FloatToHalf(float f)
        {
            // Read the raw bit pattern as one integer so the field extraction below does
            // not depend on the byte order produced by BitConverter.GetBytes.
            uint bits = BitConverter.ToUInt32(BitConverter.GetBytes(f), 0);

            uint sign = (bits >> 16) & 0x8000u;        // sign bit moved to bit 15
            int srcExp = (int)((bits >> 23) & 0xFFu);  // biased 8-bit source exponent
            uint srcMant = bits & 0x007FFFFFu;         // 23-bit source fraction

            int bias = (1 << (expSize - 1)) - 1;       // target exponent bias
            int maxExp = (1 << expSize) - 1;           // all-ones exponent: Infinity / NaN
            int dropped = 23 - mSize;                  // fraction bits that do not fit

            uint exp;
            uint m;

            if (srcExp == 0xFF)
            {
                // Infinity or NaN: keep the class of the value.
                exp = (uint)maxExp;
                m = srcMant >> dropped;
                if (srcMant != 0 && m == 0)
                    m = 1; // NaN payload lived only in the dropped bits; keep it a NaN
            }
            else
            {
                // Rebase the exponent in int arithmetic so it cannot wrap around.
                int e = srcExp - 127 + bias;
                if (e >= maxExp)
                {
                    // Too large for the target format: saturate to infinity.
                    exp = (uint)maxExp;
                    m = 0;
                }
                else
                {
                    uint full;
                    int shift;
                    if (e > 0)
                    {
                        // Normalized result: the leading 1 stays implicit.
                        exp = (uint)e;
                        full = srcMant;
                        shift = dropped;
                    }
                    else
                    {
                        // Subnormal result: make the leading 1 explicit and shift it down
                        // by the amount the exponent fell below the smallest normal one.
                        exp = 0;
                        full = srcMant | 0x00800000u;
                        shift = dropped + 1 - e;
                    }

                    if (shift > 24)
                    {
                        // Every significant bit, including the rounding bit, is shifted out
                        // (this also covers source zeros and source subnormals).
                        m = 0;
                    }
                    else
                    {
                        m = full >> shift;
                        // Round up when the highest discarded bit is 1.
                        if (((full >> (shift - 1)) & 1u) == 1u)
                            m += 1;
                    }

                    if (m == (1u << mSize))
                    {
                        // Rounding overflowed the mantissa: carry into the exponent.
                        // Reaching maxExp here correctly yields infinity.
                        m = 0;
                        exp += 1;
                    }
                }
            }

            // Assemble sign | exponent | mantissa and emit low byte first.
            ushort result = (ushort)(sign | (exp << (15 - expSize)) | m);
            byte[] myByte = new byte[2];
            myByte[0] = (byte)(result & 0xFF);
            myByte[1] = (byte)(result >> 8);
            return myByte;
        }

    /**
     * The <code>HalfFloatUtils</code> class provides half-float (16-bit floating point) utilities.
     *
     * NOTE(review): only the static method signatures are visible here; the behavior
     * described below restates the original comments and should be confirmed against
     * the implementation.
     */
    class HalfFloatUtils {

        /**
         * Rounds a number to half-float number bits.
         * @param num - the number to round
         * @returns a number; presumably the 16-bit half-float bit pattern — TODO confirm
         */
        static roundToFloat16Bits(num: number): number;

        /**
         * Converts half-float number bits to a number.
         * @param float16bits - half float number bits
         * @returns a number; presumably the value those bits encode — TODO confirm
         */
        static convertToNumber(float16bits: number): number;
    }