double转float的c语言实现(round to nearest even)

关于double和float类型的定义,网上有很多相关的资料,这里仅贴出来链接:https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html

IEEE754浮点数标准及浮点型和整型之间的转换(a3192048的博客,CSDN博客):https://blog.csdn.net/a3192048/article/details/106662693?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7Edefault-1.no_search_link&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7Edefault-1.no_search_link

浮点数由三部分组成、符号位、指数位、尾数位。

浮点数的特殊数需要单独考虑

nan(指数位均为1,尾数位不全为0)

inf(指数位均为1,尾数位全为0)

denormal(指数位全为0)

round to nearest even(为了防止每次舍入造成数据分布的整体偏移,IEEE设计了round to nearest even舍入模式:根据被舍弃的部分判断保留的最后一位是否进位1。需要进位的有两种情况:1、舍弃部分大于最低保留位的一半(即大于二进制的100...0);2、舍弃部分恰好等于一半(二进制的100...0),并且保留的最后一位为1,此时进位后最后一位变为偶数)

下面开始计算流程:

  1. 符号位,右移63位再左移31位到float的符号位

    1. dst += ((data & 0x8000 0000 0000 0000) >> 63) << 31

  2. 指数位,exp_f = exp_d - 1023 + 127

    1. exp_d = data & 0x7ff0 0000 0000 0000 >> 52,取得指数位

    2. exp_d in [874, 896]的部分需要记录,denormal_shift = min(max(-exp_f + 1, 0), 23),后面用来操作非规格数。

    3. 初始化一个tail_mask用来生成尾数掩码,tail_mask=0x007fffff

    4. 其中指数位exp_f > 255为溢出数,溢出到inf,尾数将要写0,生成tail_mask = exp_f > 255 ? 0 : tail_mask

    5. 指数位exp_f < -23也为溢出数(下溢),溢出到0,尾数将要写0,在2.4的结果上继续更新tail_mask = exp_f < -23 ? 0 : tail_mask

    6. 之后指数位限制在[0, 255]之间,exp = min(max(exp_f, 0), 255)

    7. dst += exp << 23

    8. double指数位为0x7ff的有可能为nan,需要结合尾数位来判断

  3. 尾数位

    1. data & 0x000f ffff ffff ffff,取得double尾数位

    2. 如果尾数位不全为0,结合2.8判别出nan,得到nan掩码nan_mask = (exp_d == 0x7ff && tail != 0x0) ? 0x400000 : 0

    3. 尾数位右移29 + denormal_shift位之后,与2.3~2.5得到的tail_mask做与运算

    4. dst += tail

  4. denormal位

    1. dst += (0x00800000 >> denormal_shift) & 0x007fffff

  5. rn位

    1. 准备float最后一个尾数位,last_bit = 0x00000001 & dst

    2. 准备double截掉的位,rn_mask = ((0x0000000020000000 << denormal_shift) - 1) & d_a

    3. 准备double待比较数,rn_base = (0x0000000010000000 << denormal_shift)

    4. 进位准则1:rn_mask > rn_base ? 1 : 0

    5. 进位准则2:rn_mask == rn_base && last_bit ? 1 : 0

    6. 两个进位准则的结果分别与2.3~2.5得到的tail_mask做与运算,判断是否实际进位

    7. dst += (judge1 + judge2) & tail_mask

  6. nan位

    1. 现在nan溢出到inf了,需要和3.2的nan掩码相加,dst += nan_mask

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#define U64 uint64_t
#define U32 uint32_t

#define GREEN        "\033[0;32;32m"
#define RED          "\033[0;32;31m"
#define NONE         "\033[0m"

// Shifts the 64-bit value `a` left by `len` bit positions.
//
// The shift is performed on the 64-bit operand itself. Multiplying by
// `(0x1 << len)` would evaluate the shift in `int`, which is undefined or
// wrong once `len` reaches 31.
//
// a:   value to shift.
// len: number of bit positions. Values <= 0 return `a` unchanged and
//      values >= 64 return 0, because every bit has been shifted out.
// Returns `a` shifted left by `len` bits, truncated to 64 bits.
uint64_t move_left64(uint64_t a, int len) {
    if (len <= 0) {
        return a;
    }
    if (len >= 64) {
        return 0;
    }
    return a << len;
}
// Shifts the 64-bit value `a` right by `len` bit positions (logical shift).
//
// The shift is performed on the 64-bit operand itself. Dividing by
// `(0x1 << len)` would evaluate the shift in `int`, which is undefined or
// wrong once `len` reaches 31.
//
// a:   value to shift.
// len: number of bit positions. Values <= 0 return `a` unchanged and
//      values >= 64 return 0, because every bit has been shifted out.
// Returns `a` shifted right by `len` bits, zero-filled from the top.
uint64_t move_right64(uint64_t a, int len) {
    if (len <= 0) {
        return a;
    }
    if (len >= 64) {
        return 0;
    }
    return a >> len;
}

// Converts a double to a float by operating on the IEEE-754 bit patterns
// (round to nearest, ties to even) and compares the result with the
// compiler's own `(float)d_a` conversion.
//
// The float bit pattern is accumulated additively in `dst`:
//   sign + clamped exponent + hidden bit (subnormal results only)
//   + truncated mantissa + rounding increment + NaN quiet bit.
// Using `+` rather than `|` lets a rounding carry ripple from the mantissa
// into the exponent, which is what turns the largest finite values into inf.
//
// NaN inputs always yield sign + 0x7fc00000; the payload is not carried over.
//
// On a match the two bit patterns and "pass!" are printed; otherwise every
// intermediate value is dumped followed by "wrong!".
//
// d_a: the value to convert.
// Returns the float whose bit pattern is the hand-computed `dst`.
float test(double d_a) {
    static_assert(sizeof(double) == sizeof(uint64_t), "double must be 64 bits wide");
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");

    // Same escape sequences as the file-level GREEN / RED / NONE macros, kept
    // local so the function is self-contained.
    constexpr const char *kGreen = "\033[0;32;32m";
    constexpr const char *kRed = "\033[0;32;31m";
    constexpr const char *kNone = "\033[0m";

    // Read the raw bits with memcpy; dereferencing a casted pointer would
    // violate the strict-aliasing rule.
    uint64_t bits_d_a = 0;
    std::memcpy(&bits_d_a, &d_a, sizeof(bits_d_a));

    uint32_t dst = 0x000000;

    // 1. Sign: bit 63 of the double becomes bit 31 of the float.
    const uint64_t sign_d_a = (bits_d_a & 0x8000000000000000ULL) >> 63;
    const uint32_t sign_f_a = static_cast<uint32_t>(sign_d_a) << 31;
    dst += sign_f_a;

    // 2. Exponent: re-bias from 1023 to 127 and clamp to the float field [0, 255].
    const uint64_t exp_d_a = (bits_d_a & 0x7ff0000000000000ULL) >> 52;
    const int32_t exp_rebiased = static_cast<int32_t>(exp_d_a) - 1023 + 127;
    const uint32_t exp_f_a = static_cast<uint32_t>(std::max(std::min(exp_rebiased, 0xff), 0x0)) << 23;
    // Extra right shift of the significand when the result is subnormal.
    // The clamp is 25 rather than 23: a shift of 24 leaves only the hidden bit
    // as the rounding bit (values between half and one minimum subnormal), and
    // a shift of 25 puts everything strictly below half of the minimum
    // subnormal, so the result rounds to zero.
    const int32_t denormal_shift = std::min(std::max(-exp_rebiased + 1, 0x0), 25);
    dst += exp_f_a;

    // A re-biased exponent of 255 or more means overflow to inf, or an inf/NaN
    // input: the mantissa must be written as zero. The boundary is >= 1151,
    // because exponent 255 itself is already the inf/NaN encoding in a float.
    const uint32_t tail_mask = (exp_d_a >= 1151) ? 0x0u : 0x007fffffu;

    // 3. Hidden bit: for subnormal results the implicit leading 1 becomes an
    // explicit mantissa bit. With no shift it lands on bit 23 and is masked off.
    const uint32_t denormal_bit = (0x800000u >> denormal_shift) & tail_mask;
    dst += denormal_bit;

    // 4. Mantissa: keep the top 23 of the 52 fraction bits (fewer if subnormal).
    const uint64_t tail_d_a = bits_d_a & 0x000fffffffffffffULL;
    const uint32_t tail_f_a = static_cast<uint32_t>(tail_d_a >> 29 >> denormal_shift) & tail_mask;
    dst += tail_f_a;

    // 5. Round to nearest even on the discarded low bits.
    const uint64_t rn_mask = (0x0000000020000000ULL << denormal_shift) - 1;  // discarded bits
    const uint64_t rn_base = 0x0000000010000000ULL << denormal_shift;        // exactly half an ulp

    // The hidden bit (bit 52) is part of the discarded field only when
    // denormal_shift >= 24; for smaller shifts rn_mask excludes it.
    const uint64_t rn = (tail_d_a | 0x0010000000000000ULL) & rn_mask;

    // Rule 1: more than half an ulp was discarded.
    const uint32_t rn_up_judge1 = ((rn > rn_base) ? 0x1u : 0x0u) & tail_mask;
    dst += rn_up_judge1;
    // Rule 2: exactly half was discarded and the kept LSB is odd. Rule 1 and
    // rule 2 are mutually exclusive, so reading the LSB after rule 1 is safe.
    const uint32_t last_bit = 0x00000001u & dst;
    const uint32_t rn_up_judge2 = ((rn == rn_base && last_bit) ? 0x1u : 0x0u) & tail_mask;
    dst += rn_up_judge2;

    // 6. NaN: exponent all ones with a non-zero mantissa. The mantissa was
    // masked to zero above (looks like inf), so set the quiet bit.
    const uint32_t nan_mask = (exp_d_a == 0x7ff && tail_d_a != 0) ? 0x400000u : 0x0u;
    dst += nan_mask;

    float dst_as_float = 0.0f;
    std::memcpy(&dst_as_float, &dst, sizeof(dst_as_float));

    const float reference = (float)d_a;
    uint32_t ref_bits = 0;
    std::memcpy(&ref_bits, &reference, sizeof(ref_bits));

    if (ref_bits == dst) {
        std::printf("dst: %x, %f,  line: %d\n", dst, dst_as_float, __LINE__);
        std::printf("ref: %x, %f,  line: %d\n", ref_bits, reference, __LINE__);
        std::printf("%spass!%s\n", kGreen, kNone);
    } else {
        const unsigned long long sign_d_print = sign_d_a;
        const unsigned long long exp_d_print = exp_d_a;

        std::printf("sign d: %llu\t %llx\t\t\t line: %d\n", sign_d_print, sign_d_print, __LINE__);
        std::printf("sign f: %u\t %x\t\t\t line: %d\n", sign_f_a, sign_f_a, __LINE__);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);

        std::printf("*******\n");
        std::printf("exp d: %llu\t %llx\t\t\t line: %d\n", exp_d_print, exp_d_print, __LINE__);

        std::printf("exp f: %u\t %x\t\t\t line: %d\n", exp_f_a, exp_f_a, __LINE__);
        std::printf("denormal shift:\t %d, line: %d\n", denormal_shift, __LINE__);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);

        std::printf("*******\n");
        std::printf("tail_mask:\t %x, line: %d\n", tail_mask, __LINE__);

        std::printf("denormal bit:\t %x\n", denormal_bit);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);

        std::printf("*******\n");
        std::printf("tail f a: %x\t\t\t\t line: %d\n", tail_f_a, __LINE__);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);

        std::printf("rn_up_judge1: %x\t\t\t\t line: %d\n", rn_up_judge1, __LINE__);
        std::printf("rn_up_judge2: %x\t\t\t\t line: %d\n", rn_up_judge2, __LINE__);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);
        std::printf("nan mask: %u\t %x\t\t\t line: %d\n", nan_mask, nan_mask, __LINE__);
        std::printf("dst  f: %2.3f\t %x\t\t\t line: %d\n", dst_as_float, dst, __LINE__);

        std::printf("dst: %x, %f,  line: %d\n", dst, dst_as_float, __LINE__);
        std::printf("ref: %x, %f,  line: %d\n", ref_bits, reference, __LINE__);
        std::printf("%swrong!\n%s", kRed, kNone);
    }
    return dst_as_float;
}

// Runs test() on three hand-picked double bit patterns.
//
// The doubles are built from their bit patterns with memcpy; dereferencing an
// integer through a `double *` would violate the strict-aliasing rule.
int main() {
  const uint64_t patterns[] = {
      0x7ff0000000000006ULL,  // NaN: exponent all ones, non-zero mantissa
      0x36a7000000000000ULL,  // 1.4375 * 2^-149: below the halfway point of a subnormal step
      0x36a8000000000000ULL,  // 1.5 * 2^-149: exactly halfway, exercises ties-to-even
  };

  for (const uint64_t pattern : patterns) {
    double test_value = 0.0;
    std::memcpy(&test_value, &pattern, sizeof(test_value));
    test(test_value);
  }
}

 再贴几个有意思的网站:

IEEE-754 Floating Point Converter,直观显示float每一位的

Base Convert: IEEE 754 Floating Point,研究double2float流程的

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值