float-float 模拟CPU和GPU上double双精度计算

博客介绍了如何通过FLOAT2结构体在CPU和GPU上模拟双精度浮点数运算,包括加减乘除、开方、倒数等操作。FLOAT2使用两个单精度浮点数来近似表示双精度数值,适用于混合精度计算。测试程序展示了FLOAT2在计算根号2和根号3之和等运算中的应用。
摘要由CSDN通过智能技术生成

float-float

基于Lewis书中代码加工。可以通过使用两个float单精度浮点数模拟CPU和GPU上双精度double类型运算。

在需要混合精度计算逻辑中可以使用,保证特定计算的精度需求。


FLOAT2类型源码(float2.h)
#pragma once 
#ifdef __CUDACC__
#define __HOST__DEVICE__  __host__ __device__
#else 
#define __HOST__DEVICE__ 
#endif

#define __INLINE__ __HOST__DEVICE__ inline


struct FLOAT2{

float x32,y32;

__INLINE__ 
FLOAT2():x32(0.0f),y32(0.0f){}

__INLINE__ explicit 
FLOAT2(float x):x32(x),y32(0.0f){}

__INLINE__  explicit 
FLOAT2(float x,float y):x32(x),y32(y) {}

/**************************************************************
 *      类型转换
 * ************************************************************/
//类型转换: FLOAT2->float
__INLINE__  
operator float(){return x32;}

//类型转换: FLOAT2->double
__INLINE__ 
operator double(){return double(x32)+double(y32);}

//double->FLOAT2, 未考虑溢出
__INLINE__  explicit 
FLOAT2(double a)
{
    x32=a;
    y32=a-x32;
}

__INLINE__ static bool isnan(const FLOAT2 a64)
{
    return std::isnan(a64.x32) || std::isnan(a64.y32);
}
__INLINE__ static bool isinf(const FLOAT2 a64)
{
    return !isnan(a64) &&(std::isinf(a64.x32) || std::isinf(a64.y32));
}
__INLINE__ static bool isnormal(const FLOAT2 a64)
{
    return std::isnormal(a64.x32) && std::isnormal(a64.y32);
}
__INLINE__ static bool isfinite(const FLOAT2 a64)
{
    std::isfinite(a64.x32) && std::isfinite(a64.y32);
}
__INLINE__ static bool iszero(const FLOAT2 a64)
{
    return a64.x32==0.0f && a64.y32==0.0f;
}

/********************************************************
 *                   计算函数
 * *******************************************************/
//==
__INLINE__ static bool EQ64(const FLOAT2 a64, const FLOAT2 b64)
{
    return a64.x32==b64.x32 && a64.y32==b64.y32;
}
//> 
__INLINE__ static bool GT64(const FLOAT2 a64, const FLOAT2 b64)
{
    return a64.x32>b64.x32 || (a64.x32==b64.x32 && a64.y32>b64.y32);
}
//>=
__INLINE__ static bool GE64(const FLOAT2 a64, const FLOAT2 b64)
{
    return a64.x32>=b64.x32 || (a64.x32==b64.x32 && a64.y32>=b64.y32);
}
//<
__INLINE__ static bool LT64(const FLOAT2 a64, const FLOAT2 b64)
{
    return !GE64(a64,b64);
}
//<=
__INLINE__ static bool LE64(const FLOAT2 a64, const FLOAT2 b64)
{
    return !GT64(a64,b64);
}

//-
__INLINE__ static FLOAT2 NEG64(const FLOAT2 a64)
{
    return FLOAT2{-a64.x32,-a64.y32};
}

/************************************************************
 *                    Addition
 * ***********************************************************/

//将a32+b32规格化成FLOAT2
__INLINE__ static FLOAT2 QuickTwoSum(const float a32, const float b32)
{
    // assumes a32 > b32
    float s32, e32 ;

    s32 = a32 + b32 ;
    e32 = b32 - (s32 - a32) ;                   // This is NOT always zero!
    return  FLOAT2{s32, e32} ;
}

__INLINE__ static FLOAT2 TwoSum(const float a32, const float b32)
{
    float s32, v32, e32 ;

    s32 = a32 + b32 ;
    v32 = s32 - a32 ;                           // This is NOT the same as b32!
    e32 = (a32 - (s32 - v32)) + (b32 - v32) ;   // This is NOT always zero!
    return FLOAT2{s32, e32} ;
}

__INLINE__ static FLOAT2 Add64(const FLOAT2 a64, const FLOAT2 b64)
{
    FLOAT2 s64, t64 ;

    s64 = TwoSum(a64.x32, b64.x32) ;
    t64 = TwoSum(a64.y32, b64.y32) ;
    s64.y32 += t64.x32 ;
    s64 = QuickTwoSum(s64.x32, s64.y32) ;
    s64.y32 += t64.y32 ;
    return QuickTwoSum(s64.x32, s64.y32) ;
}

__INLINE__ static FLOAT2 Add64(const FLOAT2 a64, const float b32)
{
    FLOAT2 s64;
    s64 = TwoSum(a64.x32, b32) ;
    s64.y32 += a64.y32 ;
    return QuickTwoSum(s64.x32, s64.y32) ;
}

__INLINE__ static FLOAT2 Add64(const float a32, const FLOAT2 b64)
{
    return Add64(b64,a32);
}

/*********************************************************
 *                   Subtraction
 * *******************************************************/
__INLINE__ static FLOAT2 Sub64(const FLOAT2 a64,const FLOAT2 b64)
{
    return Add64(a64,NEG64(b64));
}

__INLINE__ static FLOAT2 Sub64(const FLOAT2 a64,const float b32)
{
    return Add64(a64, -b32);
}

__INLINE__ static FLOAT2 Sub64(const float a32, const FLOAT2 b64)
{
    return NEG64(Sub64(b64,a32));
}

/*****************************************************************
 *                    Multiplication
 * ****************************************************************/

__INLINE__ static FLOAT2 Split(const float a32)
{
    float p32,ahi32,alo32;
    p32=4097.0f*a32;
    ahi32=p32-(p32-a32);
    alo32=a32-ahi32;

    return FLOAT2(ahi32,alo32);
}

__INLINE__ static FLOAT2 TwoProd(const float a32, const float b32)
{
    FLOAT2 a64, b64 ;
    float p32, e32 ;

    p32 = a32 * b32 ;
    a64 = Split(a32) ;
    b64 = Split(b32) ;
    e32 = ((a64.x32*b64.x32 - p32) + a64.x32*b64.y32 + a64.y32*b64.x32) + a64.y32*b64.y32 ;
    return FLOAT2{p32, e32} ;
}

__INLINE__ static FLOAT2 Mul64(const FLOAT2 a64, const FLOAT2 b64)
{
    FLOAT2 p64 ;

    p64 = TwoProd(a64.x32, b64.x32) ;
    p64.y32 += a64.x32 * b64.y32 ;
    p64.y32 += a64.y32 * b64.x32 ;
    return QuickTwoSum(p64.x32, p64.y32) ;
}

__INLINE__ static FLOAT2 Mul64(const FLOAT2 a64, const float b32)
{
    FLOAT2 p64 ;

    p64 = TwoProd(a64.x32, b32) ;
    p64.y32 += a64.y32 * b32 ;
    return QuickTwoSum(p64.x32, p64.y32) ;
}

__INLINE__ static FLOAT2 Mul64(const float a32, const FLOAT2 b64)
{
    return Mul64(b64,a32);
}

/*****************************************************************
 *                    Division
 * ****************************************************************/

//牛顿迭代法,迭代一次就OK
__INLINE__ static FLOAT2 Div64(const FLOAT2 a64, const FLOAT2 b64)
{
    //check divide by zero
    //ASSERT(!iszero(b64))

    FLOAT2 q64, e64 ;
    float i32, d32 ;

    i32 = 1.0f / b64.x32 ;

    float q32=a64.x32*i32;
    d32=Sub64(a64,Mul64(b64,q32)).x32;
    e64=TwoProd(i32,d32);
    return Add64(q32,e64);
}

__INLINE__ static FLOAT2 Div64(const FLOAT2 a64, const float b32)
{
    //check divide by zero
    //ASSERT(b32!=0.0f)

    FLOAT2 q64, e64 ;
    float i32, d32 ;

    i32 = 1.0f / b32 ;

    float q32=a64.x32*i32;
    d32=Sub64(a64,TwoProd(b32,q32)).x32;
    e64=TwoProd(i32,d32);
    return Add64(q32,e64);
}

__INLINE__ static FLOAT2 Div64(const float a32, const FLOAT2 b64)
{
    //check divide by zero
    //ASSERT(!iszero(b64))
    
    FLOAT2 t64(a32);
    return Div64(t64,b64);
}

/*****************************************************************
 *                    special functions
 * ****************************************************************/
 //TODO: 
 //ln,log10
 //sin,cos,sincos
 //exp,exp1m
 //polyval

//计算倒数
__INLINE__ static FLOAT2 Recip64(const FLOAT2 a64)
{
    FLOAT2 e64 ;
    float i32, d32 ;
    
    //近似值
    i32 = 1.0f / a64.x32 ;
    
     //一次牛顿迭代法: 求零点 f(x)=x^{-1}-a
    //迭代: x=x*(2-a*x)
    return Mul64(i32,Sub64(2.0f,Mul64(a64,i32)));
}

//计算1/sqrt(a)
//x=1/sqrt(a)=> f(x)=x^{-2}-a 求零点
//迭代: x=x*(1.5-0.5*a*x*x)
//https://math.stackexchange.com/questions/607794/newton-raphson-for-reciprocal-square-root
 __INLINE__ static FLOAT2 Rsqrt64(const FLOAT2 a64)
 {
     //近似值
     float i32=rsqrt(a64.x32);

     //一次牛顿迭代
     //迭代: x=x*(1.5-0.5*a*x*x)=0.5*x*(3-a*x*x)
     FLOAT2 t64=Mul64(i32,Mul64(i32,a64));
     return Mul64(0.5f*i32, Sub64(3.0f, t64));
 }

 __INLINE__ static FLOAT2 Sqrt64(const FLOAT2 a64)
 {
     return Recip64(Rsqrt64(a64));
 }

}; //struct FLOAT2


测试程序(test_float2.cu)
#include "float2.h"
#include <cstdio>
__INLINE__  static void test_float2()
{
   const double sqrt_2=1.4142135623730950488016887242097;
   const double sqrt_3=1.7320508075688772935274463415059;
   const double sqrt_2_plus_sqrt_3=3.146264369941972560695830907207;
   const float z=1.5f; //3/2

   printf("sqrt_2=%18.16g, sqrt_3=%18.16g\n",sqrt_2,sqrt_3);

   FLOAT2 a64(sqrt_2),b64(sqrt_3);

   auto x64=FLOAT2::Add64(a64,b64);

   double y=x64;

   printf("y=%18.16g, %18.16g\n",y,sqrt_2+sqrt_3);

   printf("%18.16g\n",double(FLOAT2::Sub64(z,FLOAT2(sqrt_2))));

   x64=FLOAT2::Div64(a64,b64);
   y=x64;
   printf("y=%18.16g\n",y);

   x64=FLOAT2::Recip64(FLOAT2(sqrt_2));
   y=x64;
   printf("y=%18.16g\n",y);
   
   x64=FLOAT2::Rsqrt64(FLOAT2(2.0f));
   y=x64;
   printf("y=%18.16g\n",y);

}

__global__ void kernel_float2()
{
    test_float2();
}

void 
test_driver()
{
   //==========================================================
   printf("----------->On CPU:\n");
   test_float2();
   
   //============================================================
   printf("------------>On GPU: \n");
   kernel_float2<<<1,1>>>();
}

int main()
{
    test_driver();
    return(0);
}

//编译: nvcc -O3 -use_fast_math test_float2.cu

/***********************************************************
matlab:

vpa(sqrt(2)) #1.4142135623730950488016887242097
vpa(sqrt(3)) #1.7320508075688772935274463415059
vpa(sqrt(2)+sqrt(3)) #3.146264369941972560695830907207
vpa(3/2-sqrt(2)) #0.085786437626904854525378141261172
vpa(sqrt(2/3)) #0.81649658092772603273242802490196 

**************************************************************/


网络资源
float-float模拟double运算
ieee float, fp16,bfp16,tfp32

float16


bfloat16


混合精度计算
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值