float-float
基于Lewis书中代码加工。可以通过使用两个float单精度浮点数模拟CPU和GPU上双精度double类型运算。
在需要混合精度计算逻辑中可以使用,保证特定计算的精度需求。
FLOAT2类型源码(float2.h)
#pragma once
#ifdef __CUDACC__
#define __HOST__DEVICE__ __host__ __device__
#else
#define __HOST__DEVICE__
#endif
#define __INLINE__ __HOST__DEVICE__ inline
struct FLOAT2{
float x32,y32;
__INLINE__
FLOAT2():x32(0.0f),y32(0.0f){}
__INLINE__ explicit
FLOAT2(float x):x32(x),y32(0.0f){}
__INLINE__ explicit
FLOAT2(float x,float y):x32(x),y32(y) {}
/**************************************************************
* 类型转换
* ************************************************************/
//类型转换: FLOAT2->float
__INLINE__
operator float(){return x32;}
//类型转换: FLOAT2->double
__INLINE__
operator double(){return double(x32)+double(y32);}
//double->FLOAT2, 未考虑溢出
__INLINE__ explicit
FLOAT2(double a)
{
x32=a;
y32=a-x32;
}
__INLINE__ static bool isnan(const FLOAT2 a64)
{
return std::isnan(a64.x32) || std::isnan(a64.y32);
}
__INLINE__ static bool isinf(const FLOAT2 a64)
{
return !isnan(a64) &&(std::isinf(a64.x32) || std::isinf(a64.y32));
}
__INLINE__ static bool isnormal(const FLOAT2 a64)
{
return std::isnormal(a64.x32) && std::isnormal(a64.y32);
}
__INLINE__ static bool isfinite(const FLOAT2 a64)
{
std::isfinite(a64.x32) && std::isfinite(a64.y32);
}
__INLINE__ static bool iszero(const FLOAT2 a64)
{
return a64.x32==0.0f && a64.y32==0.0f;
}
/********************************************************
* 计算函数
* *******************************************************/
//==
__INLINE__ static bool EQ64(const FLOAT2 a64, const FLOAT2 b64)
{
return a64.x32==b64.x32 && a64.y32==b64.y32;
}
//>
__INLINE__ static bool GT64(const FLOAT2 a64, const FLOAT2 b64)
{
return a64.x32>b64.x32 || (a64.x32==b64.x32 && a64.y32>b64.y32);
}
//>=
__INLINE__ static bool GE64(const FLOAT2 a64, const FLOAT2 b64)
{
return a64.x32>=b64.x32 || (a64.x32==b64.x32 && a64.y32>=b64.y32);
}
//<
__INLINE__ static bool LT64(const FLOAT2 a64, const FLOAT2 b64)
{
return !GE64(a64,b64);
}
//<=
__INLINE__ static bool LE64(const FLOAT2 a64, const FLOAT2 b64)
{
return !GT64(a64,b64);
}
//-
__INLINE__ static FLOAT2 NEG64(const FLOAT2 a64)
{
return FLOAT2{-a64.x32,-a64.y32};
}
/************************************************************
* Addition
* ***********************************************************/
//将a32+b32规格化成FLOAT2
__INLINE__ static FLOAT2 QuickTwoSum(const float a32, const float b32)
{
// assumes a32 > b32
float s32, e32 ;
s32 = a32 + b32 ;
e32 = b32 - (s32 - a32) ; // This is NOT always zero!
return FLOAT2{s32, e32} ;
}
__INLINE__ static FLOAT2 TwoSum(const float a32, const float b32)
{
float s32, v32, e32 ;
s32 = a32 + b32 ;
v32 = s32 - a32 ; // This is NOT the same as b32!
e32 = (a32 - (s32 - v32)) + (b32 - v32) ; // This is NOT always zero!
return FLOAT2{s32, e32} ;
}
__INLINE__ static FLOAT2 Add64(const FLOAT2 a64, const FLOAT2 b64)
{
FLOAT2 s64, t64 ;
s64 = TwoSum(a64.x32, b64.x32) ;
t64 = TwoSum(a64.y32, b64.y32) ;
s64.y32 += t64.x32 ;
s64 = QuickTwoSum(s64.x32, s64.y32) ;
s64.y32 += t64.y32 ;
return QuickTwoSum(s64.x32, s64.y32) ;
}
__INLINE__ static FLOAT2 Add64(const FLOAT2 a64, const float b32)
{
FLOAT2 s64;
s64 = TwoSum(a64.x32, b32) ;
s64.y32 += a64.y32 ;
return QuickTwoSum(s64.x32, s64.y32) ;
}
__INLINE__ static FLOAT2 Add64(const float a32, const FLOAT2 b64)
{
return Add64(b64,a32);
}
/*********************************************************
* Subtraction
* *******************************************************/
__INLINE__ static FLOAT2 Sub64(const FLOAT2 a64,const FLOAT2 b64)
{
return Add64(a64,NEG64(b64));
}
__INLINE__ static FLOAT2 Sub64(const FLOAT2 a64,const float b32)
{
return Add64(a64, -b32);
}
__INLINE__ static FLOAT2 Sub64(const float a32, const FLOAT2 b64)
{
return NEG64(Sub64(b64,a32));
}
/*****************************************************************
* Multiplication
* ****************************************************************/
__INLINE__ static FLOAT2 Split(const float a32)
{
float p32,ahi32,alo32;
p32=4097.0f*a32;
ahi32=p32-(p32-a32);
alo32=a32-ahi32;
return FLOAT2(ahi32,alo32);
}
__INLINE__ static FLOAT2 TwoProd(const float a32, const float b32)
{
FLOAT2 a64, b64 ;
float p32, e32 ;
p32 = a32 * b32 ;
a64 = Split(a32) ;
b64 = Split(b32) ;
e32 = ((a64.x32*b64.x32 - p32) + a64.x32*b64.y32 + a64.y32*b64.x32) + a64.y32*b64.y32 ;
return FLOAT2{p32, e32} ;
}
__INLINE__ static FLOAT2 Mul64(const FLOAT2 a64, const FLOAT2 b64)
{
FLOAT2 p64 ;
p64 = TwoProd(a64.x32, b64.x32) ;
p64.y32 += a64.x32 * b64.y32 ;
p64.y32 += a64.y32 * b64.x32 ;
return QuickTwoSum(p64.x32, p64.y32) ;
}
__INLINE__ static FLOAT2 Mul64(const FLOAT2 a64, const float b32)
{
FLOAT2 p64 ;
p64 = TwoProd(a64.x32, b32) ;
p64.y32 += a64.y32 * b32 ;
return QuickTwoSum(p64.x32, p64.y32) ;
}
__INLINE__ static FLOAT2 Mul64(const float a32, const FLOAT2 b64)
{
return Mul64(b64,a32);
}
/*****************************************************************
* Division
* ****************************************************************/
//牛顿迭代法,迭代一次就OK
__INLINE__ static FLOAT2 Div64(const FLOAT2 a64, const FLOAT2 b64)
{
//check divide by zero
//ASSERT(!iszero(b64))
FLOAT2 q64, e64 ;
float i32, d32 ;
i32 = 1.0f / b64.x32 ;
float q32=a64.x32*i32;
d32=Sub64(a64,Mul64(b64,q32)).x32;
e64=TwoProd(i32,d32);
return Add64(q32,e64);
}
__INLINE__ static FLOAT2 Div64(const FLOAT2 a64, const float b32)
{
//check divide by zero
//ASSERT(b32!=0.0f)
FLOAT2 q64, e64 ;
float i32, d32 ;
i32 = 1.0f / b32 ;
float q32=a64.x32*i32;
d32=Sub64(a64,TwoProd(b32,q32)).x32;
e64=TwoProd(i32,d32);
return Add64(q32,e64);
}
__INLINE__ static FLOAT2 Div64(const float a32, const FLOAT2 b64)
{
//check divide by zero
//ASSERT(!iszero(b64))
FLOAT2 t64(a32);
return Div64(t64,b64);
}
/*****************************************************************
* special functions
* ****************************************************************/
//TODO:
//ln,log10
//sin,cos,sincos
//exp,exp1m
//polyval
//计算倒数
__INLINE__ static FLOAT2 Recip64(const FLOAT2 a64)
{
FLOAT2 e64 ;
float i32, d32 ;
//近似值
i32 = 1.0f / a64.x32 ;
//一次牛顿迭代法: 求零点 f(x)=x^{-1}-a
//迭代: x=x*(2-a*x)
return Mul64(i32,Sub64(2.0f,Mul64(a64,i32)));
}
//计算1/sqrt(a)
//x=1/sqrt(a)=> f(x)=x^{-2}-a 求零点
//迭代: x=x*(1.5-0.5*a*x*x)
//https://math.stackexchange.com/questions/607794/newton-raphson-for-reciprocal-square-root
__INLINE__ static FLOAT2 Rsqrt64(const FLOAT2 a64)
{
//近似值
float i32=rsqrt(a64.x32);
//一次牛顿迭代
//迭代: x=x*(1.5-0.5*a*x*x)=0.5*x*(3-a*x*x)
FLOAT2 t64=Mul64(i32,Mul64(i32,a64));
return Mul64(0.5f*i32, Sub64(3.0f, t64));
}
__INLINE__ static FLOAT2 Sqrt64(const FLOAT2 a64)
{
return Recip64(Rsqrt64(a64));
}
}; //struct FLOAT2
测试程序(test_float2.cu)
#include "float2.h"
#include <cstdio>
__INLINE__ static void test_float2()
{
const double sqrt_2=1.4142135623730950488016887242097;
const double sqrt_3=1.7320508075688772935274463415059;
const double sqrt_2_plus_sqrt_3=3.146264369941972560695830907207;
const float z=1.5f; //3/2
printf("sqrt_2=%18.16g, sqrt_3=%18.16g\n",sqrt_2,sqrt_3);
FLOAT2 a64(sqrt_2),b64(sqrt_3);
auto x64=FLOAT2::Add64(a64,b64);
double y=x64;
printf("y=%18.16g, %18.16g\n",y,sqrt_2+sqrt_3);
printf("%18.16g\n",double(FLOAT2::Sub64(z,FLOAT2(sqrt_2))));
x64=FLOAT2::Div64(a64,b64);
y=x64;
printf("y=%18.16g\n",y);
x64=FLOAT2::Recip64(FLOAT2(sqrt_2));
y=x64;
printf("y=%18.16g\n",y);
x64=FLOAT2::Rsqrt64(FLOAT2(2.0f));
y=x64;
printf("y=%18.16g\n",y);
}
__global__ void kernel_float2()
{
test_float2();
}
void
test_driver()
{
//==========================================================
printf("----------->On CPU:\n");
test_float2();
//============================================================
printf("------------>On GPU: \n");
kernel_float2<<<1,1>>>();
}
int main()
{
test_driver();
return(0);
}
//编译: nvcc -O3 -use_fast_math test_float2.cu
/***********************************************************
matlab:
vpa(sqrt(2)) #1.4142135623730950488016887242097
vpa(sqrt(3)) #1.7320508075688772935274463415059
vpa(sqrt(2)+sqrt(3)) #3.146264369941972560695830907207
vpa(3/2-sqrt(2)) #0.085786437626904854525378141261172
vpa(sqrt(2/3)) #0.81649658092772603273242802490196
**************************************************************/
网络资源
float-float模拟double运算
- Emulate “double” using 2 “float”s
- Emulating FP64 with 2 FP32 on a GPU
- float-float
- double-float 实现
- ARM Assembly for Embedded Applications, 5th ed.
ieee float, fp16,bfp16,tfp32
- BFloat16 processing for Neural Networks on Armv8-A
- Using Tensor Cores for Mixed-Precision Scientific Computing