#define CBRT_cbrt(a) cbrt(*(uint64_t*)&a)
double cube[2048];
double cbrt(uint64_t a)
{//最优开立方算法
double x;
unsigned int exp;
if(a==0)return 0.0;
exp=((*(uint64_t*)&a)>>52)&0x7FF;
(*(uint64_t*)&a)&=0x800FFFFFFFFFFFFFLL;
(*(uint64_t*)&x)=((*(int64_t*)&a)>>2)|0x3FF0000000000000LL;
(*(uint64_t*)&a)|=0x3FE0000000000000LL;
(*(uint64_t*)&x)&=0xBFFFFFFFFFFFFFFFLL;//这里可以优化为BTR指令
x=(2.0/3.0)*(x+(*(double*)&a)/(x*x));
x=(2.0/3.0)*(x+(*(double*)&a)/(x*x));
x=(2.0/3.0)*(x+(*(double*)&a)/(x*x));
return x*cube[exp];
}
void getcube()
{
double x=1.0/3.0;
uint64_t exp;
uint64_t t;
for(t=0;t<2048;t++)
{
exp=(t<<52);
POWER(cube[t],exp,x);//幂运算:cube[t]=exp^x.
}
}
这个算法无误,但需注意传入值过小(非规范浮点数)可能会出错。
CPU、FPU硬件目前不支持直接计算开立方,需借助算法实现,经测试,该算法的性能比Intel C++2016版的还强,精度可以保证,但并不完善,未考虑非规范浮点数。另外,附上汇编:
#define CBRT(x,a,c) /* x=cbrt(a),c=2/3 */\
asm(\
"fldl %2 \n\t"\
"movabsq $0x800FFFFFFFFFFFFF, %%rdx \n\t"\
"movabsq $0X3FE0000000000000, %%rax \n\t"\
"andq %%rcx, %%rdx \n\t"\
"orq %%rdx, %%rax \n\t"\
"movq %%rax,(%%rsp) \n\t"\
"fldl (%%rsp) \n\t"\
"fld %%st(0) \n\t"\
"fld %%st(0) \n\t"\
"shrq $52, %%rcx \n\t"\
"andl $0x7FF, %%ecx \n\t"\
"sarq $2, %%rdx \n\t"\
"movabsq $0x3FF0000000000000, %%rax \n\t"\
"orq %%rax, %%rdx \n\t"\
"btrq $62, %%rdx \n\t"\
"movq %%rdx,(%%rsp) \n\t"\
"leaq cube, %%rax \n\t"\
"fldl (%%rsp) \n\t"\
"fld %%st(0) \n\t"\
"fmul %%st(1),%%st(0) \n\t"\
"fdivrp %%st(0),%%st(2) \n\t"\
"faddp %%st(0),%%st(1) \n\t"\
"fmul %%st(3),%%st(0) \n\t"\
"fld %%st(0) \n\t"\
"fmul %%st(1),%%st(0) \n\t"\
"fdivrp %%st(0),%%st(2) \n\t"\
"faddp %%st(0),%%st(1) \n\t"\
"fmul %%st(2),%%st(0) \n\t"\
"fld %%st(0) \n\t"\
"fmul %%st(1),%%st(0) \n\t"\
"fdivrp %%st(0),%%st(2) \n\t"\
"faddp %%st(0),%%st(1) \n\t"\
"fmulp %%st(0),%%st(1) \n\t"\
"fmull (%%rax,%%rcx,8) \n\t"\
"fstpl %0 \n\t"\
:"=m"(x)\
:"c"(a),"m"(c)\
:"rax","rdx"\
)
#define POWER(c,a,b) /* c=a^b */\
asm(\
"fldl %2 \n\t"\
"fldl %1 \n\t"\
"fyl2x \n\t"\
"fld %%st(0) \n\t"\
"frndint \n\t"\
"fld1 \n\t"\
"fscale \n\t"\
"fxch %%st(2) \n\t"\
"fsubp %%st(0),%%st(1) \n\t"\
"f2xm1 \n\t"\
"fmul %%st(1),%%st(0) \n\t"\
"faddp %%st(0),%%st(1) \n\t"\
"fstpl %0 \n\t"\
:"=m"(c):"m"(a),"m"(b))