用快速幂算法计算指数函数的自变量为正整数时的函数值是非常愚蠢的做法,毫无意义。与标准库的算法和本文将要介绍的查表法相比,快速幂不仅缓慢,而且精度很差。快速幂算法绝大多数情况下都是用于整型变量(不会有乘法累积误差)的模幂运算,少数情况下才会用于浮点运算(比如计算指数为正整数的幂函数),拿快速幂来算指数函数就好比把高铁开到了公路上——不匹配。
下面以以e为底的指数函数为例,介绍一种优化的查表法。如果自变量x超过1024*ln(2)=709.7827,就会导致exp(x)超过双精度浮点数所能表达的最大值1.79769*10^308. 正是因为自变量允许的范围很小,哪怕直接使用含709个双精度浮点数的数组,占用空间也不算很大。如何减少数组元素的个数呢?比如可以用一个含71个元素的数组a保存exp(0), exp(10), exp(20), exp(30),……,exp(700),再用一个含10个元素的数组b保存exp(0), exp(1), exp(2), exp(3),……,exp(9),那么1~709之间的任意整数的函数值都可以通过从a,b两个数组中各选出一个元素做乘法得到(后面就会知道,保存exp(0)会给编程带来极大的方便)。这样做就能让数组大小从709减小到81,还能不能继续减小呢?当然可以!假设数组b保存exp(0),exp(1),exp(2),exp(3),……,exp(n-1),数组a保存exp(0), exp(n), exp(2n), exp(3n),……,exp(709/n* n),那么数组元素总数
n+709/n+1>=2sqrt(n*709/n)+1=2sqrt(709)+1≈2*27+1=55(基本不等式)
因为27*27=729>709,所以只需要54个元素,不需要55个,即数组a存exp(0), exp(27), exp(54), exp(81),...,exp(702),数组b存exp(0), exp(1), exp(2), exp(3),...exp(26). 自变量x对27分别求商和余数(相当于求出它在27进制下的表示),再加两次数组访问和一次浮点乘法即可算出exp(x),速度和精度都远超快速幂。
C语言测试代码如下:
#include<stdio.h>
#include<math.h>
#include<time.h>
// #include<stdlib.h>
//exp(0),exp(27),exp(54),exp(81),...exp(702)
constexpr double exp0to702Array[27] = {
1.0,
5.3204824060179862e+11, 2.8307533032746939e+23, 1.5060973145850305e+35,
8.0131642640005911e+46, 4.2633899483147210e+58, 2.2683291210002405e+70,
1.2068605179340023e+82, 6.4210801521856136e+93, 3.4163243977334850e+105,
1.8176493851391000e+117, 9.6707715739419918e+128, 5.1453170011777236e+140,
2.7375568578151304e+152, 1.4565123097479283e+164, 7.7493481181624719e+175,
4.1230270320792022e+187, 2.1936492783713950e+199, 1.1671272390549059e+211,
6.2096799409759750e+222, 3.3038492872965482e+234, 1.7578072005196348e+246,
9.3523822835364470e+257, 4.9759185393909983e+269, 2.6474287042608522e+281,
1.4085597842206859e+293, 7.4942175497706501e+304
};
// exp(0),exp(1),exp(2),exp(3),...exp(26)
constexpr double exp0to26Array[27] = {
1.0,
2.7182818284590452e+00, 7.3890560989306502e+00, 2.0085536923187668e+01,
5.4598150033144239e+01, 1.4841315910257660e+02, 4.0342879349273512e+02,
1.0966331584284586e+03, 2.9809579870417283e+03, 8.1030839275753840e+03,
2.2026465794806717e+04, 5.9874141715197818e+04, 1.6275479141900392e+05,
4.4241339200892050e+05, 1.2026042841647768e+06, 3.2690173724721106e+06,
8.8861105205078726e+06, 2.4154952753575298e+07, 6.5659969137330511e+07,
1.7848230096318726e+08, 4.8516519540979028e+08, 1.3188157344832147e+09,
3.5849128461315916e+09, 9.7448034462489026e+09, 2.6489122129843472e+10,
7.2004899337385873e+10, 1.9572960942883876e+11
};
inline double exp_54Table(unsigned int x) {
//div函数需要#include<stdlib.h>,而且比下面的写法慢得多
//div_t result = div(x, 27);
//return exp0to702Array[result.quot] * exp0to26Array[result.rem];
unsigned int q = x / 27;
unsigned int r = x - q * 27; //比 r = x % 27 更快
return exp0to702Array[q] * exp0to26Array[r];
}
inline double exp_fastpower(unsigned short x) {
double e = 2.718281828459045;
double result = 1.0;
while (x > 0) {
if (x & 1) {
result *= e;
}
e *= e;
x >>= 1;
}
return result;
}
int main() {
printf("double,精度测试\n");
for (unsigned int i = 2; i < 80; i += 3) {
printf("exp_fastpower(%d)=%18.16lf\n exp_54Table(%d)=%18.16lf\n math.h exp(%d)=%18.16lf\n-------\n",
i, exp_fastpower(i), i, exp_54Table(i), i, exp(i));
}
printf("速度测试,编译器优化设为/O2 \n");
clock_t start;
double sum, tmp;
int N = 4e8;
/
start = clock();
sum = 0.0;
for (unsigned int i = 1; i < N; i++) {
tmp = exp_fastpower(i % 710);
sum += tmp / (1.0 + tmp);
}
printf("sum=%lf, fastpower_Time: %fs\n", sum, (double)(clock() - start) / CLOCKS_PER_SEC);
/
start = clock();
sum = 0.0;
for (unsigned int i = 1; i < N; i++) {
tmp = exp_54Table(i % 710);
sum += tmp / (1.0 + tmp);
}
printf("sum=%lf, 54Table_Time: %fs\n", sum, (double)(clock() - start) / CLOCKS_PER_SEC);
start = clock();
sum = 0.0;
for (unsigned int i = 1; i < N; i++) {
tmp = exp(i % 710);
sum += tmp / (1.0 + tmp);
}
printf("sum=%lf, math.h exp_Time: %fs\n", sum, (double)(clock() - start) / CLOCKS_PER_SEC);
return 0;
}
执行结果如下:
double,精度测试
exp_fastpower(2)=7.3890560989306495
exp_54Table(2)=7.3890560989306504
math.h exp(2)=7.3890560989306504
-------
exp_fastpower(5)=148.4131591025765715
exp_54Table(5)=148.4131591025765999
math.h exp(5)=148.4131591025765999
-------
exp_fastpower(8)=2980.9579870417273924
exp_54Table(8)=2980.9579870417283018
math.h exp(8)=2980.9579870417283018
-------
exp_fastpower(11)=59874.1417151977875619
exp_54Table(11)=59874.1417151978166657
math.h exp(11)=59874.1417151978166657
-------
exp_fastpower(14)=1202604.2841647760942578
exp_54Table(14)=1202604.2841647767927498
math.h exp(14)=1202604.2841647767927498
-------
exp_fastpower(17)=24154952.7535752803087234
exp_54Table(17)=24154952.7535752989351749
math.h exp(17)=24154952.7535752989351749
-------
exp_fastpower(20)=485165195.4097898602485657
exp_54Table(20)=485165195.4097902774810791
math.h exp(20)=485165195.4097902774810791
-------
exp_fastpower(23)=9744803446.2488937377929688
exp_54Table(23)=9744803446.2489032745361328
math.h exp(23)=9744803446.2489032745361328
-------
exp_fastpower(26)=195729609428.8385314941406250
exp_54Table(26)=195729609428.8387451171875000
math.h exp(26)=195729609428.8387756347656250
-------
exp_fastpower(29)=3931334297144.0371093750000000
exp_54Table(29)=3931334297144.0424804687500000
math.h exp(29)=3931334297144.0419921875000000
-------
exp_fastpower(32)=78962960182680.5937500000000000
exp_54Table(32)=78962960182680.7031250000000000
math.h exp(32)=78962960182680.6875000000000000
-------
exp_fastpower(35)=1586013452313428.5000000000000000
exp_54Table(35)=1586013452313430.7500000000000000
math.h exp(35)=1586013452313430.7500000000000000
-------
exp_fastpower(38)=31855931757113704.0000000000000000
exp_54Table(38)=31855931757113756.0000000000000000
math.h exp(38)=31855931757113756.0000000000000000
-------
exp_fastpower(41)=639843493530053888.0000000000000000
exp_54Table(41)=639843493530055040.0000000000000000
math.h exp(41)=639843493530054912.0000000000000000
-------
exp_fastpower(44)=12851600114359285760.0000000000000000
exp_54Table(44)=12851600114359310336.0000000000000000
math.h exp(44)=12851600114359308288.0000000000000000
-------
exp_fastpower(47)=258131288619006263296.0000000000000000
exp_54Table(47)=258131288619006754816.0000000000000000
math.h exp(47)=258131288619006754816.0000000000000000
-------
exp_fastpower(50)=5184705528587061559296.0000000000000000
exp_54Table(50)=5184705528587073093632.0000000000000000
math.h exp(50)=5184705528587072045056.0000000000000000
-------
exp_fastpower(53)=104137594330290650611712.0000000000000000
exp_54Table(53)=104137594330290868715520.0000000000000000
math.h exp(53)=104137594330290885492736.0000000000000000
-------
exp_fastpower(56)=2091659496012991271272448.0000000000000000
exp_54Table(56)=2091659496012996371546112.0000000000000000
math.h exp(56)=2091659496012996103110656.0000000000000000
-------
exp_fastpower(59)=42012104037905041232756736.0000000000000000
exp_54Table(59)=42012104037905144311971840.0000000000000000
math.h exp(59)=42012104037905144311971840.0000000000000000
-------
exp_fastpower(62)=843835666874143321603702784.0000000000000000
exp_54Table(62)=843835666874145520626958336.0000000000000000
math.h exp(62)=843835666874145383188004864.0000000000000000
-------
exp_fastpower(65)=16948892444103293822893555712.0000000000000000
exp_54Table(65)=16948892444103337803358666752.0000000000000000
math.h exp(65)=16948892444103337803358666752.0000000000000000
-------
exp_fastpower(68)=340427604993173161033062154240.0000000000000000
exp_54Table(68)=340427604993174075826736463872.0000000000000000
math.h exp(68)=340427604993174075826736463872.0000000000000000
-------
exp_fastpower(71)=6837671229762725006725617811456.0000000000000000
exp_54Table(71)=6837671229762744147024034136064.0000000000000000
math.h exp(71)=6837671229762744147024034136064.0000000000000000
-------
exp_fastpower(74)=137338297954017214457801288450048.0000000000000000
exp_54Table(74)=137338297954017628788967006535680.0000000000000000
math.h exp(74)=137338297954017610774568497053696.0000000000000000
-------
exp_fastpower(77)=2758513454523161674278391871700992.0000000000000000
exp_54Table(77)=2758513454523170321189676423053312.0000000000000000
math.h exp(77)=2758513454523170321189676423053312.0000000000000000
-------
速度测试,编译器优化设为/O2
sum=399456808.110091, fastpower_Time: 5.583000s
sum=399456808.110091, 54Table_Time: 0.641000s
sum=399456808.110091, math.h exp_Time: 2.040000s
上面的函数exp_54Table()的汇编代码如下:
1400010A0: 44 8B C1 mov r8d, ecx
1400010A3: B8 DB 4B 68 2F mov eax, 2F684BDBh
1400010A8: F7 E1 mul ecx
1400010AA: 44 2B C2 sub r8d, edx
1400010AD: 41 D1 E8 shr r8d, 1
1400010B0: 44 03 C2 add r8d, edx
1400010B3: 48 8D 15 46 EF FF FF lea rdx, cs:140000000h
1400010BA: 41 C1 E8 04 shr r8d, 4
1400010BE: 41 6B C0 1B imul eax, r8d, 1Bh
1400010C2: 2B C8 sub ecx, eax
1400010C4: F2 0F 10 84 CA 70 33 00 00 movsd xmm0, ds:rva qword_140003370[rdx+rcx*8]
1400010CD: F2 42 0F 59 84 C2 50 34 00 00 mulsd xmm0, ds:rva qword_140003450[rdx+r8*8]
1400010D7: C3 retn
函数体大小为:0xD7-0xA0+0x01=0x38=56 Byte. 第二行出现的0x2F684BDB=795364315=ceil(2^37/27)-2^32,指数37=32+ceil(log_2(27)). 原理参见 无符号整数快速除法优化 。同样的原理,对27求商还有更快的乘数和移位量,此处可以用14作为右移量(对[1,709]之间的全部正整数适用),2^14=16384,即 floor(x/27)=floor(((16384/27) * x) / 16384), 16384/27=606.814,向上取整变成607(0x025F). 如果右移量小于14,那么在某些情况下算出的商不对,感兴趣的可以尝试一下。把函数体改成如下写法后,汇编代码变得极其简短(Godbolt Compiler Explorer,这个网站可以在线查看汇编代码,编译器可以选择x64 MSVC v19,优化参数设为/O2)。
double exp_54Table(unsigned int x) {
unsigned int q = (607 * x) >> 14;
unsigned int r = x - q * 27;
return exp0to702Array[q] * exp0to26Array[r];
}
汇编代码:
1400010A0: 69 D1 5F 02 00 00 imul edx, ecx, 607
1400010A6: 4C 8D 05 53 EF FF FF lea r8, cs:140000000h
1400010AD: C1 EA 0E shr edx, 14
1400010B0: 6B C2 1B imul eax, edx, 27
1400010B3: 2B C8 sub ecx, eax
1400010B5: F2 41 0F 10 84 C8 70 33 00 00 movsd xmm0, ds:rva qword_140003370[r8+rcx*8]
1400010BF: F2 41 0F 59 84 D0 50 34 00 00 mulsd xmm0, ds:rva qword_140003450[r8+rdx*8]
1400010C9: C3 retn
函数体的大小只有0xC9-0xA0+0x01=0x2A=42 Byte,加上54个双精度浮点数,总共42+54*8=474 Byte,比直接存709个浮点数(709*8=5672 Byte)小多了。
将 q = x / 27 改成 q = (607 * x) >> 14 后,运行时间从0.64s减少到0.47s. 查表法的速度是标准库的5倍,是快速幂的12倍。查表法的精度自然不输标准库,比快速幂要好得多。
虽然快速幂的函数体也很小(40 Byte,如下),但速度和精度的劣势使其对于指数函数而言几乎没有实用价值。
1400010D0: F2 0F 10 0D 58 24 00 00 movsd xmm1, cs:qword_140003530
1400010D8: F2 0F 10 05 48 24 00 00 movsd xmm0, cs:qword_140003528
1400010E0: 66 85 C9 test cx, cx
1400010E3: 74 12 jz short locret_1400010F7
1400010E5: F6 C1 01 test cl, 1
1400010E8: 74 04 jz short loc_1400010EE
1400010EA: F2 0F 59 C1 mulsd xmm0, xmm1
1400010EE: 66 D1 E9 shr cx, 1
1400010F1: F2 0F 59 C9 mulsd xmm1, xmm1
1400010F5: 75 EE jnz short loc_1400010E5
1400010F7: C3 retn
王希:没有math.h我们能干啥?,这篇文章写的计算exp的方法,就是把自变量拆成整数部分和小数部分,整数部分使用快速幂计算,这就完全是缺少编程经验,误导人了。
在一些低性能单片机上,没有浮点运算单元,导致浮点乘法的耗时很长,也不支持双精度浮点数。比如Arduino系列单片机没有浮点运算单元,除了Due支持双精度浮点数以外,其它型号都只支持单精度浮点数,那么exp(x)最大允许整数指数为88(128ln2=88.72……),只需要保存19个单精度浮点数即可实现查表,大小为19*4=76 Byte,可以说很小了。在Arduino Nano(ATMEL M328P)上,查表法的速度是快速幂算法的2倍,是标准库的4倍。
Arduino示例代码如下:
//exp(0),exp(10),exp(20),exp(30),...exp(80)
float exp0to80Array[9] = {
1.0f,
22026.466f, 485165200.0f,
1.0686475e+13f, 2.3538527e+17f,
5.1847055e+21f, 1.1420074e+26f,
2.5154387e+30f, 5.5406224e+34f
};
// exp(0),exp(1),exp(2),exp(3),...exp(9)
float exp0to9Array[10] = {
1.0f,
2.7182818f, 7.3890561f,
20.085537f, 54.598150f,
148.41316f, 403.42879f,
1096.6332f, 2980.9580f,
8103.0839f
};
inline float exp_19Table(unsigned short x) { // 0<=x<=88
unsigned short q = (103 * x) >> 10; // 2^10=1024, 1024/10=102.4,向上取整为103
unsigned short r = x - q * 10; // 比 q = x / 10; r = x % 10 更快
return exp0to80Array[q] * exp0to9Array[r];
}
inline float exp_fastpower(unsigned short x) {
float e = 2.7182818f;
float result = 1.0;
while (x > 0) {
if (x & 1) {
result *= e;
}
e *= e;
x >>= 1;
}
return result;
}
void setup() {
Serial.begin(9600);
}
void loop() {
float start;
float sum, s, threshold = 16000.0f;
unsigned int N = 5e4, M = 60, t;
/
start = millis();
sum = 0.0f;
for (unsigned int i = 1; i < N; i++) {
t = i % M;
s = exp_fastpower(t) / i;
if (s < threshold) {
sum += s;
}
}
start = (millis() - start) / 1000.0f;
Serial.print("sum = " + String(sum, 4) + ",\t fastpower_Time:" + String(start, 3) + "s\n");
/
start = millis();
sum = 0.0f;
for (unsigned int i = 1; i < N; i++) {
t = i % M;
s = exp_19Table(t) / i;
if (s < threshold) {
sum += s;
}
}
start = (millis() - start) / 1000.0f;
Serial.print("sum = " + String(sum, 4) + ",\t 19Table_Time:" + String(start, 3) + "s\n");
start = millis();
sum = 0.0f;
for (unsigned int i = 1; i < N; i++) {
t = i % M;
s = expf(t) / i;
if (s < threshold) {
sum += s;
}
}
start = (millis() - start) / 1000.0f;
Serial.print("sum = " + String(sum, 4) + ",\t math.h exp_Time:" + String(start, 3) + "s\n");
}
结果如下:
sum = 13839906.0000, fastpower_Time:6.221s
sum = 13839915.0000, 19Table_Time:3.201s
sum = 13839903.0000, math.h exp_Time:11.646s
sum = 13839906.0000, fastpower_Time:6.222s
sum = 13839915.0000, 19Table_Time:3.200s
sum = 13839903.0000, math.h exp_Time:11.645s
sum = 13839906.0000, fastpower_Time:6.222s
sum = 13839915.0000, 19Table_Time:3.200s
sum = 13839903.0000, math.h exp_Time:11.645s
sum = 13839906.0000, fastpower_Time:6.223s
sum = 13839915.0000, 19Table_Time:3.200s
sum = 13839903.0000, math.h exp_Time:11.646s