本文展示了如何将一个使用蒙特卡洛方法计算圆周率的C语言程序加速一百倍
平台
Ubuntu 14.04 LTS, gcc
原始版本
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
int i;
int simulate_total = 1000000000;
double radius = (double) RAND_MAX;
double radius_square = radius * radius;
double x, y;
int inside_count = 0;
srand(time(NULL));
for (i = 0; i < simulate_total;i++)
{
x = (double) rand();
y = (double) rand();
if (x * x + y * y < radius_square)
inside_count += 1;
}
printf("%f\n", 4.0 * inside_count / simulate_total);
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("This computation took about %.5f seconds\n",
((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
exit(0);
}
10亿个随机数,用时40s
faster random generator
用kcachegrind进行profile,发现rand是瓶颈,从网上找了一个快速rand的实现:
Fast Random Number Generator on the Intel® Pentium® 4 Processor
#include<stdio.h>
#include<stdlib.h>
#include <time.h>
unsigned int g_seed;
//Used to seed the generator.
inline void fast_srand(int seed)
{
g_seed = seed;
}
//fastrand routine returns one integer, similar output value range as C lib.
inline int fastrand()
{
g_seed = (214013*g_seed+2531011);
return (g_seed>>16)&0x7FFF;
}
int main()
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
int i;
int simulate_total = 1000000000;
int radius = 0x7FFF;
int radius_square = radius * radius;
int x, y;
int inside_count = 0;
fast_srand((int) time(NULL));
for (i = 0; i < simulate_total;i++)
{
x = fastrand();
y = fastrand();
if (x * x + y * y < radius_square)
inside_count += 1;
}
printf("%f\n", 4.0 * inside_count / simulate_total);
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("This computation took about %.5f seconds\n",
((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
exit(0);
}
这个版本用时4.2s
AVX
在上一步的基础上使用AVX
#include<stdio.h>
#include<stdlib.h>
#include <time.h>
typedef int v4si __attribute__ ((vector_size (16)));
int main()
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
v4si g_seed = {(int)time(NULL),(int)time(NULL),(int)time(NULL),(int)time(NULL)};
v4si a = {214013,214013,214013,214013};
v4si b = {2531011,2531011,2531011,2531011};
v4si c = {16,16,16,16};
v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
int i;
int simulate_total = 1000000000;
int radius_s = 0x7FFF * 0x7FFF;
v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
v4si x, y;
v4si inside_count = {0,0,0,0};
int count = 0;
for (i = 0; i < simulate_total / 4;i++)
{
g_seed = (a * g_seed+b);
x = (g_seed >> c) & d;
g_seed = (a * g_seed+b);
y = (g_seed >> c) & d;
inside_count = inside_count + (x * x + y * y < radius_square);
}
for (i = 0; i < 4; i++)
count += inside_count[i];
printf("%f\n", -4.0 * count / simulate_total);
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
exit(0);
}
这个版本用时1.4s
查看汇编:
.L3:
vpmulld xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm5
vpsrad xmm2, xmm0, 16
vpmulld xmm0, xmm0, xmm6
vpand xmm2, xmm2, xmm4
vpaddd xmm0, xmm0, xmm5
vpsrad xmm1, xmm0, 16
vpand xmm1, xmm1, xmm4
sub eax, 1
vpmulld xmm2, xmm2, xmm2
vpmulld xmm1, xmm1, xmm1
vpaddd xmm1, xmm2, xmm1
vpcmpgtd xmm1, xmm7, xmm1
vpaddd xmm3, xmm3, xmm1
jne .L3
循环中的运算都在寄存器进行,没有内存读取,用不上cache优化。
循环中所有的g_seed = (a * g_seed+b);
形成了critical path,所以我们使用loop unrolling, 增加并行度,减少循环次数,也就是减少critical path的运行时间。
loop unrolling
#include<stdio.h>
#include<stdlib.h>
#include <time.h>
#define UNROLL 4
typedef int v4si __attribute__ ((vector_size (16)));
int main()
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
int t1 = (int) time(NULL);
v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
int t2 = (int) time(NULL);
v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
int t3 = (int) time(NULL);
v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
int t4 = (int) time(NULL);
v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
v4si a = {214013,214013,214013,214013};
v4si b = {2531011,2531011,2531011,2531011};
v4si c = {16,16,16,16};
v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
int i;
int simulate_total = 1000000000;
int radius_s = 0x7FFF * 0x7FFF;
v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
v4si x1, y1, x2, y2, x3, y3, x4, y4;
v4si inside_count1 = {0,0,0,0};
v4si inside_count2 = {0,0,0,0};
v4si inside_count3 = {0,0,0,0};
v4si inside_count4 = {0,0,0,0};
int count = 0;
for (i = 0; i < simulate_total / (UNROLL * 4);i++)
{
g_seed1 = (a * g_seed1+b);
x1 = (g_seed1 >> c) & d;
g_seed1 = (a * g_seed1+b);
y1 = (g_seed1 >> c) & d;
inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
g_seed2 = (a * g_seed2+b);
x2 = (g_seed2 >> c) & d;
g_seed2 = (a * g_seed2+b);
y2 = (g_seed2 >> c) & d;
inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
g_seed3 = (a * g_seed3+b);
x3 = (g_seed3 >> c) & d;
g_seed3 = (a * g_seed3+b);
y3 = (g_seed3 >> c) & d;
inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
g_seed4 = (a * g_seed4+b);
x4 = (g_seed4 >> c) & d;
g_seed4 = (a * g_seed4+b);
y4 = (g_seed4 >> c) & d;
inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
}
for (i = 0; i < 4; i++)
count += inside_count1[i] + inside_count2[i] + inside_count3[i] + inside_count4[i];
printf("%f\n", -4.0 * count / simulate_total);
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
exit(0);
}
这个版本用时0.7s
多线程
在上面的基础上使用多线程
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#define UNROLL 4
#define NUM_THREADS 4
int simulate_total = 1000000000;
typedef int v4si __attribute__ ((vector_size (16)));
v4si inside_count[NUM_THREADS] = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
void *thread(void *argp)
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
int index = (int) argp;
int t1 = (int) time(NULL);
v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
int t2 = (int) time(NULL);
v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
int t3 = (int) time(NULL);
v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
int t4 = (int) time(NULL);
v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
v4si a = {214013,214013,214013,214013};
v4si b = {2531011,2531011,2531011,2531011};
v4si c = {16,16,16,16};
v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
int i;
int radius_s = 0x7FFF * 0x7FFF;
v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
v4si x1, y1, x2, y2, x3, y3, x4, y4;
v4si inside_count1 = {0,0,0,0};
v4si inside_count2 = {0,0,0,0};
v4si inside_count3 = {0,0,0,0};
v4si inside_count4 = {0,0,0,0};
for (i = 0; i < simulate_total / (UNROLL * 4 * NUM_THREADS);i++)
{
g_seed1 = (a * g_seed1+b);
x1 = (g_seed1 >> c) & d;
g_seed1 = (a * g_seed1+b);
y1 = (g_seed1 >> c) & d;
inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
g_seed2 = (a * g_seed2+b);
x2 = (g_seed2 >> c) & d;
g_seed2 = (a * g_seed2+b);
y2 = (g_seed2 >> c) & d;
inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
g_seed3 = (a * g_seed3+b);
x3 = (g_seed3 >> c) & d;
g_seed3 = (a * g_seed3+b);
y3 = (g_seed3 >> c) & d;
inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
g_seed4 = (a * g_seed4+b);
x4 = (g_seed4 >> c) & d;
g_seed4 = (a * g_seed4+b);
y4 = (g_seed4 >> c) & d;
inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
}
inside_count[index] += inside_count1 + inside_count2 + inside_count3 + inside_count4;
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("thread %d: this computation took about %.5f seconds\n", index, ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
}
int main()
{
struct timespec tstart={0,0}, tend={0,0};
clock_gettime(CLOCK_MONOTONIC, &tstart);
v4si sum = {0, 0, 0, 0};
int count = 0;
pthread_t tid[NUM_THREADS];
int i, j;
for (i = 0; i < NUM_THREADS; i++)
pthread_create(tid + i, NULL, thread, (void *) i);
for (i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
for (i = 0; i < NUM_THREADS; i++)
for (j = 0; j < 4; j++)
count += inside_count[i][j];
printf("%f\n", -4.0 * count / simulate_total);
clock_gettime(CLOCK_MONOTONIC, &tend);
printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
exit(0);
}
这个版本10亿个随机数用时0.340s左右, 比原始版本快了100倍。
总结
原始版本: 40s
faster rng: 4s
faster rng + AVX: 1.4s
faster rng + AVX + unroll: 0.7s
faster rng + AVX + unroll + 多线程: 0.35s
讨论
编码和优化是不同的工作,不要在编码时做自以为是的优化。Premature optimization is the root of all evil。
程序优化的第一步是用profile工具找出瓶颈。
如果想在半小时内完成优化,无脑地使用SIMD和多线程、多进程即可;如果想把程序加速一个数量级以上,需要考虑有没有更快的算法,算法对速度的提升往往超过AVX和多线程加在一起的效果。
对于int类型,AVX只能使用128位的xmm寄存器,只能4路并行,AVX2可以使用256位的ymm寄存器,实现8路并行,可惜AVX2只支持最新型的CPU。RDSEED和RDRAND可由硬件生成随机数,同样只支持最新型的CPU。如果使用AVX2、RDSEED和RDRAND, 运行时间应该可以进一步降低。
在上面多线程的基础上再加上多进程,用时仍是0.340s左右。后来看了4线程时,CPU usage已经达到380%,所以在多线程的基础上再加上多进程并没有提升速度。