程序优化技巧——如何将一个程序加速一百倍

本文展示了如何将一个使用蒙特卡洛方法计算圆周率的C语言程序加速一百倍

平台

Ubuntu 14.04 LTS, gcc

原始版本

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main()
{

    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int i;
    int simulate_total = 1000000000;
    double radius = (double) RAND_MAX;
    double radius_square = radius * radius;
    double x, y;
    int inside_count = 0;
    srand(time(NULL));
    for (i = 0; i < simulate_total;i++)
    {
        x = (double) rand();
        y = (double) rand();

        if (x * x + y * y < radius_square)
            inside_count += 1;
    }
    printf("%f\n", 4.0 * inside_count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n",
                     ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - 
                     ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

10亿个随机数,用时40s

faster random generator

用kcachegrind进行profile,发现rand是瓶颈,从网上找了一个快速rand的实现:
Fast Random Number Generator on the Intel® Pentium® 4 Processor

#include<stdio.h>
#include<stdlib.h>
#include <time.h>

unsigned int g_seed;


//Used to seed the generator.

inline void fast_srand(int seed)
{
    g_seed = seed;
}


//fastrand routine returns one integer, similar output value range as C lib.
inline int fastrand()
{
    g_seed = (214013*g_seed+2531011);
    return (g_seed>>16)&0x7FFF;
}

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int i;
    int simulate_total = 1000000000;
    int radius = 0x7FFF;
    int radius_square = radius * radius;
    int x, y;
    int inside_count = 0;

    fast_srand((int) time(NULL));
    for (i = 0; i < simulate_total;i++)
    {
        x = fastrand();
        y = fastrand();

        if (x * x + y * y < radius_square)
            inside_count += 1;
    }
    printf("%f\n", 4.0 * inside_count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n",
                     ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - 
                     ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时4.2s

AVX

在上一步的基础上使用AVX

#include<stdio.h>
#include<stdlib.h>
#include <time.h>

typedef int v4si __attribute__ ((vector_size (16)));

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    v4si g_seed = {(int)time(NULL),(int)time(NULL),(int)time(NULL),(int)time(NULL)};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int simulate_total = 1000000000;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x, y;
    v4si inside_count = {0,0,0,0};
    int count = 0;
    for (i = 0; i < simulate_total / 4;i++)
    {
        g_seed = (a * g_seed+b);
        x = (g_seed >> c) & d;
        g_seed = (a * g_seed+b);
        y = (g_seed >> c) & d;
        inside_count = inside_count + (x * x + y * y < radius_square);
    }
    for (i = 0; i < 4; i++)
        count += inside_count[i];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时1.4s

查看汇编:

.L3:
    vpmulld xmm0, xmm0, xmm6
    vpaddd  xmm0, xmm0, xmm5
    vpsrad  xmm2, xmm0, 16
    vpmulld xmm0, xmm0, xmm6
    vpand   xmm2, xmm2, xmm4
    vpaddd  xmm0, xmm0, xmm5
    vpsrad  xmm1, xmm0, 16
    vpand   xmm1, xmm1, xmm4
    sub eax, 1
    vpmulld xmm2, xmm2, xmm2
    vpmulld xmm1, xmm1, xmm1
    vpaddd  xmm1, xmm2, xmm1
    vpcmpgtd    xmm1, xmm7, xmm1
    vpaddd  xmm3, xmm3, xmm1
    jne .L3

循环中的运算都在寄存器进行,没有内存读取,用不上cache优化。

循环中所有的g_seed = (a * g_seed+b);形成了critical path,所以我们使用loop unrolling, 增加并行度,减少循环次数,也就是减少critical path的运行时间。

loop unrolling

#include<stdio.h>
#include<stdlib.h>
#include <time.h>
#define UNROLL 4
typedef int v4si __attribute__ ((vector_size (16)));

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int t1 = (int) time(NULL);
    v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
    int t2 = (int) time(NULL);
    v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
    int t3 = (int) time(NULL);
    v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
    int t4 = (int) time(NULL);
    v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int simulate_total = 1000000000;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x1, y1, x2, y2, x3, y3, x4, y4;
    v4si inside_count1 = {0,0,0,0};
    v4si inside_count2 = {0,0,0,0};
    v4si inside_count3 = {0,0,0,0};
    v4si inside_count4 = {0,0,0,0};
    int count = 0;
    for (i = 0; i < simulate_total / (UNROLL * 4);i++)
    {
        g_seed1 = (a * g_seed1+b);
        x1 = (g_seed1 >> c) & d;
        g_seed1 = (a * g_seed1+b);
        y1 = (g_seed1 >> c) & d;
        inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
        g_seed2 = (a * g_seed2+b);
        x2 = (g_seed2 >> c) & d;
        g_seed2 = (a * g_seed2+b);
        y2 = (g_seed2 >> c) & d;
        inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
        g_seed3 = (a * g_seed3+b);
        x3 = (g_seed3 >> c) & d;
        g_seed3 = (a * g_seed3+b);
        y3 = (g_seed3 >> c) & d;
        inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
        g_seed4 = (a * g_seed4+b);
        x4 = (g_seed4 >> c) & d;
        g_seed4 = (a * g_seed4+b);
        y4 = (g_seed4 >> c) & d;
        inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
    }
    for (i = 0; i < 4; i++)
        count += inside_count1[i] + inside_count2[i] + inside_count3[i] + inside_count4[i];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时0.7s

多线程

在上面的基础上使用多线程

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>


#define UNROLL 4
#define NUM_THREADS 4


int simulate_total = 1000000000;
typedef int v4si __attribute__ ((vector_size (16)));
v4si inside_count[NUM_THREADS] = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; 


void *thread(void *argp)
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int index = (int) argp;
    int t1 = (int) time(NULL);
    v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
    int t2 = (int) time(NULL);
    v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
    int t3 = (int) time(NULL);
    v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
    int t4 = (int) time(NULL);
    v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x1, y1, x2, y2, x3, y3, x4, y4;
    v4si inside_count1 = {0,0,0,0};
    v4si inside_count2 = {0,0,0,0};
    v4si inside_count3 = {0,0,0,0};
    v4si inside_count4 = {0,0,0,0};

    for (i = 0; i < simulate_total / (UNROLL * 4 * NUM_THREADS);i++)
    {
        g_seed1 = (a * g_seed1+b);
        x1 = (g_seed1 >> c) & d;
        g_seed1 = (a * g_seed1+b);
        y1 = (g_seed1 >> c) & d;
        inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
        g_seed2 = (a * g_seed2+b);
        x2 = (g_seed2 >> c) & d;
        g_seed2 = (a * g_seed2+b);
        y2 = (g_seed2 >> c) & d;
        inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
        g_seed3 = (a * g_seed3+b);
        x3 = (g_seed3 >> c) & d;
        g_seed3 = (a * g_seed3+b);
        y3 = (g_seed3 >> c) & d;
        inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
        g_seed4 = (a * g_seed4+b);
        x4 = (g_seed4 >> c) & d;
        g_seed4 = (a * g_seed4+b);
        y4 = (g_seed4 >> c) & d;
        inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
    }
    inside_count[index] += inside_count1 + inside_count2 + inside_count3 + inside_count4;
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("thread %d: this computation took about %.5f seconds\n", index, ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
}
int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    v4si sum  = {0, 0, 0, 0};
    int count = 0;
    pthread_t tid[NUM_THREADS];
    int i, j;

    for (i = 0; i < NUM_THREADS; i++)
        pthread_create(tid + i, NULL, thread, (void *) i);
    for (i = 0; i < NUM_THREADS; i++)
        pthread_join(tid[i], NULL);
    for (i = 0; i < NUM_THREADS; i++)
        for (j = 0; j < 4; j++)
            count += inside_count[i][j];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本10亿个随机数用时0.340s左右, 比原始版本快了100倍。

总结

原始版本:             40s
faster rng:           4s
faster rng + AVX:       1.4s
faster rng + AVX + unroll:  0.7s
faster rng + AVX + unroll + 多线程: 0.35s

讨论

  • 编码和优化是不同的工作,不要在编码时做自以为是的优化。Premature optimization is the root of all evil

  • 程序优化的第一步是用profile工具找出瓶颈。

  • 如果想在半小时内完成优化,无脑地使用SIMD和多线程、多进程即可;如果想把程序加速一个数量级以上,需要考虑有没有更快的算法,算法对速度的提升往往超过AVX和多线程加在一起的效果。

  • 对于int类型,AVX只能使用128位的xmm寄存器,只能4路并行,AVX2可以使用256位的ymm寄存器,实现8路并行,可惜AVX2只支持最新型的CPU。RDSEED和RDRAND可由硬件生成随机数,同样只支持最新型的CPU。如果使用AVX2、RDSEED和RDRAND, 运行时间应该可以进一步降低。

  • 在上面多线程的基础上再加上多进程,用时仍是0.340s左右。后来看了4线程时,CPU usage已经达到380%,所以在多线程的基础上再加上多进程并没有提升速度。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值