程序优化技巧——如何将一个程序加速一百倍

最新推荐文章于 2023-09-24 15:03:46 发布

Smiling_Wang

最新推荐文章于 2023-09-24 15:03:46 发布

阅读量2.7k

点赞数

分类专栏： C语言优化文章标签： C语言优化 profile AVX 多线程

本文链接：https://blog.csdn.net/Smiling_Wang/article/details/50922090

版权

优化同时被 2 个专栏收录

2 篇文章 0 订阅

订阅专栏

C语言

1 篇文章 0 订阅

订阅专栏

本文展示了如何将一个使用蒙特卡洛方法计算圆周率的C语言程序加速一百倍

平台

Ubuntu 14.04 LTS, gcc

原始版本

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main()
{

    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int i;
    int simulate_total = 1000000000;
    double radius = (double) RAND_MAX;
    double radius_square = radius * radius;
    double x, y;
    int inside_count = 0;
    srand(time(NULL));
    for (i = 0; i < simulate_total;i++)
    {
        x = (double) rand();
        y = (double) rand();

        if (x * x + y * y < radius_square)
            inside_count += 1;
    }
    printf("%f\n", 4.0 * inside_count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n",
                     ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - 
                     ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

10亿个随机数，用时40s

faster random generator

用kcachegrind进行profile，发现rand是瓶颈，从网上找了一个快速rand的实现:
Fast Random Number Generator on the Intel® Pentium® 4 Processor

#include<stdio.h>
#include<stdlib.h>
#include <time.h>

unsigned int g_seed;


//Used to seed the generator.

inline void fast_srand(int seed)
{
    g_seed = seed;
}


//fastrand routine returns one integer, similar output value range as C lib.
inline int fastrand()
{
    g_seed = (214013*g_seed+2531011);
    return (g_seed>>16)&0x7FFF;
}

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int i;
    int simulate_total = 1000000000;
    int radius = 0x7FFF;
    int radius_square = radius * radius;
    int x, y;
    int inside_count = 0;

    fast_srand((int) time(NULL));
    for (i = 0; i < simulate_total;i++)
    {
        x = fastrand();
        y = fastrand();

        if (x * x + y * y < radius_square)
            inside_count += 1;
    }
    printf("%f\n", 4.0 * inside_count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n",
                     ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - 
                     ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时4.2s

AVX

在上一步的基础上使用AVX

#include<stdio.h>
#include<stdlib.h>
#include <time.h>

typedef int v4si __attribute__ ((vector_size (16)));

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    v4si g_seed = {(int)time(NULL),(int)time(NULL),(int)time(NULL),(int)time(NULL)};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int simulate_total = 1000000000;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x, y;
    v4si inside_count = {0,0,0,0};
    int count = 0;
    for (i = 0; i < simulate_total / 4;i++)
    {
        g_seed = (a * g_seed+b);
        x = (g_seed >> c) & d;
        g_seed = (a * g_seed+b);
        y = (g_seed >> c) & d;
        inside_count = inside_count + (x * x + y * y < radius_square);
    }
    for (i = 0; i < 4; i++)
        count += inside_count[i];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时1.4s

查看汇编:

.L3:
    vpmulld xmm0, xmm0, xmm6
    vpaddd  xmm0, xmm0, xmm5
    vpsrad  xmm2, xmm0, 16
    vpmulld xmm0, xmm0, xmm6
    vpand   xmm2, xmm2, xmm4
    vpaddd  xmm0, xmm0, xmm5
    vpsrad  xmm1, xmm0, 16
    vpand   xmm1, xmm1, xmm4
    sub eax, 1
    vpmulld xmm2, xmm2, xmm2
    vpmulld xmm1, xmm1, xmm1
    vpaddd  xmm1, xmm2, xmm1
    vpcmpgtd    xmm1, xmm7, xmm1
    vpaddd  xmm3, xmm3, xmm1
    jne .L3

循环中的运算都在寄存器进行，没有内存读取，用不上cache优化。

循环中所有的g_seed = (a * g_seed+b);形成了critical path，所以我们使用loop unrolling，增加并行度，减少循环次数，也就是减少critical path的运行时间。

loop unrolling

#include<stdio.h>
#include<stdlib.h>
#include <time.h>
#define UNROLL 4
typedef int v4si __attribute__ ((vector_size (16)));

int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int t1 = (int) time(NULL);
    v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
    int t2 = (int) time(NULL);
    v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
    int t3 = (int) time(NULL);
    v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
    int t4 = (int) time(NULL);
    v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int simulate_total = 1000000000;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x1, y1, x2, y2, x3, y3, x4, y4;
    v4si inside_count1 = {0,0,0,0};
    v4si inside_count2 = {0,0,0,0};
    v4si inside_count3 = {0,0,0,0};
    v4si inside_count4 = {0,0,0,0};
    int count = 0;
    for (i = 0; i < simulate_total / (UNROLL * 4);i++)
    {
        g_seed1 = (a * g_seed1+b);
        x1 = (g_seed1 >> c) & d;
        g_seed1 = (a * g_seed1+b);
        y1 = (g_seed1 >> c) & d;
        inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
        g_seed2 = (a * g_seed2+b);
        x2 = (g_seed2 >> c) & d;
        g_seed2 = (a * g_seed2+b);
        y2 = (g_seed2 >> c) & d;
        inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
        g_seed3 = (a * g_seed3+b);
        x3 = (g_seed3 >> c) & d;
        g_seed3 = (a * g_seed3+b);
        y3 = (g_seed3 >> c) & d;
        inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
        g_seed4 = (a * g_seed4+b);
        x4 = (g_seed4 >> c) & d;
        g_seed4 = (a * g_seed4+b);
        y4 = (g_seed4 >> c) & d;
        inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
    }
    for (i = 0; i < 4; i++)
        count += inside_count1[i] + inside_count2[i] + inside_count3[i] + inside_count4[i];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本用时0.7s

多线程

在上面的基础上使用多线程

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>


#define UNROLL 4
#define NUM_THREADS 4


int simulate_total = 1000000000;
typedef int v4si __attribute__ ((vector_size (16)));
v4si inside_count[NUM_THREADS] = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; 


void *thread(void *argp)
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    int index = (int) argp;
    int t1 = (int) time(NULL);
    v4si g_seed1 = {t1, 2 * t1, 3 * t1, 4 * t1};
    int t2 = (int) time(NULL);
    v4si g_seed2 = {t2, 2 * t2, 3 * t2, 4 * t2};
    int t3 = (int) time(NULL);
    v4si g_seed3 = {t3, 2 * t3, 3 * t3, 4 * t3};
    int t4 = (int) time(NULL);
    v4si g_seed4 = {t4, 2 * t4, 3 * t4, 4 * t4};
    v4si a = {214013,214013,214013,214013};
    v4si b = {2531011,2531011,2531011,2531011};
    v4si c = {16,16,16,16};
    v4si d = {0x7FFF,0x7FFF,0x7FFF,0x7FFF};
    int i;
    int radius_s = 0x7FFF * 0x7FFF;
    v4si radius_square = {radius_s,radius_s,radius_s, radius_s};
    v4si x1, y1, x2, y2, x3, y3, x4, y4;
    v4si inside_count1 = {0,0,0,0};
    v4si inside_count2 = {0,0,0,0};
    v4si inside_count3 = {0,0,0,0};
    v4si inside_count4 = {0,0,0,0};

    for (i = 0; i < simulate_total / (UNROLL * 4 * NUM_THREADS);i++)
    {
        g_seed1 = (a * g_seed1+b);
        x1 = (g_seed1 >> c) & d;
        g_seed1 = (a * g_seed1+b);
        y1 = (g_seed1 >> c) & d;
        inside_count1 += (x1 * x1 + y1 * y1 < radius_square);
        g_seed2 = (a * g_seed2+b);
        x2 = (g_seed2 >> c) & d;
        g_seed2 = (a * g_seed2+b);
        y2 = (g_seed2 >> c) & d;
        inside_count2 += (x2 * x2 + y2 * y2 < radius_square);
        g_seed3 = (a * g_seed3+b);
        x3 = (g_seed3 >> c) & d;
        g_seed3 = (a * g_seed3+b);
        y3 = (g_seed3 >> c) & d;
        inside_count3 += (x3 * x3 + y3 * y3 < radius_square);
        g_seed4 = (a * g_seed4+b);
        x4 = (g_seed4 >> c) & d;
        g_seed4 = (a * g_seed4+b);
        y4 = (g_seed4 >> c) & d;
        inside_count4 += (x4 * x4 + y4 * y4 < radius_square);
    }
    inside_count[index] += inside_count1 + inside_count2 + inside_count3 + inside_count4;
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("thread %d: this computation took about %.5f seconds\n", index, ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
}
int main()
{
    struct timespec tstart={0,0}, tend={0,0};
    clock_gettime(CLOCK_MONOTONIC, &tstart);
    v4si sum  = {0, 0, 0, 0};
    int count = 0;
    pthread_t tid[NUM_THREADS];
    int i, j;

    for (i = 0; i < NUM_THREADS; i++)
        pthread_create(tid + i, NULL, thread, (void *) i);
    for (i = 0; i < NUM_THREADS; i++)
        pthread_join(tid[i], NULL);
    for (i = 0; i < NUM_THREADS; i++)
        for (j = 0; j < 4; j++)
            count += inside_count[i][j];
    printf("%f\n", -4.0 * count / simulate_total);   
    clock_gettime(CLOCK_MONOTONIC, &tend);
    printf("This computation took about %.5f seconds\n", ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) - ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec));
    exit(0);
}

这个版本10亿个随机数用时0.340s左右, 比原始版本快了100倍。

总结

原始版本:             40s
faster rng:           4s
faster rng + AVX:       1.4s
faster rng + AVX + unroll:  0.7s
faster rng + AVX + unroll + 多线程： 0.35s

讨论

编码和优化是不同的工作，不要在编码时做自以为是的优化。Premature optimization is the root of all evil。
程序优化的第一步是用profile工具找出瓶颈。
如果想在半小时内完成优化，无脑地使用SIMD和多线程、多进程即可；如果想把程序加速一个数量级以上，需要考虑有没有更快的算法，算法对速度的提升往往超过AVX和多线程加在一起的效果。
对于int类型，AVX只能使用128位的xmm寄存器，只能4路并行，AVX2可以使用256位的ymm寄存器，实现8路并行，可惜AVX2只支持最新型的CPU。RDSEED和RDRAND可由硬件生成随机数，同样只支持最新型的CPU。如果使用AVX2、RDSEED和RDRAND，运行时间应该可以进一步降低。
在上面多线程的基础上再加上多进程，用时仍是0.340s左右。后来看了4线程时，CPU usage已经达到380%，所以在多线程的基础上再加上多进程并没有提升速度。