有了前面一个基础，这个就比较容易折腾出来了，只是换了个表达方式，所以也很快就出炉了。负责计算的函数的性能仍然跟 GCC(3.4.5) 优化后的时间一致，但是因为不知道怎么进一步改进这个程序，所以就只能到这个地步了，以后有改进的话会继续发布。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Number of ints in the benchmark array (2^30). The expansion is
 * parenthesized so it behaves as a single value inside larger expressions
 * such as `x % N` or `x / N`.
 */
#define N (1024 * 1024 * 1024)
/* Number of times the reduction is repeated while it is being timed. */
#define M 10
/*
 * Sum the first `count` ints stored at `a`.
 *
 * a:     pointer to at least `count` readable ints.
 * count: number of elements to add; zero or negative yields 0 without
 *        touching memory.
 *
 * Returns the sum truncated to 32 bits (the accumulation wraps around on
 * overflow).
 *
 * On x86-64 with a GCC-compatible compiler the loop is hand-written inline
 * assembly; elsewhere an equivalent C loop is used.
 */
int vector_reduction(int *a, int count)
{
    int result = 0;

    /* Guard first: the asm loop below is do-while shaped and compares
     * unsigned, so it must never be entered with count <= 0. */
    if (count <= 0)
        return 0;

#if defined(__GNUC__) && defined(__x86_64__)
    int index = 0;

    __asm__ __volatile__(
        "1:\n\t"
        /* 32-bit add: loads exactly one 4-byte element per iteration. */
        "addl (%[ptr]), %[sum]\n\t"
        "addq $4, %[ptr]\n\t"
        "addl $1, %[idx]\n\t"
        /* AT&T order: flags come from cnt - idx; loop while cnt > idx. */
        "cmpl %[idx], %[cnt]\n\t"
        "ja 1b"
        : [sum] "+r"(result), [ptr] "+r"(a), [idx] "+r"(index)
        : [cnt] "r"(count)
        : "cc", "memory"
    );
#else
    /* Accumulate in unsigned so overflow wraps instead of being undefined. */
    unsigned int total = 0;
    for (int i = 0; i < count; ++i)
        total += (unsigned int)a[i];
    result = (int)total;
#endif

    return result;
}
/*
 * Benchmark driver: fills an N-element int array with ones, then times M
 * calls of vector_reduction() over it. Prints the CPU time of the fill, the
 * last computed sum, and the CPU time of the M reductions (whole seconds).
 *
 * Returns 0 on success, EXIT_FAILURE if the array cannot be allocated.
 */
int main(void)
{
    const size_t count = N;
    int *a = NULL;
    int result = 0;
    clock_t start, elapsed;

    /* Only allocate when count * sizeof *a is representable in size_t. */
    if (count <= (size_t)-1 / sizeof *a)
        a = malloc(count * sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "failed to allocate %zu ints\n", count);
        return EXIT_FAILURE;
    }

    /* The space flag in the formats below prints a leading blank before
     * non-negative values. clock_t is cast to long to match %ld. */
    start = clock();
    for (size_t i = 0; i < count; ++i)
        a[i] = 1;
    elapsed = clock() - start;
    printf("initial time: % ld s\n", (long)(elapsed / CLOCKS_PER_SEC));

    start = clock();
    for (int run = 0; run < M; ++run)
        result = vector_reduction(a, N);
    elapsed = clock() - start;
    printf("sum = % d\n", result);
    printf("compute time: % ld s\n", (long)(elapsed / CLOCKS_PER_SEC));

    free(a);
    return 0;
}