下面是我简单写的 C 代码和内联汇编(AArch64)代码实现:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
/*
 * Micro-benchmark: fill a 1 Mi-element int array with 0, 1, 2, ... and print
 * the elapsed wall-clock time in milliseconds followed by four sample
 * elements of the filled array.
 *
 * On AArch64 the fill is done by a hand-written assembly loop by default.
 * Build with -DUSE_C_LOOP (or for any non-AArch64 target) to time the plain
 * C loop instead.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
    enum { ELEM_COUNT = 1024 * 1024 };

    int *pIntM = malloc(ELEM_COUNT * sizeof *pIntM);
    if (pIntM == NULL) {
        perror("malloc");
        return 1;
    }
    /* Zero the whole buffer before the timed region starts. */
    memset(pIntM, 0, ELEM_COUNT * sizeof *pIntM);

    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);

    /* Write cursor; both fill variants leave it one past the last element. */
    int *pIntTmp = pIntM;
#if defined(USE_C_LOOP) || !defined(__aarch64__)
    for (int i = 0; i < ELEM_COUNT; i++, pIntTmp++) {
        *pIntTmp = i;
    }
#else
    long value = 0;
    /*
     * Registers are chosen by the compiler through named operands, so no
     * fixed register is silently overwritten. Numeric local labels keep the
     * asm safe if it is ever duplicated and avoid defining a symbol that
     * shares its name with libc's exit().
     */
    __asm__ volatile(
        "1:\n"
        /* 32-bit store of the counter, then advance the cursor by one int. */
        "str %w[val], [%[dst]], #4\n"
        "add %[val], %[val], #1\n"
        "cmp %[val], %[cnt]\n"
        "b.lt 1b\n"
        : [dst] "+&r"(pIntTmp), [val] "+&r"(value)
        : [cnt] "r"((long)ELEM_COUNT)
        : "memory", "cc"
    );
#endif

    gettimeofday(&tv_end, NULL);

    long elapsed_us = (long)(tv_end.tv_sec - tv_start.tv_sec) * 1000000L
                    + (long)(tv_end.tv_usec - tv_start.tv_usec);
    printf("%s(%d): cost %ldms\n", __FILE__, __LINE__, elapsed_us / 1000);

    /* Index from the base pointer: the cursor no longer points at element 0. */
    printf("%d, %d, %d, %d\n", pIntM[1024], pIntM[ELEM_COUNT - 3], pIntM[ELEM_COUNT - 2], pIntM[ELEM_COUNT - 1]);

    free(pIntM);
    return 0;
}
如果c实现,编译时,
不使用-o3选项时,
cost 12ms
使用-o3(注意:小写的 -o3 是指定输出文件名的选项,并不会开启优化;要开启优化应使用大写的 -O3,所以这里两次结果相同并不能说明优化无效),同样是:
cost 12ms
如果使用汇编
cost 3ms
也就是说,在这个小程序中,使用手写的汇编实现还是能更好地提升性能的。(需要注意:只有在 C 版本真正开启了优化、并且两种实现写入数组的内容完全一致时,这个对比才有意义。)