最近想借助某种技术来优化常规 memcpy() 的性能,于是尝试了 MMX/SSE,希望能借此实现一个性能更高的 memcpy 函数。
代码如下(其中的 USE1 函数借用自别人的实现,但性能同样不怎么样):
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/time.h> #define LEN 100*1024*1024 #define USE1 class TimeUse{ public: TimeUse(char * cMsg) { memset(m_cMsg, 0, sizeof(m_cMsg)); strncpy(m_cMsg, cMsg, strlen(cMsg)); gettimeofday(&tTime1, NULL); } ~TimeUse() { gettimeofday(&tTime2, NULL); unsigned long ulDiff = (tTime2.tv_sec-tTime1.tv_sec)*1000 + (tTime2.tv_usec-tTime1.tv_usec)/1000; printf("%s Use %ld ms/n", m_cMsg, ulDiff); } private: struct timeval tTime1, tTime2; char m_cMsg[255]; }; #ifdef USE0 /*100M耗时约85ms*/ static inline void * memcopy(void *dest, const void *src, int size) { int i, n, len, iCount; char * to = (char *)dest; char *from = (char *)src; n = size; len = size; char cFSave[108]; { int i; #if 0 __asm__ __volatile__ ( "1: prefetchnta 128(%0)/n" : : "r" (from) ); #endif /*开始MMX之前要保存FPS*/ iCount = (len/64); if(iCount > 0) { __asm__( ".lcomm buffer, 108/n" "fsave buffer/n" "loop:/n" "movq (%0), %%mm0/n" "movq 8(%0), %%mm1/n" "movq 16(%0), %%mm2/n" "movq 24(%0), %%mm3/n" "movq 32(%0), %%mm4/n" "movq 40(%0), %%mm5/n" "movq 48(%0), %%mm6/n" "movq 56(%0), %%mm7/n" "movntq %%mm0, (%1)/n" "movntq %%mm1, 8(%1)/n" "mov