Arm平台下各种memcpy优化对比＜三＞

xiaozhiwise

于 2024-02-29 18:16:01 发布

阅读量573

点赞数 8

分类专栏： Linux 文章标签： linux

本文链接：https://blog.csdn.net/xiaozhiwise/article/details/136377929

版权

Linux 专栏收录该内容

96 篇文章 1 订阅

订阅专栏

由于 memcpy 导致 TDA4VM 上的 H.264 解码 CPU 占用较高，原有做法被弃用；于是从网上找了各种 memcpy 的优化代码，放在一起做了一个运行速度对比，结果如下：

#include <assert.h>
#include <stdint.h>     /* uint32_t, uint64_t, uintptr_t */
#include <stdio.h>
#include <stdlib.h>     /* rand, srand */
#include <string.h>
#include <sys/time.h>
#include <time.h>       /* time() */

/*
 * Copy num bytes from src to dst one 32-bit word at a time, then copy the
 * remaining 0-3 bytes individually.
 *
 * Contract is the same as memcpy(): the two regions must not overlap.
 * Both pointers must be non-NULL; this is enforced with assert(), so the
 * check is compiled out when NDEBUG is defined.
 *
 * Returns dst.
 */
void *memcpy_word_length(void *dst, const void *src, size_t num)
{
    assert((dst != NULL) && (src != NULL));

    unsigned char *out = dst;
    const unsigned char *in = src;
    size_t words = num / sizeof(uint32_t);  /* number of whole 32-bit words */
    size_t tail = num % sizeof(uint32_t);   /* leftover bytes after the words */

    while (words--) {
        uint32_t word;

        /*
         * Fixed-size memcpy through a local is the portable way to do a
         * word-sized load/store: it has no alignment or aliasing
         * requirements and compilers lower it to a single load and store.
         */
        memcpy(&word, in, sizeof word);
        memcpy(out, &word, sizeof word);
        in += sizeof word;
        out += sizeof word;
    }

    /* Advance by exactly one byte per iteration for the tail. */
    while (tail--) {
        *out++ = *in++;
    }

    return dst;
}

/*
 * Copy len bytes from src to dst, 8 bytes at a time where possible, and
 * handle overlapping regions correctly (same contract as memmove()).
 *
 * - dst below src: copy front to back.
 * - dst above src: copy back to front, so source bytes are read before the
 *   copy overwrites them.
 * - dst == src: nothing to do.
 *
 * If either pointer is NULL nothing is copied. Returns dst in all cases.
 */
void *memmove_address_overlap(void *dst, const void *src, size_t len)
{
	if (!src || !dst)
		return dst;

	unsigned char *out = dst;
	const unsigned char *in = src;
	size_t chunks = len / sizeof(uint64_t);	/* whole 8-byte chunks */
	size_t tail = len % sizeof(uint64_t);	/* leftover bytes */
	uint64_t chunk;

	/*
	 * Compare addresses as integers: relational comparison of pointers
	 * into unrelated objects is not defined by the C standard.
	 */
	if ((uintptr_t)dst < (uintptr_t)src) {
		/* Front to back. */
		while (chunks--) {
			/*
			 * Load the whole chunk into a local before storing it, so a
			 * gap smaller than 8 bytes between the regions is harmless.
			 * Fixed-size memcpy has no alignment/aliasing requirements.
			 */
			memcpy(&chunk, in, sizeof chunk);
			memcpy(out, &chunk, sizeof chunk);
			in += sizeof chunk;
			out += sizeof chunk;
		}
		while (tail--) {
			*out++ = *in++;
		}
	} else if ((uintptr_t)dst > (uintptr_t)src) {
		/* Back to front: start one past the end and pre-decrement. */
		in += len;
		out += len;
		while (chunks--) {
			in -= sizeof chunk;
			out -= sizeof chunk;
			memcpy(&chunk, in, sizeof chunk);
			memcpy(out, &chunk, sizeof chunk);
		}
		/* Only the bytes the 8-byte pass did not cover remain. */
		while (tail--) {
			*--out = *--in;
		}
	}

	return dst;
}

/*
 * Fill data[0..len-1] with pseudo-random bytes covering the full 0-255 range.
 *
 * The generator is seeded from the wall clock on the first call only;
 * reseeding on every call would make two calls within the same second
 * produce identical data. A len of zero or less writes nothing.
 */
static void get_rand_bytes(unsigned char *data, int len)
{
    static int seeded = 0;
    int i;

    if (!seeded) {
        srand((unsigned)time(NULL));
        seeded = 1;
    }

    for (i = 0; i < len; i++) {
        /* % 256 (not % 255) so that the value 255 can be produced. */
        data[i] = (unsigned char)(rand() % 256);
    }
}

/*
 * Return a wall-clock timestamp in microseconds, truncated to fit an int.
 *
 * Microseconds since the epoch do not fit in an int, so only the low bits
 * are kept (conversion of the out-of-range value is implementation-defined;
 * GCC and Clang reduce it modulo 2^N). The value is therefore only
 * meaningful as a difference between two nearby calls. Returns 0 if
 * gettimeofday() fails.
 */
static int get_cur_time_us(void)
{
    struct timeval tv;
    unsigned long long us;

    if (gettimeofday(&tv, NULL) != 0) {
        return 0;
    }

    /*
     * Do the arithmetic in unsigned 64-bit so it cannot overflow a signed
     * 32-bit time_t/long on 32-bit targets.
     */
    us = (unsigned long long)tv.tv_sec * 1000000ULL
       + (unsigned long long)tv.tv_usec;

    return (int)(unsigned int)us;
}

/*
 * Number of elements in array n. n must be an actual array, not a pointer.
 * The argument and the whole expansion are parenthesized so the macro can be
 * used safely inside larger expressions.
 */
#define ARRAY_SIZE(n)  (sizeof(n) / sizeof((n)[0]))

/*
 * Benchmark driver: for each buffer size, time the library memcpy(),
 * memcpy_word_length() and memmove_address_overlap() copying the same random
 * data, and print the elapsed microseconds for each.
 *
 * Returns 0 on success, EXIT_FAILURE if the work buffers cannot be allocated.
 */
int main(void)
{
   /* Sizes to benchmark; the first entry is the largest and sizes the buffers. */
   const int size_list[] = {
       1024 * 1024 * 10,  // 10MB
       1024 * 1024 * 1,  // 1MB
       1024 * 100, // 100KB
       1024 * 10, // 10KB
       1024 * 1, // 1KB
   };
   char *data1;
   char *data2;
   int t1;
   int t2;
   int elapsed;
   size_t i;

   data1 = malloc((size_t)size_list[0]);
   data2 = malloc((size_t)size_list[0]);
   if (data1 == NULL || data2 == NULL) {
       fprintf(stderr, "failed to allocate %d-byte buffers\n", size_list[0]);
       free(data1);
       free(data2);
       return EXIT_FAILURE;
   }

   get_rand_bytes((unsigned char *)data1, size_list[0]);

   for (i = 0; i < ARRAY_SIZE(size_list); i++) {
       /*
        * Timestamps are truncated to int and may wrap; subtracting as
        * unsigned keeps the difference well-defined across a wrap.
        */
       t1 = get_cur_time_us();
       memcpy(data2, data1, (size_t)size_list[i]);
       t2 = get_cur_time_us();
       elapsed = (int)((unsigned)t2 - (unsigned)t1);
       printf("copy %d bytes, memcpy   waste time %dus\n", size_list[i], elapsed);

       t1 = get_cur_time_us();
       memcpy_word_length(data2, data1, (size_t)size_list[i]);
       t2 = get_cur_time_us();
       elapsed = (int)((unsigned)t2 - (unsigned)t1);
       printf("copy %d bytes, memcpy_word_length  waste time %dus\n", size_list[i], elapsed);

       t1 = get_cur_time_us();
       memmove_address_overlap(data2, data1, (size_t)size_list[i]);
       t2 = get_cur_time_us();
       elapsed = (int)((unsigned)t2 - (unsigned)t1);
       printf("copy %d bytes, memmove_address_overlap  waste time %dus\n\n", size_list[i], elapsed);
   }

   free(data1);
   free(data2);

   return 0;
}

#if 0

copy 10485760 bytes, memcpy waste time 7324us
copy 10485760 bytes, memcpy_word_length waste time 12940us
copy 10485760 bytes, memmove_address_overlap waste time 14450us

copy 1048576 bytes, memcpy waste time 704us
copy 1048576 bytes, memcpy_word_length waste time 1313us
copy 1048576 bytes, memmove_address_overlap waste time 1220us

copy 102400 bytes, memcpy waste time 81us
copy 102400 bytes, memcpy_word_length waste time 96us
copy 102400 bytes, memmove_address_overlap waste time 65us

copy 10240 bytes, memcpy waste time 2us
copy 10240 bytes, memcpy_word_length waste time 14us
copy 10240 bytes, memmove_address_overlap waste time 6us

copy 1024 bytes, memcpy waste time 0us
copy 1024 bytes, memcpy_word_length waste time 1us
copy 1024 bytes, memmove_address_overlap waste time 1us

#endif