memcpy 性能:直接使用库函数即可(已优化到极限),但存在 cache 问题(第一次调用时性能较差)。
//memcpy_tmp.h
/* This Software is part of Wind River Simics. The rights to copy, distribute,
modify, or otherwise make use of this Software may be licensed only
pursuant to the terms of an applicable license agreement.
Copyright 2014-2018 Intel Corporation */
#ifndef MEMCPY_VDE
#define MEMCPY_VDE
// Copies len bytes from the buffer at pSrc to the buffer at pDst.
//   pDst  destination buffer (written)
//   pSrc  source buffer (only read by the implementation)
//   len   number of bytes to copy
// NOTE(review): this is declared inline, but the body is in memcpy_tmp.cpp.
// An inline function normally has to be defined in every translation unit
// that calls it, so callers that only include this header may fail to link.
// Confirm how this header is consumed.
inline void memcpy_vde(void* pDst,void* pSrc,int len);
#endif // MEMCPY_VDE
//memcpy_tmp.cpp
/* This Software is part of Wind River Simics. The rights to copy, distribute,
modify, or otherwise make use of this Software may be licensed only
pursuant to the terms of an applicable license agreement.
Copyright 2014-2018 Intel Corporation */
#include <stdio.h>
#include <string.h>
//#include "Test.h"
//#include "libmemcpy_opt.h"
#include <math.h>
using namespace std;
// Copies `words` consecutive 8-byte units from src to dst, lowest address
// first. Every unit is loaded into a local temporary and then stored, both via
// memcpy, so the access is valid for any pointer alignment and does not depend
// on the width of a built-in integer type.
inline void memcpy_vde_copy_words(unsigned char* dst, const unsigned char* src, int words)
{
    for (int w = 0; w < words; ++w)
    {
        unsigned long long unit; // at least 64 bits wide; only its first 8 bytes are used
        memcpy(&unit, src + w * 8, 8);
        memcpy(dst + w * 8, &unit, 8);
    }
}

// Copies len bytes from pSrc to pDst.
//
//   pDst  destination buffer, must hold at least len bytes
//   pSrc  source buffer, must hold at least len bytes (only read)
//   len   byte count; a value <= 0 copies nothing
//
// Order of work: whole 512-byte blocks first, then at most one block of each
// of 256/128/64/32/16/8 bytes as selected by the matching bit of the remaining
// length, then the last 0-7 bytes one byte at a time. Data is always copied
// from the lowest address upwards.
//
// Why it is written this way:
//  - The unit size is a literal 8 bytes. sizeof(unsigned long) is 4 on LLP64
//    and 32-bit targets, where indexing an unsigned long array by "64 words
//    per 512 bytes" would copy only part of the data.
//  - Units are moved with memcpy instead of through a casted integer pointer,
//    so unaligned buffers do not cause undefined behaviour.
//  - A negative len is rejected up front; otherwise it would turn into a huge
//    unsigned block count.
inline void memcpy_vde(void* pDst,void* pSrc,int len)
{
    if (len <= 0)
    {
        return;
    }

    unsigned char* dst = static_cast<unsigned char*>(pDst);
    const unsigned char* src = static_cast<const unsigned char*>(pSrc);
    int remaining = len;

    // Bulk phase: 512 bytes (64 units) per iteration. The constant unit count
    // leaves any unrolling decision to the optimizer.
    for (; remaining >= 512; remaining -= 512)
    {
        memcpy_vde_copy_words(dst, src, 64);
        dst += 512;
        src += 512;
    }

    // remaining is now < 512; each set bit from 256 down to 8 is one block.
    for (int chunk = 256; chunk >= 8; chunk >>= 1)
    {
        if (remaining & chunk)
        {
            memcpy_vde_copy_words(dst, src, chunk / 8);
            dst += chunk;
            src += chunk;
            remaining -= chunk;
        }
    }

    // Tail: fewer than 8 bytes left.
    for (int b = 0; b < remaining; ++b)
    {
        dst[b] = src[b];
    }
}