C 语言优化问题：内存写耗时测试

最新推荐文章于 2023-05-30 20:57:51 发布

奋斗的麻雀

最新推荐文章于 2023-05-30 20:57:51 发布

阅读量694

点赞数 1

分类专栏： SIMD优化

本文链接：https://blog.csdn.net/myzhouwang/article/details/86012698

版权

SIMD优化专栏收录该内容

2 篇文章 0 订阅

订阅专栏

/*
 * Adds each visited element's left-hand neighbour into it, column by column.
 *
 * For every column w in [0, width) the inner loop visits `height` elements,
 * starting at pDst[w] and advancing strideD elements per step, and performs
 * element += element-immediately-to-its-left.  Columns are handled left to
 * right, so column w - 1 has already been updated when column w reads it.
 *
 * pDst    - base pointer.  The code reads pDst[w - 1]; for w == 0 that is
 *           pDst[-1], so the caller must make that element valid.
 * strideD - distance, in uint32_t elements, between consecutive visits of
 *           the inner loop.
 * width   - number of columns processed (outer loop count).
 * height  - number of steps taken per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* first element of column w */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* accumulate the left neighbour */
			pDstTmp += strideD ; /* advance strideD elements */
		}
	}
}

按上面的代码写，Release 模式下耗时 0.072 ms，pDstTmp 每次移动 strideD（1000 以上的大数）个元素；

/*
 * Variant in which the inner loop never advances the pointer.
 *
 * For every column w in [0, width), pDst[w - 1] is added into pDst[w]
 * `height` times; every inner iteration reads and writes the same two
 * addresses.  strideD is accepted but not used by this variant.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of columns processed (outer loop count).
 * height - number of repeated additions per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* stays fixed for the whole inner loop */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* read-modify-write of the same element every time */
		}
	}
}

但是如果这么写，即反复对同一块内存进行写操作，耗时是 0.15 ms；代码少了一行，耗时却是前面的两倍；

/*
 * Variant that advances the pointer by a fixed 2 elements per step.
 *
 * For every column w in [0, width) the inner loop visits pDst[w + 2*h] for
 * h in [0, height) and adds the element to its left into it.  Within one
 * inner loop the written indices (w + 2*h) and the read indices
 * (w + 2*h - 1) never coincide, so an iteration does not read what the
 * previous iteration wrote.  strideD is accepted but not used.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of columns processed (outer loop count).
 * height - number of steps taken per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* starting element for column w */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* accumulate the left neighbour */
			pDstTmp += 2 ; /* skip one element between visits */
		}
	}
}

这么写，耗时0.03ms;

/*
 * Variant that advances the pointer by 1 element per step.
 *
 * For every column w in [0, width) the inner loop visits pDst[w + h] for
 * h in [0, height) and adds the element to its left into it.  The element
 * read in iteration h (index w + h - 1) is exactly the element written in
 * iteration h - 1, so each iteration depends on the previous one's result.
 * strideD is accepted but not used.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of starting positions processed (outer loop count).
 * height - number of steps taken per starting position (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* starting element for this pass */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* reads the value stored by the previous iteration */
			pDstTmp += 1 ; /* move to the adjacent element */
		}
	}
}

但是这么写，耗时是0.148ms；

此外还测了 +3、+4、+5……等步长。目前的结果是：反复写同一块内存非常耗时；每次偏移 1 个单位同样耗时；每次偏移 2 个单位时，耗时急剧下降；但随着步长（即 strideD 所在位置的取值）继续变大（位数增加），耗时又会慢慢变大，如下：

/*
 * Variant that advances the pointer by a fixed 254 elements per step.
 *
 * For every column w in [0, width) the inner loop visits pDst[w + 254*h]
 * for h in [0, height) and adds the element to its left into it.
 * strideD is accepted but not used.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of columns processed (outer loop count).
 * height - number of steps taken per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* starting element for column w */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* accumulate the left neighbour */
			pDstTmp += 254; /* hard-coded step of 254 elements */
		}
	}
}

耗时0.035ms;

/*
 * Variant that advances the pointer by a fixed 300 elements per step.
 *
 * For every column w in [0, width) the inner loop visits pDst[w + 300*h]
 * for h in [0, height) and adds the element to its left into it.
 * strideD is accepted but not used.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of columns processed (outer loop count).
 * height - number of steps taken per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* starting element for column w */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* accumulate the left neighbour */
			pDstTmp += 300; /* hard-coded step of 300 elements */
		}
	}
}

耗时0.055ms

/*
 * Variant that advances the pointer by a fixed 700 elements per step.
 *
 * For every column w in [0, width) the inner loop visits pDst[w + 700*h]
 * for h in [0, height) and adds the element to its left into it.
 * strideD is accepted but not used.
 *
 * pDst   - base pointer.  For w == 0 the code reads pDst[-1], so the caller
 *          must make that element valid.
 * width  - number of columns processed (outer loop count).
 * height - number of steps taken per column (inner loop count).
 */
void calRowSum(uint32_t *pDst, intptr_t strideD, int32_t width, int32_t height)
{
	for (int32_t w = 0; w < width; w++)
	{
		uint32_t *pDstTmp = &pDst[w]; /* starting element for column w */
		for (int32_t h = 0; h < height; h++)
		{
			pDstTmp[0] += pDstTmp[-1]; /* accumulate the left neighbour */
			pDstTmp += 700; /* hard-coded step of 700 elements */
		}
	}
}

耗时0.086ms...

结论：当下一次的计算依赖上一次的结果时，速度会变慢，因为上一次的结果还没有计算完成，本次计算只能等待，例如：

   /* pDstTmp is not advanced: every iteration reads pDstTmp[-1] and
      read-modify-writes the same pDstTmp[0]. */
   for (int32_t h = 0; h < height; h++)
       {
           pDstTmp[0] += pDstTmp[-1];
       }

这个循环反复对 pDstTmp[0] 这块内存进行读-改-写，第二次写操作可能要等第一次写操作完成后才能开始；

而，

       /* pDstTmp advances by one element per iteration, so the pDstTmp[-1]
          read here is the pDstTmp[0] written by the previous iteration. */
       for (int32_t h = 0; h < height; h++)
       {
           pDstTmp[0] += pDstTmp[-1];
           pDstTmp += 1 ;
       }

这段程序与上一段的区别在于：第二次迭代的 pDstTmp[-1] 正是第一次迭代的 pDstTmp[0]。同样，在第一次对 pDstTmp[0] 的写入完成之前，第二次迭代无法读取 pDstTmp[-1] 并累加到新的 pDstTmp[0]，因此需要等待。

总之：

1，写内存的速度比比较操作、算术操作等要慢得多；

2，

/* Step-by-one loop: each iteration's pDstTmp[-1] is the element the
   previous iteration stored through pDstTmp[0]. */
for (int32_t h = 0; h < height; h++)
       {
           pDstTmp[0] += pDstTmp[-1];
           pDstTmp += 1 ;
       }

在 pDstTmp[0] += pDstTmp[-1]; 的计算结果出来之前，pDstTmp += 1; 以及 h < height、h++ 等指令就已经执行完毕，很快又执行到下一次的 pDstTmp[0] += pDstTmp[-1];，但此时上一次的计算还没有结束……