这里分别测试了纯 C 版本、SSE intrinsics(C++)加速版本和 SSE 汇编加速版本。
测试内容:计算二维向量的二范数,即各分量平方和的平方根。
结果纯 C 版本反而更快。测试环境:WIN10 + VS2013。
结果是
arrayCalcCPP 0.040ms
arrayCalcSSE 0.207ms
arrayCalcSSE2 0.208ms
arrayCalcSSEASM 0.207ms
经编译器优化的纯 C 版本比手写的 SSE 并行加速版本还要快约 5 倍。
如果有网友能说出具体原因,欢迎留言~
/*
 * Scalar reference kernel.
 *
 * For every index k in [0, len) stores sqrtf(p1[k]^2 + p2[k]^2) into
 * pSum[k]. Nothing is written when len <= 0.
 *
 * p1, p2 : input arrays, at least len elements each
 * pSum   : output array, at least len elements
 * len    : number of elements to process
 */
void arrayCalcCPP(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 k = 0;

    while (k < len)
    {
        const F32 x = p1[k];
        const F32 y = p2[k];

        pSum[k] = sqrtf(x * x + y * y);
        k++;
    }
}
/*
 * SSE intrinsics kernel that accesses the arrays through __m128 pointers.
 *
 * Computes pSum[k] = sqrt(p1[k]^2 + p2[k]^2) for every k in [0, len).
 * Four floats are processed per iteration; the remaining len % 4 elements
 * are handled by a scalar loop so that the whole output range is written
 * (previously the tail was silently left untouched).
 *
 * p1, p2 : input arrays, at least len elements each
 * pSum   : output array, at least len elements
 * len    : number of elements to process; nothing is done when len <= 0
 *
 * NOTE(review): the arrays are dereferenced as __m128, so they are
 * presumably required to be 16-byte aligned (the callers in this file
 * allocate them with GP_ALIGN_MALLOC16) - confirm for other callers.
 */
void arrayCalcSSE(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;
    __m128 m1, m2, m3;
    __m128 *v1 = (__m128 *)p1;
    __m128 *v2 = (__m128 *)p2;
    __m128 *vOut = (__m128 *)pSum;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete groups of four floats. */
    end = len >> 2;

    for (i = 0; i < end; i++)
    {
        m1 = _mm_mul_ps(*v1, *v1);
        m2 = _mm_mul_ps(*v2, *v2);
        m3 = _mm_add_ps(m1, m2);
        *vOut = _mm_sqrt_ps(m3);
        v1++;
        v2++;
        vOut++;
    }

    /* Scalar tail: the last len % 4 elements that do not fill a vector. */
    for (i = end * 4; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}
/*
 * SSE intrinsics kernel using explicit aligned load/store intrinsics.
 *
 * Computes pSum[k] = sqrt(p1[k]^2 + p2[k]^2) for every k in [0, len).
 * Four floats are processed per iteration with _mm_load_ps/_mm_store_ps;
 * the remaining len % 4 elements are handled by a scalar loop so that the
 * whole output range is written (previously the tail was left untouched).
 *
 * p1, p2 : input arrays, at least len elements each; must be 16-byte
 *          aligned because _mm_load_ps is an aligned load
 * pSum   : output array, at least len elements; must be 16-byte aligned
 *          because _mm_store_ps is an aligned store
 * len    : number of elements to process; nothing is done when len <= 0
 */
void arrayCalcSSE2(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;
    __m128 m1, m2, m3;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete groups of four floats. */
    end = len >> 2;

    for (i = 0; i < end; i++)
    {
        m1 = _mm_load_ps(p1);
        m2 = _mm_load_ps(p2);
        m1 = _mm_mul_ps(m1, m1);
        m2 = _mm_mul_ps(m2, m2);
        m3 = _mm_add_ps(m1, m2);
        m3 = _mm_sqrt_ps(m3);
        _mm_store_ps(pSum, m3);
        p1 += 4;
        p2 += 4;
        pSum += 4;
    }

    /*
     * Scalar tail: after the vector loop the three pointers address the
     * first unprocessed element, so index them relative to that position.
     */
    for (i = 0; i < len - end * 4; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}
/*
 * Hand-written SSE kernel using MSVC-style inline assembly with 32-bit
 * x86 registers (esi/edx/edi/ecx).
 *
 * Computes pSum[k] = sqrt(p1[k]^2 + p2[k]^2) for every k in [0, len).
 * The assembly loop handles complete groups of four floats; the remaining
 * len % 4 elements are handled by a scalar loop.
 *
 * p1, p2 : input arrays, at least len elements each; must be 16-byte
 *          aligned because they are read with movaps
 * pSum   : output array, at least len elements; must be 16-byte aligned
 *          because it is written with movaps
 * len    : number of elements to process; nothing is done when len <= 0
 *
 * The assembly loop is a do/while (dec ecx / jnz), so it must never be
 * entered with a zero count: ecx would wrap around and the loop would run
 * far past the end of the buffers. The guard below prevents that for
 * len < 4.
 */
void arrayCalcSSEASM(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete groups of four floats. */
    end = len >> 2;

    if (end > 0)
    {
        _asm
        {
            mov esi, p1        // esi = address of source array 1
            mov edx, p2        // edx = address of source array 2
            mov edi, pSum      // edi = address of the result array
            mov ecx, end       // ecx = number of loop iterations
        start_loop :
            movaps xmm0, [esi] // xmm0 = four floats from [esi]
            mulps xmm0, xmm0   // xmm0 = xmm0 * xmm0
            movaps xmm1, [edx] // xmm1 = four floats from [edx]
            mulps xmm1, xmm1   // xmm1 = xmm1 * xmm1
            addps xmm0, xmm1   // xmm0 = xmm0 + xmm1
            sqrtps xmm0, xmm0  // xmm0 = sqrt(xmm0)
            movaps [edi], xmm0 // store four results to [edi]
            add esi, 16        // advance source 1 by 16 bytes
            add edx, 16        // advance source 2 by 16 bytes
            add edi, 16        // advance destination by 16 bytes
            dec ecx            // ecx--
            jnz start_loop     // loop while ecx != 0
        }
    }

    /* Scalar tail: the last len % 4 elements that do not fill a vector. */
    for (i = end * 4; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}
/* Number of elements in every test array. */
#define ARRAY_LENGTH 100000

/*
 * Smoke test: runs the scalar, SSE-intrinsics and SSE-assembly kernels
 * once each over the same input (array1[i] == array2[i] == i) and writes
 * the results to array3, array4 and array5 respectively. The results are
 * not compared or printed here.
 *
 * Allocation results are checked before use; on failure a message is
 * printed and the function returns without running the kernels.
 *
 * NOTE(review): none of the five buffers is released. The deallocation
 * routine matching GP_ALIGN_MALLOC16 is not visible in this part of the
 * file, so it could not be added here - the owner of that macro should
 * add the matching free calls on both the success and failure paths.
 */
void SSE_test()
{
    F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);

    if (!array1 || !array2 || !array3 || !array4 || !array5)
    {
        printf("SSE_test: buffer allocation failed\n");
        return;
    }

    /* Both inputs hold the ramp 0, 1, 2, ... */
    for (int i = 0; i < ARRAY_LENGTH; i++)
    {
        array1[i] = i * 1.f;
        array2[i] = i * 1.f;
    }

    arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
    arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
    arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
}
/*
 * Timing benchmark: runs each kernel CYC_NUM times over the same input
 * (array1[i] == array2[i] == i) and prints the average time of one call.
 *
 * The printed value is (tick difference) * 1000 / CYC_NUM /
 * getTickFrequency(); it is labelled "ms", which presumably means
 * getTickFrequency() returns ticks per second - confirm against its
 * definition.
 *
 * arrayCalcSSE and arrayCalcSSE2 both write to array4; arrayCalcCPP writes
 * to array3 and arrayCalcSSEASM to array5.
 *
 * Allocation results are checked before use; on failure a message is
 * printed and the function returns without timing anything.
 *
 * NOTE(review): none of the five buffers is released. The deallocation
 * routine matching GP_ALIGN_MALLOC16 is not visible in this part of the
 * file, so it could not be added here - the owner of that macro should
 * add the matching free calls on both the success and failure paths.
 */
void SSE_testP()
{
    F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    double t;

    if (!array1 || !array2 || !array3 || !array4 || !array5)
    {
        printf("SSE_testP: buffer allocation failed\n");
        return;
    }

    /* Both inputs hold the ramp 0, 1, 2, ... */
    for (int i = 0; i < ARRAY_LENGTH; i++)
    {
        array1[i] = i * 1.f;
        array2[i] = i * 1.f;
    }

    /* Scalar reference kernel. */
    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcCPP %.3fms\n", t);

    /* SSE intrinsics through __m128 pointers. */
    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSE %.3fms\n", t);

    /* SSE intrinsics with explicit load/store. */
    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSE2(array1, array2, array4, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSE2 %.3fms\n", t);

    /* Hand-written SSE inline assembly. */
    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSEASM %.3fms\n", t);
}