/*
 * Scalar reference implementation.
 *
 * For every index k in [0, len) stores sqrtf(p1[k]^2 + p2[k]^2) in pSum[k].
 * Nothing is written when len <= 0.
 *
 * p1, p2 : input arrays, at least len elements each
 * pSum   : output array, at least len elements
 * len    : number of elements to process
 */
void arrayCalcCPP(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 idx;

    for (idx = 0; idx < len; ++idx)
    {
        /* Read both operands first, then combine them. */
        F32 a = p1[idx];
        F32 b = p2[idx];

        pSum[idx] = sqrtf(a * a + b * b);
    }
}
/*
 * SSE-intrinsics implementation of pSum[k] = sqrt(p1[k]^2 + p2[k]^2)
 * for every k in [0, len).
 *
 * Complete groups of four elements are handled with packed instructions.
 * The remaining len % 4 elements are finished with a scalar loop so that
 * the whole output range is written (the packed loop alone would leave
 * those trailing elements untouched).
 *
 * p1, p2 : input arrays, at least len elements each
 * pSum   : output array, at least len elements
 * len    : number of elements to process; nothing is written when len <= 0
 *
 * The packed loop dereferences the buffers as __m128 objects, so p1, p2 and
 * pSum must be 16-byte aligned.
 */
void arrayCalcSSE(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    /* Number of complete 4-element groups; clamp so a negative len is a no-op. */
    S32 end = (len > 0) ? (len >> 2) : 0;
    __m128 m1, m2, m3;
    __m128 *_p1 = (__m128 *)p1;
    __m128 *_p2 = (__m128 *)p2;
    __m128 *_psum = (__m128 *)pSum;

    for (i = 0; i < end; i++)
    {
        m1 = _mm_mul_ps(*_p1, *_p1);   /* p1^2, four lanes */
        m2 = _mm_mul_ps(*_p2, *_p2);   /* p2^2, four lanes */
        m3 = _mm_add_ps(m1, m2);       /* p1^2 + p2^2 */
        *_psum = _mm_sqrt_ps(m3);
        _p1++;
        _p2++;
        _psum++;
    }

    /* Scalar tail: elements that do not fill a complete group of four. */
    for (i = end << 2; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}
/*
 * Inline-assembly (SSE) implementation of pSum[k] = sqrt(p1[k]^2 + p2[k]^2)
 * for every k in [0, len).
 *
 * Complete groups of four elements are processed by the assembly loop; the
 * remaining len % 4 elements are finished by a scalar loop.
 *
 * p1, p2 : input arrays, at least len elements each
 * pSum   : output array, at least len elements
 * len    : number of elements to process; nothing is written when len <= 0
 *
 * The assembly uses movaps, which requires 16-byte aligned addresses, so
 * p1, p2 and pSum must be 16-byte aligned. The block uses 32-bit registers
 * and the _asm inline-assembly syntax.
 */
void arrayCalcSSEASM(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    /* Number of complete 4-element groups; clamp so a negative len is a no-op. */
    S32 end = (len > 0) ? (len >> 2) : 0;

    /*
     * The loop below tests its counter only after the body has run once
     * (dec/jnz), so it must not be entered with a zero count: ecx would wrap
     * around and the loop would run far past the end of the buffers.
     */
    if (end > 0)
    {
        _asm
        {
            mov esi, p1              // esi = address of source array 1
            mov edx, p2              // edx = address of source array 2
            mov edi, pSum            // edi = address of the output array
            mov ecx, end             // ecx = loop count (4-element groups)
        start_loop :
            movaps xmm0, [esi]       // xmm0 = [esi]
            mulps xmm0, xmm0         // xmm0 = xmm0 * xmm0
            movaps xmm1, [edx]       // xmm1 = [edx]
            mulps xmm1, xmm1         // xmm1 = xmm1 * xmm1
            addps xmm0, xmm1         // xmm0 = xmm0 + xmm1
            sqrtps xmm0, xmm0        // xmm0 = sqrt(xmm0)
            movaps [edi], xmm0       // [edi] = xmm0
            add esi, 16              // esi += 16 (next four floats)
            add edx, 16              // edx += 16
            add edi, 16              // edi += 16
            dec ecx                  // ecx--
            jnz start_loop           // repeat while ecx != 0
        }
    }

    /* Scalar tail: elements that do not fill a complete group of four. */
    for (i = end << 2; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}
#define ARRAY_LENGTH 100000
void SSE_test()
{
F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
// GP_ALIGN16 F32 array1[ARRAY_LENGTH] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
// GP_ALIGN16 F32 array2[ARRAY_LENGTH] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
// GP_ALIGN16 F32 array3[ARRAY_LENGTH];
// GP_ALIGN16 F32 array4[ARRAY_LENGTH];
// GP_ALIGN16 F32 array5[ARRAY_LENGTH];
for (int i = 0; i < ARRAY_LENGTH; i++)
{
array1[i] = i * 1.f;
array2[i] = i * 1.f;
}
arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
}
void SSE_testP()
{
F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
// GP_ALIGN16 F32 array1[ARRAY_LENGTH] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
// GP_ALIGN16 F32 array2[ARRAY_LENGTH] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
// GP_ALIGN16 F32 array3[ARRAY_LENGTH];
// GP_ALIGN16 F32 array4[ARRAY_LENGTH];
// GP_ALIGN16 F32 array5[ARRAY_LENGTH];
for (int i = 0; i < ARRAY_LENGTH; i++)
{
array1[i] = i * 1.f;
array2[i] = i * 1.f;
}
double t;
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("arrayCalcCPP %.3fms\n", t);
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("arrayCalcSSE %.3fms\n", t);
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("arrayCalcSSEASM %.3fms\n", t);
}
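/*
 * Web-page feedback widget text captured with the code (not part of the program):
 *   Was "Related recommendations" helpful to you?
 *   - Very unhelpful
 *   - Unhelpful
 *   - Average
 *   - Helpful
 *   - Very helpful
 *   Submit
 */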