SSE的优势简介

最新推荐文章于 2023-07-28 17:43:53 发布

beyondjhf_2008

最新推荐文章于 2023-07-28 17:43:53 发布

阅读量256

点赞数

分类专栏： SSE编程文章标签：算法

本文链接：https://blog.csdn.net/beyondjhf_2008/article/details/83399423

版权

SSE编程专栏收录该内容

1 篇文章 0 订阅

订阅专栏

为了方便对比速度，我会用常规方法和SSE优化两种写法分别实现，并用一个测速类CTimer来计时。这个算法是对一组float值进行放大：函数ScaleValue1使用SSE指令优化，函数ScaleValue2则没有。我们用含10000个元素的float数组来测试这两个算法，每个算法运算100000遍，下面是测试结果和测试程序：

Use SSE：2.07543e+012秒
Not Use SSE：-2.5293e+012秒
请按任意键继续. . .

测试代码如下：
/******test.cpp*******/
#include <xmmintrin.h>
#include<iostream>
#include <windows.h>
using namespace std;

// Stopwatch built on the Win32 performance counter.
//
// Construction queries the counter frequency and records the start count, so
// the timer is running as soon as it exists. Reset() restarts it; End() reports
// the time elapsed since construction or the most recent Reset().
class CTimer
{
public:
    // Reads the counter frequency and starts timing.
    __forceinline CTimer( void )
    {
        QueryPerformanceFrequency( &m_Frequency );
        QueryPerformanceCounter( &m_StartCount );
    }

    // Restarts the measurement from the current counter value.
    __forceinline void Reset( void )
    {
        QueryPerformanceCounter( &m_StartCount );
    }

    // Returns the elapsed time as (current count - start count) / frequency.
    // With the frequency expressed in counts per second this is a value in
    // seconds. The timer keeps running; End() does not modify the start count.
    __forceinline double End( void )
    {
        LARGE_INTEGER curCount;
        QueryPerformanceCounter( &curCount );
        // Elapsed ticks are the DIFFERENCE between now and the start. The
        // earlier code multiplied the two counts, which overflowed and
        // produced meaningless (even negative) results.
        return double( curCount.QuadPart - m_StartCount.QuadPart ) / double( m_Frequency.QuadPart );
    }

private:
    LARGE_INTEGER m_Frequency;   // counter ticks per unit time, read once at construction
    LARGE_INTEGER m_StartCount;  // counter value at construction / last Reset()
};
void ScaleValue1( float *pArray, DWORD dwCount, float fScale )
{
DWORD dwGroupCount = dwCount / 4;
__m128 e_Scale = _mm_set_ps1( fScale );
for ( DWORD i = 0; i < dwGroupCount; i++ )
{
*(__m128*)( pArray + i * 4 ) = _mm_mul_ps( *(__m128*)( pArray + i * 4 ), e_Scale );
}
}
void ScaleValue2( float *pArray, DWORD dwCount, float fScale )
{
for ( DWORD i = 0; i < dwCount; i++ )
{
pArray[i] *= fScale;
}

}

// Number of floats in the benchmark buffer.
#define ARRAYCOUNT 10000

// Benchmark driver: times 100000 passes of ScaleValue1 and then 100000 passes
// of ScaleValue2 over the same zero-filled buffer and prints both durations.
int __cdecl main()
{
    // 16-byte aligned because ScaleValue1 accesses the buffer in 128-bit units.
    float __declspec(align(16)) Array[ARRAYCOUNT];
    memset( Array, 0, sizeof( Array ) );

    CTimer t;

    // SSE version.
    t.Reset();
    for ( int nPass = 0; nPass != 100000; ++nPass )
    {
        ScaleValue1( Array, ARRAYCOUNT, 1000.0f );
    }
    double dElapsed = t.End();
    cout << "Use SSE：" << dElapsed << "秒" << endl;

    // Scalar version.
    t.Reset();
    for ( int nPass = 0; nPass != 100000; ++nPass )
    {
        ScaleValue2( Array, ARRAYCOUNT, 1000.0f );
    }
    dElapsed = t.End();
    cout << "Not Use SSE：" << dElapsed << "秒" << endl;

    // Keep the console window open until a key is pressed.
    system( "pause" );
    return 0;
}