英特尔多核平台编码优化大赛的优化过程--补充
HouSisong@GMail.com 2007.01.20
tag: 多核编程,sse2,牛顿迭代,代码优化,优化大赛,invsqrt,开方
主要文章请参看我的《英特尔多核平台编码优化大赛的优化过程》: http://blog.csdn.net/housisong/archive/2007/01/20/1488465.aspx
本文章是其补充;提供一个完整的float实现版本、double到float的手工转换、手工得到invSqrt的粗略起始迭代值 等其它几个不算成功的实现;
我测试和优化过程中用的 CPU:AMD64x2 3600+ (双核CPU)
操作系统:Windows XP 32bit
编译器:Visual Studio 2005
大赛公布的原始代码执行时间 3.97秒
一个float完整实现版本(牺牲了计算精度),源代码如下:
(如果用汇编实现应该还可以把速度提高一些,或者使用ICC编译器:)
/* particles interacting via pairwise potential */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>
#include <xmmintrin.h>//使用SSE1
#include <process.h>
#include <vector>
////
// 作者:侯思松 HouSisong@263.net
//计算结果的精度为float单精度浮点 版本
#define _IS_FAST
//以牺牲精度的方式加快速度,否则就运算达到float单精度
////
//#define _NEW_TIME_CLOCK
// Optional high-resolution timer: when _NEW_TIME_CLOCK is defined, clock()
// is replaced by a QueryPerformanceCounter-based version and CLOCKS_PER_SEC
// becomes the runtime-queried counter frequency; otherwise the standard
// <time.h> clock() is used.
#ifdef _NEW_TIME_CLOCK
#define clock_t double
double CLOCKS_PER_SEC=0.0 ;
inline double clock() {
__int64 result;
// First call: lazily query the performance-counter frequency (ticks/second).
if (CLOCKS_PER_SEC==0 )
{
QueryPerformanceFrequency((LARGE_INTEGER *)& result);
CLOCKS_PER_SEC=(double )result;
}
QueryPerformanceCounter((LARGE_INTEGER *)& result);
return (double )result;
}
#else
#include <time.h>
#endif
#define _IS_USES_MY_RAND
// rand() is only ever called from a single thread here, so substituting the
// unsynchronized custom generator below is safe.
const long DefthreadCount=2; //1,2,4,8,16,.. number of parallel tasks the computation is split into
// Returns a mutable reference to the index-th float lane of an SSE register.
float& m128_value(__m128& x,const long index)
{
    float* lanes = reinterpret_cast<float*>(&x);
    return lanes[index];
}
#define NPARTS 1000 // number of particles
#define NITER 201   // number of computePot iterations in the benchmark loop
#define DIMS 3      // spatial dimensions (x,y,z)
#ifdef _IS_USES_MY_RAND
// Minimal linear congruential generator that reproduces the MSVC CRT rand()
// sequence: state = state*214013 + 2531011, value = (state>>16) & RAND_MAX.
// Deliberately unsynchronized -- only one thread ever draws numbers.
class CMyRand
{
private :
    unsigned long _my_holdrand; // generator state, seeded to 1 like the CRT
public :
    CMyRand():_my_holdrand(1 ){}
    inline int _my_rand (void )
    {
        // Advance the state in place and hand back bits 16..30.
        _my_holdrand = _my_holdrand * 214013 + 2531011;
        return (int)( (_my_holdrand >> 16) & RAND_MAX );
    }
};
CMyRand _MyRand; // single global generator instance
inline int _my_rand (void){ return _MyRand._my_rand(); }
#else
#define _my_rand rand
#endif
int rand( void ); // re-declaration of the CRT rand (see _my_rand macro above)
// Forward declarations of the simulation steps defined below.
int computePot();
void initPositions(void );
void updatePositions(void );
// Particle coordinates, one row per dimension; the column count is rounded up
// to a multiple of 4 so whole __m128 vectors can be loaded from any 4-aligned j.
__declspec(align(16)) float r[DIMS][(NPARTS+3)/4*4]; //16-byte aligned
double pot; // accumulated potential for the current iteration
// Benchmark driver: initialize positions, then run NITER passes of
// computePot + updatePositions and report the elapsed wall time.
int main() {
int i;
clock_t start, stop;
//char ctmp; std::cin>>ctmp;
initPositions();
updatePositions();
start = clock();
for( i=0; i<NITER; i++ ) {
pot = 0.0 ;
computePot();
// Print progress every 10th iteration.
if (i%10 == 0 )
{
printf("%5d: Potential: %20.7f " , i, pot);}
updatePositions();
}
pot = 0.0 ;
stop= clock();
printf ("Seconds = %10.9f ",(double)(stop-start)/ CLOCKS_PER_SEC);
return 0 ;
}
// Seed every particle coordinate with a random value in [0.5, 1.5).
void initPositions() {
    for (int axis = 0; axis < DIMS; ++axis)
    {
        for (int p = 0; p < NPARTS; ++p)
        {
            const double offset = 0.5 + _my_rand() * (1.0 / RAND_MAX);
            r[axis][p] = (float)offset;
        }
    }
}
// Shift every coordinate by a random offset in [0.5, 1.5) -- the per-pass
// position update of the benchmark.
void updatePositions() {
    for (int axis = 0; axis < DIMS; ++axis)
    {
        for (int p = 0; p < NPARTS; ++p)
            r[axis][p] -= (float)(0.5 + _my_rand() * (1.0 / RAND_MAX));
    }
}
// One parallel task: compute the potential for particle indices [iBegin,iEnd).
struct TWorkData
{
long iBegin;    // first particle index (inclusive)
long iEnd;      // last particle index (exclusive)
double fResult; // partial potential produced by this task
};
// Constants for the Newton-Raphson refinement of rsqrt:
// x_next = ( 1.5 + (a*(-0.5))*x*x ) * x
const __m128 xmms1_5={ (1.5),(1.5),(1.5),(1.5 ) };
const __m128 xmms_0_5={ (-0.5),(-0.5),(-0.5),(-0.5 ) };
// Inner kernel (float version): for one fixed particle i, accumulates
// sum over j < i-1 of 1/sqrt(|r_j - r_i|^2) into *pResult
// (4 partial sums kept in the vector lanes; summed later by the caller).
void computePotPart_forj(int i,__m128* pResult)
{
__m128 lcpot= _mm_setzero_ps();
// Broadcast -r[dim][i] so each distance component is a single add.
__m128 _mmi0=_mm_set1_ps(-r[0 ][i]);
__m128 _mmi1=_mm_set1_ps(-r[1 ][i]);
__m128 _mmi2=_mm_set1_ps(-r[2 ][i]);
int j=0 ;
//for( j=0; j<i-1; j++ ) { // "j<i-1" looks odd -- probably meant "j<i" !
//* unroll this loop 4 ways (one __m128 per step)
for(;j+4<i;j+=4 )
{
__m128 sm0=_mm_add_ps(*(__m128*)&r[0 ][j],_mmi0);
sm0= _mm_mul_ps(sm0,sm0);
__m128 sm1=_mm_add_ps(*(__m128*)&r[1 ][j],_mmi1);
sm1= _mm_mul_ps(sm1,sm1);
__m128 sm2=_mm_add_ps(*(__m128*)&r[2 ][j],_mmi2);
sm2= _mm_mul_ps(sm2,sm2);
sm0= _mm_add_ps(sm0,sm1);
sm0= _mm_add_ps(sm0,sm2);
sm1=_mm_rsqrt_ps(sm0); // 1/sqrt(,,,)
#ifndef _IS_FAST
// Newton-Raphson step to sharpen the rsqrt approximation:
// x_next = (3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5)) * x*x ) * x
sm0=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
sm0= _mm_mul_ps(sm0,sm1);
sm0= _mm_mul_ps(sm0,sm1);
sm0= _mm_add_ps(sm0,xmms1_5);
sm0= _mm_mul_ps(sm0,sm1);
#else
sm0 = sm1;
#endif
lcpot = _mm_add_ps(lcpot,sm0);
}//*/
// Scalar tail: process the remaining j values one at a time (lane 0 only).
for(;j<i-1;++ j)
{
__m128 sm0=_mm_set_ss(r[0 ][j]);
sm0= _mm_add_ss(sm0,_mmi0);
sm0= _mm_mul_ss(sm0,sm0);
__m128 sm1=_mm_set_ss(r[1 ][j]);
sm1= _mm_add_ss(sm1,_mmi1);
sm1= _mm_mul_ss(sm1,sm1);
__m128 sm2=_mm_set_ss(r[2 ][j]);
sm2= _mm_add_ss(sm2,_mmi2);
sm2= _mm_mul_ss(sm2,sm2);
sm0= _mm_add_ss(sm0,sm1);
sm0= _mm_add_ss(sm0,sm2);
sm1= _mm_rsqrt_ss(sm0);
#ifndef _IS_FAST
// Same Newton-Raphson refinement as in the unrolled loop.
sm0=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
sm0= _mm_mul_ps(sm0,sm1);
sm0= _mm_mul_ps(sm0,sm1);
sm0= _mm_add_ps(sm0,xmms1_5);
sm0= _mm_mul_ps(sm0,sm1);
#else
sm0 = sm1;
#endif
lcpot = _mm_add_ss(lcpot,sm0);
}
*pResult=_mm_add_ps(* pResult,lcpot);
}
// Task entry point: runs the inner kernel over this task's particle range,
// then horizontally sums the 4 vector lanes into work_data->fResult.
void computePotPart(TWorkData* work_data) {
int i;
__m128 lcpot= _mm_setzero_ps();
// #pragma omp parallel for schedule(static)
for( i=work_data->iBegin; i<work_data->iEnd; i++ ) {
computePotPart_forj(i,& lcpot);
}
// Horizontal add: fold lanes (2,3) onto (0,1), then lane 1 onto lane 0.
__m128 dt0;
dt0= _mm_movehl_ps(lcpot,lcpot);
lcpot= _mm_add_ps(lcpot,dt0);
dt0=_mm_shuffle_ps(lcpot,lcpot,1 );
lcpot= _mm_add_ss(lcpot,dt0);
work_data->fResult=m128_value(lcpot,0 );
}
////
// 工作线程池 TWorkThreadPool
// 用于把一个任务拆分成多个线程任务
// 要求每个小任务任务量相近
//todo:改成任务领取模式
class TWorkThreadPool;
// Signature of a task function executed by a pool worker thread.
typedef void (*TThreadCallBack)(void * pData);
// Worker lifecycle: thrStartup (just launched) -> thrReady (idle) ->
// thrBusy (task posted/running) -> thrTerminate (asked to exit) -> thrDeath.
enum TThreadState{ thrStartup=0 , thrReady, thrBusy, thrTerminate, thrDeath };
class TWorkThread
{
public :
volatile HANDLE thread_handle;
volatile enum TThreadState state;
volatile TThreadCallBack func;
volatile void * pdata; //work data
volatile HANDLE waitfor_event;
TWorkThreadPool* pool;
volatile DWORD thread_ThreadAffinityMask;
TWorkThread() { memset(this,0,sizeof (TWorkThread)); }
};
void do_work_end(TWorkThread* thread_data);
// Worker thread main loop: pin to the assigned CPU, announce startup, then
// repeatedly wait on the pool's semaphore, run the posted task (if any) and
// report completion via do_work_end.
void __cdecl thread_dowork(TWorkThread* thread_data) //void __stdcall thread_dowork(TWorkThread* thread_data)
{
volatile TThreadState& state=thread_data-> state;
SetThreadAffinityMask(GetCurrentThread(),thread_data-> thread_ThreadAffinityMask);
state = thrStartup;
while(true )
{
// -1 == INFINITE: block until the pool releases the semaphore.
WaitForSingleObject(thread_data->waitfor_event, -1 );
if(state == thrTerminate)
break ;
state = thrBusy;
volatile TThreadCallBack& func=thread_data-> func;
// func==0 means a filler wake-up with no task; still report completion.
if (func!=0 )
func((void *)thread_data-> pdata);
do_work_end(thread_data);
}
state = thrDeath;
_endthread();//ExitThread(0);
}
// Thread pool that splits one job into per-CPU tasks. Workers are created
// once and pinned to CPUs 1..N-1; the calling thread keeps CPU 0 and also
// executes one task per pass. Synchronization is done by polling volatile
// state plus two semaphores that are swapped after every pass, so a worker
// that finishes early parks on the "next pass" semaphore.
class TWorkThreadPool
{
private :
volatile HANDLE thread_event;     // semaphore workers wait on this pass
volatile HANDLE new_thread_event; // semaphore workers park on for next pass
std::vector<TWorkThread> work_threads;
// Tasks per pass = worker threads + the calling thread itself.
inline int passel_count() const { return (int)work_threads.size()+1 ; }
// Create (cpu_count-1) workers, capped by work_count. NOTE(review): the
// name is a typo for "init_threads"; kept for byte-compatibility.
void inti_threads(long work_count) {
SYSTEM_INFO SystemInfo;
GetSystemInfo(& SystemInfo);
long cpu_count = SystemInfo.dwNumberOfProcessors;
long best_count = cpu_count;
if (cpu_count>work_count) best_count= work_count;
long newthrcount=best_count - 1 ;
work_threads.resize(newthrcount);
// NOTE(review): if newthrcount==0 (single-CPU host) CreateSemaphore gets
// lMaximumCount==0, which presumably fails -- verify on 1-core machines.
thread_event = CreateSemaphore(0, 0,newthrcount , 0 );
new_thread_event = CreateSemaphore(0, 0,newthrcount , 0 );
for(long i = 0; i < newthrcount; ++ i)
{
work_threads[i].waitfor_event= thread_event;
work_threads[i].state = thrTerminate;
work_threads[i].pool=this ;
work_threads[i].thread_ThreadAffinityMask=1<<(i+1 );
//DWORD thr_id;
work_threads[i].thread_handle =(HANDLE)_beginthread((void (__cdecl *)(void *))thread_dowork, 0, (void*)& work_threads[i]);
//CreateThread(0, 0, (LPTHREAD_START_ROUTINE)thread_dowork,(void*) &work_threads[i], 0, &thr_id);
}
// Pin the calling thread to CPU 0, then spin until every worker has
// reached thrStartup before marking it idle.
SetThreadAffinityMask(GetCurrentThread(),0x01 );
for(long i = 0; i < newthrcount; ++ i)
{
while(true ) {
if (work_threads[i].state == thrStartup) break ;
else Sleep(0 );
}
work_threads[i].state = thrReady;
}
}
// Ask every idle worker to exit, wake them all, wait for thrDeath, then
// release the semaphores.
void free_threads(void )
{
long thr_count=(long )work_threads.size();
long i;
for(i = 0; i <thr_count; ++ i)
{
while(true ) {
if (work_threads[i].state == thrReady) break ;
else Sleep(0 );
}
work_threads[i].state= thrTerminate;
}
if (thr_count>0 )
ReleaseSemaphore(thread_event,thr_count, 0 );
for(i = 0; i <thr_count; ++ i)
{
while(true ) {
if (work_threads[i].state == thrDeath) break ;
else Sleep(0 );
}
}
CloseHandle(thread_event);
CloseHandle(new_thread_event);
work_threads.clear();
}
// Run one batch of at most passel_count() tasks: post work_count-1 tasks to
// workers, run the last one on the calling thread, then busy-wait for all.
void passel_work(TThreadCallBack work_proc,void** word_data_list,int work_count) {
// assert(work_count>=1);
//assert(work_count<=passel_count());
if (work_count==1 )
{
work_proc(word_data_list[0 ]);
}
else
{
long i;
long thr_count=(long )work_threads.size();
for(i = 0; i < work_count-1; ++ i)
{
work_threads[i].func = work_proc;
work_threads[i].pdata = word_data_list[i];
work_threads[i].state = thrBusy;
}
// Surplus workers get a null task so the semaphore release count matches.
for(i = work_count-1; i < thr_count; ++ i)
{
work_threads[i].func = 0 ;
work_threads[i].pdata =0 ;
work_threads[i].state = thrBusy;
}
if (thr_count>0 )
ReleaseSemaphore(thread_event,thr_count, 0 );
//current thread do a work
work_proc(word_data_list[work_count-1 ]);
//wait for work finish
for(i = 0; i <thr_count; ++ i)
{
while(true ) {
if (work_threads[i].state == thrReady) break ;
else Sleep(0 );
}
}
// Workers are now parked on new_thread_event (see DoWorkEnd); swap the
// semaphores so the next pass signals the one they actually wait on.
std::swap(thread_event,new_thread_event);
}
}
public :
explicit TWorkThreadPool(unsigned long work_count):thread_event(0 ),work_threads() {
//assert(work_count>=1);
inti_threads(work_count); }
~ TWorkThreadPool() { free_threads(); }
long best_work_count() const { return passel_count(); }
// Execute work_count tasks, at most passel_count() of them per pass.
void work_execute(TThreadCallBack work_proc,void** word_data_list,int work_count) {
while (work_count>0 )
{
long passel_work_count;
if (work_count>= passel_count())
passel_work_count= passel_count();
else
passel_work_count = work_count;
passel_work(work_proc,word_data_list,passel_work_count);
work_count-= passel_work_count;
word_data_list=& word_data_list[passel_work_count];
}
}
// Called by a worker after each task: move it onto the "next pass"
// semaphore, clear its task and mark it idle.
inline void DoWorkEnd(TWorkThread* thread_data){
thread_data->waitfor_event= new_thread_event;
thread_data->func=0 ;
thread_data->state = thrReady;
}
};
// Out-of-line hop so thread_dowork (declared before the pool class) can hand
// a finished worker back to its owning pool.
void do_work_end(TWorkThread* thread_data)
{
    TWorkThreadPool* owner = thread_data->pool;
    owner->DoWorkEnd(thread_data);
}
//TWorkThreadPool end;
////
// Global pool constructed before main() runs; shared by all computePot calls.
static TWorkThreadPool g_work_thread_pool(DefthreadCount);//thread pool
// One potential-energy pass: split the particle range [0,NPARTS) across the
// thread pool, run computePotPart on each piece, and accumulate the per-task
// partial sums into the global 'pot'. Returns 0 (kept for the contest API).
int computePot() {
	static bool is_inti_work=false ;
	static TWorkData work_list[DefthreadCount];
	static TWorkData* pwork_list[DefthreadCount];
	int i;
	if (! is_inti_work)
	{
		// One-time partition. The inner kernel computePotPart_forj does ~i
		// iterations for particle i, so cumulative work grows like i^2;
		// boundaries at NPARTS*sqrt(k/threads) give each task roughly equal
		// work. (Removed: unused local 'fi' and the inner 'int i' that
		// shadowed the function-scope counter.)
		for (i=0;i<DefthreadCount;++ i)
		{
			if (0== i)
				work_list[i].iBegin=0 ;
			else
				work_list[i].iBegin =work_list[i-1 ].iEnd;
			if (i==DefthreadCount-1 )
				work_list[i].iEnd=(long )NPARTS;
			else
				work_list[i].iEnd =(long)( (double)(NPARTS-1)*sqrt((double)(i+1)/DefthreadCount)+1+0.5 );
			pwork_list[i]=& work_list[i];
		}
		is_inti_work=true ;
	}
	g_work_thread_pool.work_execute((TThreadCallBack)computePotPart,(void **)(&pwork_list[0 ]),DefthreadCount);
	for (i=0;i<DefthreadCount;++ i)
		pot+= work_list[i].fResult;
	return 0 ;
}
代码执行时间 0.125秒 相对于原始代码加速比:3176.0%
注意到一个事实,float比double版快出了很多,why?
原来,double版中为了使用SSE而在float和double之间的转换花费了很多的时间!
我不知道这个问题是AMD64x2 CPU的问题还是在酷睿2上也一样;
double版中为了优化这个转换,我预先保存一份r数组的float转化值计算,这样就能节约double到float转换;
(double版完整源代码参见《英特尔多核平台编码优化大赛的优化过程》)
定义一个临时数组rf:
__declspec(align(16)) float rf[DIMS][(NPARTS+3)/4*4];
修改updatePositions函数:
// (Article fragment: body of the double-precision updatePositions(); the
// enclosing "void updatePositions() {" line is omitted in the article text.)
int i,j;
for (i=0;i<DIMS;++ i)
{
for( j=0; j<NPARTS; ++ j )
{
r[i][j] -=( 0.5 + _my_rand() *(1.0/ RAND_MAX) );
// Maintain a float shadow copy so the SSE kernel below can avoid
// double->float conversion in its hot loop.
rf[i][j]=(float )r[i][j];
}
}
}
然后重新实现computePotPart_forj:
// (Article fragment: double-precision computePotPart_forj; the signature line
// "void computePotPart_forj(int i,__m128d* pResult)" is omitted in the text.
// Distances are computed in double, but the rsqrt seed is produced from the
// float shadow array rf[][], then refined back in double precision.)
{
__m128d lcpot= _mm_setzero_pd();
__m128d _mmi0=_mm_set1_pd(-r[0 ][i]);
__m128d _mmi1=_mm_set1_pd(-r[1 ][i]);
__m128d _mmi2=_mm_set1_pd(-r[2 ][i]);
// Float copies of the broadcasts, fed from the rf shadow array.
__m128 _mmi0f=_mm_set1_ps(-rf[0 ][i]);
__m128 _mmi1f=_mm_set1_ps(-rf[1 ][i]);
__m128 _mmi2f=_mm_set1_ps(-rf[2 ][i]);
int j=0 ;
//*
for(;j+4<i;j+=4 )
{
__m128d dm0=_mm_add_pd(*(__m128d*)&r[0 ][j],_mmi0);
dm0= _mm_mul_pd(dm0,dm0);
__m128d dm1=_mm_add_pd(*(__m128d*)&r[1 ][j],_mmi1);
dm1= _mm_mul_pd(dm1,dm1);
__m128d dm2=_mm_add_pd(*(__m128d*)&r[2 ][j],_mmi2);
dm2= _mm_mul_pd(dm2,dm2);
dm0= _mm_add_pd(dm0,dm1);
dm0= _mm_add_pd(dm0,dm2);
__m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2 ],_mmi0);
dm5= _mm_mul_pd(dm5,dm5);
__m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2 ],_mmi1);
dm6= _mm_mul_pd(dm6,dm6);
dm2=_mm_add_pd(*(__m128d*)&r[2][j+2 ],_mmi2);
dm2= _mm_mul_pd(dm2,dm2);
dm5= _mm_add_pd(dm5,dm6);
dm5= _mm_add_pd(dm5,dm2);
// Approximate 1/sqrt(a) with SSE rsqrt, then refine with Newton-Raphson:
// x_next = (3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5)) * x*x ) * x
{
/* __m128 sm0=_mm_cvtpd_ps(dm0);
__m128 sm1=_mm_cvtpd_ps(dm5);
sm0=_mm_movelh_ps(sm0,sm1);*/
__m128 sm0 =_mm_add_ps(*(__m128*)&rf[0 ][j],_mmi0f);
sm0= _mm_mul_ps(sm0,sm0);
__m128 sm1=_mm_add_ps(*(__m128*)&rf[1 ][j],_mmi1f);
sm1= _mm_mul_ps(sm1,sm1);
__m128 sm2=_mm_add_ps(*(__m128*)&rf[2 ][j],_mmi2f);
sm2= _mm_mul_ps(sm2,sm2);
sm0= _mm_add_ps(sm0,sm1);
sm0= _mm_add_ps(sm0,sm2);
__m128 sma=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
sm0=_mm_rsqrt_ps(sm0); // approximate 1/sqrt(a)
// Newton iteration in float to sharpen the estimate.
sma= _mm_mul_ps(sma,sm0);
sma= _mm_mul_ps(sma,sm0);
sma= _mm_add_ps(sma,xmms1_5);
sm0= _mm_mul_ps(sm0,sma);
__m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
__m128d dmb= _mm_mul_pd(dm5,xmmd_0_5);
sm1= _mm_movehl_ps(sm1,sm0);
dm0=_mm_cvtps_pd(sm0); //
dm5=_mm_cvtps_pd(sm1); //
// Second iteration, now in double, to roughly double the precision.
dma= _mm_mul_pd(dma,dm0);
dmb= _mm_mul_pd(dmb,dm5);
dma= _mm_mul_pd(dma,dm0);
dmb= _mm_mul_pd(dmb,dm5);
dma= _mm_add_pd(dma,xmmd1_5);
dmb= _mm_add_pd(dmb,xmmd1_5);
dm0= _mm_mul_pd(dm0,dma);
dm5= _mm_mul_pd(dm5,dmb);
}
lcpot= _mm_add_pd(lcpot,dm0);
lcpot= _mm_add_pd(lcpot,dm5);
}
// Scalar tail (lane 0 only).
for (;j+1<i;++ j)
{
__m128d dm0=_mm_set_sd(r[0 ][j]);
dm0= _mm_add_pd(dm0,_mmi0);
dm0= _mm_mul_pd(dm0,dm0);
__m128d dm1=_mm_set_sd(r[1 ][j]);
dm1= _mm_add_sd(dm1,_mmi1);
dm1= _mm_mul_sd(dm1,dm1);
__m128d dm2=_mm_set_sd(r[2 ][j]);
dm2= _mm_add_sd(dm2,_mmi2);
dm2= _mm_mul_sd(dm2,dm2);
dm0= _mm_add_sd(dm0,dm1);
dm0= _mm_add_sd(dm0,dm2);
{
__m128 sm0= _mm_cvtpd_ps(dm0);
__m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
sm0=_mm_rsqrt_ss(sm0); // approximate 1/sqrt(a)
dm0=_mm_cvtps_pd(sm0); //
// Newton iteration to refine the rsqrt estimate.
dm1= _mm_mul_sd(dm0,dm0);
dm1= _mm_mul_sd(dm1,dma);
dm1= _mm_add_sd(dm1,xmmd1_5);
dm0= _mm_mul_sd(dm0,dm1);
// Second iteration.
dma= _mm_mul_sd(dma,dm0);
dma= _mm_mul_sd(dma,dm0);
dma= _mm_add_sd(dma,xmmd1_5);
dm0= _mm_mul_sd(dm0,dma);
}
lcpot= _mm_add_sd(lcpot,dm0);
}
*pResult=_mm_add_pd(* pResult,lcpot);
}
该代码对速度有很小的改进,但精度却只到小数点后5/6位(过早地降低了运算精度,后面的计算使误差放大),
再增加一次牛顿叠代就能弥补精度的缺失,但速度上就没有优势了;只能放弃该方案;
既然硬件的float和double之间的转换慢,那我手工来写一个会不会更好一些呢? (比较奇怪的尝试:)
见代码:
(该函数利用了double/float的IEEE浮点编码结构)
// Exponent-bias constant for the manual double->float bit conversion below.
// NOTE(review): _mmd2f_esub is defined in the main article, not in this
// excerpt -- confirm its value against the full source.
const __m128i _xmmd2f_esub= _mm_set_epi64(_mmd2f_esub,_mmd2f_esub);
// Variant that converts double->float by hand, exploiting the IEEE-754 bit
// layouts (exponent rebias + shifts) instead of cvtpd2ps, runs rsqrt in
// float, then converts back and refines in double.
void computePotPart_forj_d2f(int i,__m128d* pResult)
{
__m128d lcpot= _mm_setzero_pd();
__m128d _mmi0=_mm_set1_pd(-r[0 ][i]);
__m128d _mmi1=_mm_set1_pd(-r[1 ][i]);
__m128d _mmi2=_mm_set1_pd(-r[2 ][i]);
int j=0 ;
//*
for(;j+4<i;j+=4 )
{
__m128d dm0=_mm_add_pd(*(__m128d*)&r[0 ][j],_mmi0);
dm0= _mm_mul_pd(dm0,dm0);
__m128d dm1=_mm_add_pd(*(__m128d*)&r[1 ][j],_mmi1);
dm1= _mm_mul_pd(dm1,dm1);
__m128d dm2=_mm_add_pd(*(__m128d*)&r[2 ][j],_mmi2);
dm2= _mm_mul_pd(dm2,dm2);
dm0= _mm_add_pd(dm0,dm1);
dm0= _mm_add_pd(dm0,dm2);
__m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2 ],_mmi0);
dm5= _mm_mul_pd(dm5,dm5);
__m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2 ],_mmi1);
dm6= _mm_mul_pd(dm6,dm6);
dm2=_mm_add_pd(*(__m128d*)&r[2][j+2 ],_mmi2);
dm2= _mm_mul_pd(dm2,dm2);
dm5= _mm_add_pd(dm5,dm6);
dm5= _mm_add_pd(dm5,dm2);
// Approximate 1/sqrt(a) with SSE rsqrt, then refine with Newton-Raphson:
// x_next = (3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5)) * x*x ) * x
{
__m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
__m128d dmb= _mm_mul_pd(dm5,xmmd_0_5);
// Manual double->float: rebias the exponent, shift the bit pattern into
// float position, and pack both vectors into one __m128.
(*(__m128i*)&dm0)=_mm_sub_epi64((*(__m128i*)& dm0),_xmmd2f_esub);
(*(__m128i*)&dm5)=_mm_sub_epi64((*(__m128i*)& dm5),_xmmd2f_esub);
(*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&dm0,32-3 );
(*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,32-3 );
(*(__m128i*)&dm0)=_mm_slli_epi64(*(__m128i*)&dm0,32 );
__m128 sm0;
(*(__m128i*)&sm0)=_mm_xor_si128(*(__m128i*)&dm0,*(__m128i*)& dm5);
__m128 sma=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
sm0=_mm_rsqrt_ps(sm0); // approximate 1/sqrt(a)
// Newton iteration in float.
sma= _mm_mul_ps(sma,sm0);
sma= _mm_mul_ps(sma,sm0);
sma= _mm_add_ps(sma,xmms1_5);
sm0= _mm_mul_ps(sm0,sma);
// Manual float->double: unpack and shift back, re-apply the exponent bias.
(*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&sm0,3); //
(*(__m128i*)&dm5)=_mm_slli_epi64(*(__m128i*)&sm0,32); //
(*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,3); //
(*(__m128i*)&dm0)=_mm_add_epi64((*(__m128i*)& dm0),_xmmd2f_esub);
(*(__m128i*)&dm5)=_mm_add_epi64((*(__m128i*)& dm5),_xmmd2f_esub);
// Second iteration, in double, to roughly double the precision.
dma= _mm_mul_pd(dma,dm0);
dmb= _mm_mul_pd(dmb,dm5);
dma= _mm_mul_pd(dma,dm0);
dmb= _mm_mul_pd(dmb,dm5);
dma= _mm_add_pd(dma,xmmd1_5);
dmb= _mm_add_pd(dmb,xmmd1_5);
dm0= _mm_mul_pd(dm0,dma);
dm5= _mm_mul_pd(dm5,dmb);
}
lcpot= _mm_add_pd(lcpot,dm0);
lcpot= _mm_add_pd(lcpot,dm5);
}
// Scalar tail (lane 0 only), using the normal cvt path.
for (;j+1<i;++ j)
{
__m128d dm0=_mm_set_sd(r[0 ][j]);
dm0= _mm_add_pd(dm0,_mmi0);
dm0= _mm_mul_pd(dm0,dm0);
__m128d dm1=_mm_set_sd(r[1 ][j]);
dm1= _mm_add_sd(dm1,_mmi1);
dm1= _mm_mul_sd(dm1,dm1);
__m128d dm2=_mm_set_sd(r[2 ][j]);
dm2= _mm_add_sd(dm2,_mmi2);
dm2= _mm_mul_sd(dm2,dm2);
dm0= _mm_add_sd(dm0,dm1);
dm0= _mm_add_sd(dm0,dm2);
{
__m128 sm0= _mm_cvtpd_ps(dm0);
__m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
sm0=_mm_rsqrt_ss(sm0); // approximate 1/sqrt(a)
dm0=_mm_cvtps_pd(sm0); //
// Newton iteration to refine the rsqrt estimate.
dm1= _mm_mul_sd(dm0,dm0);
dm1= _mm_mul_sd(dm1,dma);
dm1= _mm_add_sd(dm1,xmmd1_5);
dm0= _mm_mul_sd(dm0,dm1);
// Second iteration.
dma= _mm_mul_sd(dma,dm0);
dma= _mm_mul_sd(dma,dm0);
dma= _mm_add_sd(dma,xmmd1_5);
dm0= _mm_mul_sd(dm0,dma);
}
lcpot= _mm_add_sd(lcpot,dm0);
}
*pResult=_mm_add_pd(* pResult,lcpot);
}
该函数的速度比原来的代码稍慢:) 放弃之
(我还尝试过这样的代码(代码没有保存:( )
代码原理和上面的很接近,利用IEEE的浮点格式,强制把double转成float后(通过指数平衡和移位),
分两路使用_mm_rsqrt_ps、牛顿叠代等;但这个慢慢,也放弃了)
既然硬件的float和double之间的转换慢,那我就不转换来看看怎样实现;
不使用SSE的_mm_rsqrt_ps指令,而利用IEEE浮点格式生成一个粗略的近似解,然后迭代(迭代了3次);
(也可以用查表的方式来得到初始解,但在SSE体系中,这种实现很可能得不偿失,所以没有去实现);(该函数利用了double的IEEE浮点编码结构)这样以后,速度有了一些提高,但精度还有点不够(大概有6位小数位精度),再次迭代的话就失去了速度优势;
不知道有没有比魔法数0x5fe6ec85,0xe7de30da更好的魔法数:)
放弃;
// Magic-number seed for 1/sqrt on packed doubles (64-bit analogue of the
// classic 0x5f3759df trick). NOTE(review): _mmi_mn is defined in the main
// article, not in this excerpt; presumably the 0x5fe6ec85e7de30da constant
// mentioned in the surrounding text -- confirm against the full source.
const __m128i xmmi64_mn= _mm_set1_epi64(_mmi_mn);
// Kernel variant that never converts double<->float: the initial 1/sqrt(a)
// estimate comes from the IEEE-754 bit pattern (x0 = magic - (bits(a)>>1)),
// then three Newton-Raphson iterations refine it in double precision.
void computePotPart_forj_int(int i,__m128d* pResult)
{
	__m128d lcpot= _mm_setzero_pd();
	__m128d _mmi0=_mm_set1_pd(-r[0 ][i]);
	__m128d _mmi1=_mm_set1_pd(-r[1 ][i]);
	__m128d _mmi2=_mm_set1_pd(-r[2 ][i]);
	int j=0 ;
	// Unrolled loop: two __m128d (4 doubles) per step.
	for(;j+4<i;j+=4 )
	{
		__m128d dm0=_mm_add_pd(*(__m128d*)&r[0 ][j],_mmi0);
		dm0= _mm_mul_pd(dm0,dm0);
		__m128d dm1=_mm_add_pd(*(__m128d*)&r[1 ][j],_mmi1);
		dm1= _mm_mul_pd(dm1,dm1);
		__m128d dm2=_mm_add_pd(*(__m128d*)&r[2 ][j],_mmi2);
		dm2= _mm_mul_pd(dm2,dm2);
		dm0= _mm_add_pd(dm0,dm1);
		dm0= _mm_add_pd(dm0,dm2);
		__m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2 ],_mmi0);
		dm5= _mm_mul_pd(dm5,dm5);
		__m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2 ],_mmi1);
		dm6= _mm_mul_pd(dm6,dm6);
		dm2=_mm_add_pd(*(__m128d*)&r[2][j+2 ],_mmi2);
		dm2= _mm_mul_pd(dm2,dm2);
		dm5= _mm_add_pd(dm5,dm6);
		dm5= _mm_add_pd(dm5,dm2);
		// Bit-trick seed for 1/sqrt(a), then Newton-Raphson iterations:
		// x_next = (3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5)) * x*x ) * x
		{
			__m128i xmmi0= xmmi64_mn;
			__m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
			__m128d dmb= _mm_mul_pd(dm5,xmmd_0_5);
			*(__m128i*)&dm0=_mm_srli_epi64(*(__m128i*)&dm0,1 );
			*(__m128i*)&dm5=_mm_srli_epi64(*(__m128i*)&dm5,1 );
			*(__m128i*)&dm0=_mm_sub_epi64(xmmi0,*(__m128i*)& dm0);
			*(__m128i*)&dm5=_mm_sub_epi64(xmmi0,*(__m128i*)& dm5);
			// Newton iteration #1.
			dm1 = _mm_mul_pd(dma,dm0);
			dm2= _mm_mul_pd(dmb,dm5);
			dm1= _mm_mul_pd(dm1,dm0);
			dm2= _mm_mul_pd(dm2,dm5);
			dm1= _mm_add_pd(dm1,xmmd1_5);
			dm2= _mm_add_pd(dm2,xmmd1_5);
			dm0= _mm_mul_pd(dm0,dm1);
			dm5= _mm_mul_pd(dm5,dm2);
			// Newton iteration #2.
			dm1= _mm_mul_pd(dma,dm0);
			dm2= _mm_mul_pd(dmb,dm5);
			dm1= _mm_mul_pd(dm1,dm0);
			dm2= _mm_mul_pd(dm2,dm5);
			dm1= _mm_add_pd(dm1,xmmd1_5);
			dm2= _mm_add_pd(dm2,xmmd1_5);
			dm0= _mm_mul_pd(dm0,dm1);
			dm5= _mm_mul_pd(dm5,dm2);
			// Newton iteration #3.
			dma= _mm_mul_pd(dma,dm0);
			dmb= _mm_mul_pd(dmb,dm5);
			dma= _mm_mul_pd(dma,dm0);
			dmb= _mm_mul_pd(dmb,dm5);
			dma= _mm_add_pd(dma,xmmd1_5);
			dmb= _mm_add_pd(dmb,xmmd1_5);
			dm0= _mm_mul_pd(dm0,dma);
			dm5= _mm_mul_pd(dm5,dmb);
			// Still only ~6 decimal digits here; a fourth iteration would
			// restore full precision but lose the speed advantage.
		}
		lcpot= _mm_add_pd(lcpot,dm0);
		lcpot= _mm_add_pd(lcpot,dm5);
	}
	// Scalar tail (lane 0 only), seeded via cvt + rsqrt_ss as in the other
	// kernels, with two Newton iterations.
	for (;j+1<i;++j)
	{
		__m128d dm0=_mm_set_sd(r[0][j]);
		dm0=_mm_add_pd(dm0,_mmi0);
		dm0=_mm_mul_pd(dm0,dm0);
		__m128d dm1=_mm_set_sd(r[1][j]);
		dm1=_mm_add_sd(dm1,_mmi1);
		dm1=_mm_mul_sd(dm1,dm1);
		__m128d dm2=_mm_set_sd(r[2][j]);
		dm2=_mm_add_sd(dm2,_mmi2);
		dm2=_mm_mul_sd(dm2,dm2);
		dm0=_mm_add_sd(dm0,dm1);
		dm0=_mm_add_sd(dm0,dm2);
		__m128 sm0=_mm_cvtpd_ps(dm0);
		__m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
		sm0=_mm_rsqrt_ss(sm0); // approximate 1/sqrt(a)
		dm0=_mm_cvtps_pd(sm0); //
		// Newton iteration to refine the rsqrt estimate.
		dm1=_mm_mul_sd(dm0,dm0);
		dm1=_mm_mul_sd(dm1,dma);
		dm1=_mm_add_sd(dm1,xmmd1_5);
		dm0=_mm_mul_sd(dm0,dm1);
		// Second iteration.
		dma=_mm_mul_sd(dma,dm0);
		dma=_mm_mul_sd(dma,dm0);
		dma=_mm_add_sd(dma,xmmd1_5);
		dm0=_mm_mul_sd(dm0,dma);
		// BUGFIX: in the article text this accumulation sat outside the loop
		// and an extra '}' followed it, which does not even compile; each
		// tail element must be folded into the running sum here.
		lcpot= _mm_add_sd(lcpot,dm0);
	}
	*pResult=_mm_add_pd(* pResult,lcpot);
}
还想到一个没有时间去实现的方案:由于SSE和x87是独立的两个硬件,那么可以使它们并行执行;
SSE部件代码不变,但循环做更大的展开,然后让x87承担一路或两路运算;
完