The Optimization Process in the Intel Multi-Core Platform Coding Optimization Contest -- Addendum

HouSisong@GMail.com  2007.01.20

tags: multi-core programming, SSE2, Newton iteration, code optimization, optimization contest, invsqrt, square root

For the main article, see "The Optimization Process in the Intel Multi-Core Platform Coding Optimization Contest": http://blog.csdn.net/housisong/archive/2007/01/20/1488465.aspx
This article is its supplement. It provides a complete float implementation, a manual double-to-float conversion, a hand-built coarse initial guess for the invSqrt iteration, and a few other attempts that did not quite pan out.

    CPU used for testing and tuning: AMD64x2 3600+ (dual-core)
                   Operating system: Windows XP 32bit
                           Compiler: Visual Studio 2005

Execution time of the contest's original code: 3.97 seconds


A complete float implementation (trading away some precision); source code below:
(an assembly implementation, or the ICC compiler, should be able to squeeze out a bit more speed)

/* compute the potential energy of a collection of */
/* particles interacting via pairwise potential    */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>
#include <xmmintrin.h> // SSE1
#include <process.h>
#include <vector>

////

// Author: HouSisong  HouSisong@263.net
// Version whose results have float (single) precision.

#define _IS_FAST  // trade precision for speed; without it, results reach full float precision

////

//#define _NEW_TIME_CLOCK

#ifdef _NEW_TIME_CLOCK
    #define clock_t double
    double CLOCKS_PER_SEC=0.0;
    inline double clock() {
        __int64 result;
        if (CLOCKS_PER_SEC==0)
        {
            QueryPerformanceFrequency((LARGE_INTEGER*)&result);
            CLOCKS_PER_SEC=(double)result;
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&result);
        return (double)result;
    }
#else
  #include <time.h>
#endif

#define _IS_USES_MY_RAND  // rand() is only called from a single thread, so a custom rand is safe

const long DefthreadCount=2; // 1,2,4,8,16,..  split the computation into this many parallel tasks

float& m128_value(__m128& x,const long index) { return ((float*)(&x))[index]; }

#define NPARTS 1000
#define NITER 201
#define DIMS 3

#ifdef _IS_USES_MY_RAND
    class CMyRand
    {
    private:
        unsigned long _my_holdrand;
    public:
        CMyRand():_my_holdrand(1){}
        inline int _my_rand(void)
        {
            unsigned long result=_my_holdrand*214013 + 2531011;
            _my_holdrand=result;
            return ( (result>>16) & RAND_MAX );
        }
    };
    CMyRand _MyRand;
    inline int _my_rand(void){ return _MyRand._my_rand(); }
#else
    #define _my_rand rand
#endif

int rand( void );
int computePot();
void initPositions(void);
void updatePositions(void);

__declspec(align(16)) float r[DIMS][(NPARTS+3)/4*4];  // 16-byte aligned

double pot;

int main() {
   int i;
   clock_t start, stop;
   //char ctmp; std::cin>>ctmp;

   initPositions();
   updatePositions();

   start = clock();
   for( i=0; i<NITER; i++ ) {
      pot = 0.0;
      computePot();
      if (i%10 == 0)
      {
         printf("%5d: Potential: %20.7f\n", i, pot);
      }
      updatePositions();
   }
   pot = 0.0;
   stop = clock();
   printf("Seconds = %10.9f\n",(double)(stop-start)/CLOCKS_PER_SEC);

   return 0;
}
 
 
void initPositions() {
   int i, j;
   for( i=0; i<DIMS; i++ ){
      for( j=0; j<NPARTS; j++ )
         r[i][j] = (float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
   }
}

void updatePositions() {
   int i,j;
   for (i=0;i<DIMS;++i)
   {
      for( j=0; j<NPARTS; ++j )
      {
         r[i][j] -= (float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
      }
   }
}

struct TWorkData
{
    long    iBegin;
    long    iEnd;
    double  fResult;
};

const __m128 xmms1_5 ={ (1.5f),(1.5f),(1.5f),(1.5f) };
const __m128 xmms_0_5={ (-0.5f),(-0.5f),(-0.5f),(-0.5f) };

void computePotPart_forj(int i,__m128* pResult)
{
      __m128 lcpot=_mm_setzero_ps();
      __m128 _mmi0=_mm_set1_ps(-r[0][i]);
      __m128 _mmi1=_mm_set1_ps(-r[1][i]);
      __m128 _mmi2=_mm_set1_ps(-r[2][i]);
      int j=0;
      //for( j=0; j<i-1; j++ ) {   // "j<i-1" looks odd; I suspect it should be "j<i"!
      //* unroll this loop 4x
      for(;j+4<i;j+=4)
      {
          __m128 sm0=_mm_add_ps(*(__m128*)&r[0][j],_mmi0);
          sm0=_mm_mul_ps(sm0,sm0);
          __m128 sm1=_mm_add_ps(*(__m128*)&r[1][j],_mmi1);
          sm1=_mm_mul_ps(sm1,sm1);
          __m128 sm2=_mm_add_ps(*(__m128*)&r[2][j],_mmi2);
          sm2=_mm_mul_ps(sm2,sm2);
          sm0=_mm_add_ps(sm0,sm1);
          sm0=_mm_add_ps(sm0,sm2);

          sm1=_mm_rsqrt_ps(sm0);  // 1/sqrt(...)

          #ifndef _IS_FAST
            // Newton iteration to improve the rsqrt precision
            // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
            sm0=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_add_ps(sm0,xmms1_5);
            sm0=_mm_mul_ps(sm0,sm1);
          #else
            sm0=sm1;
          #endif

          lcpot=_mm_add_ps(lcpot,sm0);
      }
      //*/
      for(;j<i-1;++j)
      {
          __m128 sm0=_mm_set_ss(r[0][j]);
          sm0=_mm_add_ss(sm0,_mmi0);
          sm0=_mm_mul_ss(sm0,sm0);
          __m128 sm1=_mm_set_ss(r[1][j]);
          sm1=_mm_add_ss(sm1,_mmi1);
          sm1=_mm_mul_ss(sm1,sm1);
          __m128 sm2=_mm_set_ss(r[2][j]);
          sm2=_mm_add_ss(sm2,_mmi2);
          sm2=_mm_mul_ss(sm2,sm2);
          sm0=_mm_add_ss(sm0,sm1);
          sm0=_mm_add_ss(sm0,sm2);

          sm1=_mm_rsqrt_ss(sm0);
          #ifndef _IS_FAST
            // Newton iteration to improve the rsqrt precision
            // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
            sm0=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_add_ps(sm0,xmms1_5);
            sm0=_mm_mul_ps(sm0,sm1);
          #else
            sm0=sm1;
          #endif

          lcpot=_mm_add_ss(lcpot,sm0);
      }
      *pResult=_mm_add_ps(*pResult,lcpot);
}

void computePotPart(TWorkData* work_data) {
   int i;
   __m128 lcpot=_mm_setzero_ps();

   // #pragma omp parallel for schedule(static)
   for( i=work_data->iBegin; i<work_data->iEnd; i++ ) {
      computePotPart_forj(i,&lcpot);
   }

   // horizontal sum of the four float lanes in lcpot
   __m128 dt0;
   dt0=_mm_movehl_ps(lcpot,lcpot);
   lcpot=_mm_add_ps(lcpot,dt0);
   dt0=_mm_shuffle_ps(lcpot,lcpot,1);
   lcpot=_mm_add_ss(lcpot,dt0);

   work_data->fResult=m128_value(lcpot,0);
}

////

// Worker thread pool: TWorkThreadPool
// splits one task into several per-thread tasks;
// each small task is expected to cost about the same amount of work.
// todo: switch to a model where threads claim work items themselves

class TWorkThreadPool;

typedef void (*TThreadCallBack)(void* pData);
enum TThreadState{ thrStartup=0, thrReady, thrBusy, thrTerminate, thrDeath };

class TWorkThread
{
public:
    volatile HANDLE             thread_handle;
    volatile enum TThreadState  state;
    volatile TThreadCallBack    func;
    volatile void*              pdata;  //work data
    volatile HANDLE             waitfor_event;
    TWorkThreadPool*            pool;
    volatile DWORD              thread_ThreadAffinityMask;

    TWorkThread() { memset(this,0,sizeof(TWorkThread)); }
};

void do_work_end(TWorkThread* thread_data);

void __cdecl thread_dowork(TWorkThread* thread_data) //void __stdcall thread_dowork(TWorkThread* thread_data)
{
    volatile TThreadState& state=thread_data->state;
    SetThreadAffinityMask(GetCurrentThread(),thread_data->thread_ThreadAffinityMask);
    state = thrStartup;

    while(true)
    {
        WaitForSingleObject(thread_data->waitfor_event, -1);
        if(state == thrTerminate)
            break;

        state = thrBusy;
        volatile TThreadCallBack& func=thread_data->func;
        if (func!=0)
            func((void*)thread_data->pdata);
        do_work_end(thread_data);
    }
    state = thrDeath;
    _endthread();//ExitThread(0);
}

class TWorkThreadPool
{
private:
    volatile HANDLE           thread_event;
    volatile HANDLE           new_thread_event;
    std::vector<TWorkThread>  work_threads;
    inline int passel_count() const { return (int)work_threads.size()+1; }
    void inti_threads(long work_count) {
        SYSTEM_INFO SystemInfo;
        GetSystemInfo(&SystemInfo);
        long cpu_count = SystemInfo.dwNumberOfProcessors;
        long best_count = cpu_count;
        if (cpu_count>work_count) best_count=work_count;

        long newthrcount=best_count-1;
        work_threads.resize(newthrcount);
        thread_event = CreateSemaphore(0, 0, newthrcount, 0);
        new_thread_event = CreateSemaphore(0, 0, newthrcount, 0);
        for(long i = 0; i < newthrcount; ++i)
        {
            work_threads[i].waitfor_event=thread_event;
            work_threads[i].state = thrTerminate;
            work_threads[i].pool=this;
            work_threads[i].thread_ThreadAffinityMask=1<<(i+1);
            //DWORD thr_id;
            work_threads[i].thread_handle =(HANDLE)_beginthread((void (__cdecl *)(void *))thread_dowork, 0, (void*)&work_threads[i]);
                //CreateThread(0, 0, (LPTHREAD_START_ROUTINE)thread_dowork,(void*) &work_threads[i], 0, &thr_id);
        }
        SetThreadAffinityMask(GetCurrentThread(),0x01);
        for(long i = 0; i < newthrcount; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrStartup) break;
                else Sleep(0);
            }
            work_threads[i].state = thrReady;
        }
    }
    void free_threads(void)
    {
        long thr_count=(long)work_threads.size();
        long i;
        for(i = 0; i <thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrReady) break;
                else Sleep(0);
            }
            work_threads[i].state=thrTerminate;
        }
        if (thr_count>0)
            ReleaseSemaphore(thread_event,thr_count, 0);
        for(i = 0; i <thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrDeath) break;
                else Sleep(0);
            }
        }
        CloseHandle(thread_event);
        CloseHandle(new_thread_event);
        work_threads.clear();
    }
    void passel_work(TThreadCallBack work_proc,void** word_data_list,int work_count) {
        //assert(work_count>=1);
        //assert(work_count<=passel_count());
        if (work_count==1)
        {
            work_proc(word_data_list[0]);
        }
        else
        {
            long i;
            long thr_count=(long)work_threads.size();
            for(i = 0; i < work_count-1; ++i)
            {
                work_threads[i].func  = work_proc;
                work_threads[i].pdata = word_data_list[i];
                work_threads[i].state = thrBusy;
            }
            for(i = work_count-1; i < thr_count; ++i)
            {
                work_threads[i].func  = 0;
                work_threads[i].pdata = 0;
                work_threads[i].state = thrBusy;
            }
            if (thr_count>0)
                ReleaseSemaphore(thread_event,thr_count, 0);

            //the current thread does one work item as well
            work_proc(word_data_list[work_count-1]);

            //wait for the work to finish
            for(i = 0; i <thr_count; ++i)
            {
                while(true) {
                    if (work_threads[i].state == thrReady) break;
                    else Sleep(0);
                }
            }
            std::swap(thread_event,new_thread_event);
        }
    }
public:
    explicit TWorkThreadPool(unsigned long work_count):thread_event(0),work_threads() {
        //assert(work_count>=1);
        inti_threads(work_count);
    }
    ~TWorkThreadPool() { free_threads(); }
    long best_work_count() const { return passel_count(); }
    void work_execute(TThreadCallBack work_proc,void** word_data_list,int work_count) {
        while (work_count>0)
        {
            long passel_work_count;
            if (work_count>=passel_count())
                passel_work_count=passel_count();
            else
                passel_work_count=work_count;

            passel_work(work_proc,word_data_list,passel_work_count);

            work_count-=passel_work_count;
            word_data_list=&word_data_list[passel_work_count];
        }
    }
    inline void DoWorkEnd(TWorkThread* thread_data){
        thread_data->waitfor_event=new_thread_event;
        thread_data->func=0;
        thread_data->state = thrReady;
    }
};
void do_work_end(TWorkThread* thread_data)
{
    thread_data->pool->DoWorkEnd(thread_data);
}
//TWorkThreadPool end;

////

static TWorkThreadPool g_work_thread_pool(DefthreadCount); // thread pool

int computePot() {
    static bool is_inti_work=false;
    static TWorkData  work_list[DefthreadCount];
    static TWorkData* pwork_list[DefthreadCount];

    int i;
    if (!is_inti_work)
    {
        for (int i=0;i<DefthreadCount;++i)
        {
            if (0==i)
                work_list[i].iBegin=0;
            else
                work_list[i].iBegin=work_list[i-1].iEnd;
            // boundaries are sqrt-spaced: the inner j loop runs about i steps,
            // so the work up to a boundary b grows like b*b, and sqrt spacing
            // gives each task a roughly equal share of that triangular workload
            if (i==DefthreadCount-1)
                work_list[i].iEnd=(long)NPARTS;
            else
                work_list[i].iEnd=(long)( (double)(NPARTS-1)*sqrt((double)(i+1)/DefthreadCount)+1+0.5 );
            pwork_list[i]=&work_list[i];
        }
        is_inti_work=true;
    }

    g_work_thread_pool.work_execute((TThreadCallBack)computePotPart,(void**)(&pwork_list[0]),DefthreadCount);

    for (i=0;i<DefthreadCount;++i)
       pot+=work_list[i].fResult;

    return 0;
}

Execution time of this code: 0.125 seconds    (speedup over the original code: 3176.0%)
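A side note on the Newton refinement used throughout: one step roughly doubles the number of correct bits delivered by the ~12-bit rsqrt approximation. Here is a minimal scalar sketch of the same step (my own illustration, not part of the contest code):

#include <cmath>
#include <cstdio>

// One Newton step for f(x)=1/x^2-a, refining x ~= 1/sqrt(a):
// x_next = ( 1.5 + (a*(-0.5))*x*x ) * x   -- the same form as the SSE code
static float refine_rsqrt(float a, float x)
{
    return (1.5f + (a*-0.5f)*x*x) * x;
}

int main()
{
    float a = 2.0f;
    float x = 0.70f;                 // coarse guess, comparable to rsqrtps output
    printf("exact   %.9f\n", 1.0/sqrt(2.0));
    printf("guess   %.9f\n", x);
    printf("1 step  %.9f\n", refine_rsqrt(a, x));
    return 0;
}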


Notice a curious fact: the float version is much faster than the double version. Why?
It turns out the double version spends a lot of time converting between float and double just so it can use SSE!
I don't know whether this is specific to the AMD64x2 CPU or whether a Core 2 behaves the same way.
To optimize away that conversion in the double version, I keep a pre-computed float copy of the r array, which saves the double-to-float conversions in the inner loop.
(For the complete double-version source, see "The Optimization Process in the Intel Multi-Core Platform Coding Optimization Contest".)
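To make the cost concrete: without the cached float copy, each group of four distances would have to be packed down to float and unpacked back to double around the rsqrtps, roughly like this (an illustrative SSE2 fragment of mine, not code from the contest):

#include <emmintrin.h> // SSE2

// pack 2+2 doubles into 4 floats (the conversion the rf[][] cache removes)
static __m128 pack4(__m128d lo, __m128d hi)
{
    return _mm_movelh_ps(_mm_cvtpd_ps(lo), _mm_cvtpd_ps(hi));
}
// unpack 4 floats back into 2+2 doubles (still needed after the float rsqrt)
static void unpack4(__m128 s, __m128d* lo, __m128d* hi)
{
    *lo = _mm_cvtps_pd(s);
    *hi = _mm_cvtps_pd(_mm_movehl_ps(s, s));
}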
Define a temporary array rf:
  __declspec(align(16)) float rf[DIMS][(NPARTS+3)/4*4];
and modify the updatePositions function:

void updatePositions() {
   int i,j;
   for (i=0;i<DIMS;++i)
   {
      for( j=0; j<NPARTS; ++j )
      {
         r[i][j] -= ( 0.5 + _my_rand()*(1.0/RAND_MAX) );
         rf[i][j]=(float)r[i][j];
      }
   }
}


Then re-implement computePotPart_forj:

// xmmd1_5/xmmd_0_5 are the double-precision counterparts of xmms1_5/xmms_0_5,
// defined in the main article's double version
void computePotPart_forj_double_float(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      __m128 _mmi0f=_mm_set1_ps(-rf[0][i]);
      __m128 _mmi1f=_mm_set1_ps(-rf[1][i]);
      __m128 _mmi2f=_mm_set1_ps(-rf[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // approximate 1/sqrt(a) with SSE's rsqrt, then refine with Newton iterations
          // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
          {
             /* __m128 sm0=_mm_cvtpd_ps(dm0);
                __m128 sm1=_mm_cvtpd_ps(dm5);
                sm0=_mm_movelh_ps(sm0,sm1); */

              // recompute the squared distances in float from the cached rf copy,
              // instead of converting dm0/dm5 down from double
              __m128 sm0=_mm_add_ps(*(__m128*)&rf[0][j],_mmi0f);
              sm0=_mm_mul_ps(sm0,sm0);
              __m128 sm1=_mm_add_ps(*(__m128*)&rf[1][j],_mmi1f);
              sm1=_mm_mul_ps(sm1,sm1);
              __m128 sm2=_mm_add_ps(*(__m128*)&rf[2][j],_mmi2f);
              sm2=_mm_mul_ps(sm2,sm2);
              sm0=_mm_add_ps(sm0,sm1);
              sm0=_mm_add_ps(sm0,sm2);

              __m128 sma=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
              sm0=_mm_rsqrt_ps(sm0); // compute 1/sqrt(a)
              // Newton iteration in float to improve the rsqrt precision
              sma=_mm_mul_ps(sma,sm0);
              sma=_mm_mul_ps(sma,sm0);
              sma=_mm_add_ps(sma,xmms1_5);
              sm0=_mm_mul_ps(sm0,sma);

              __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
              __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);

              sm1=_mm_movehl_ps(sm1,sm0);
              dm0=_mm_cvtps_pd(sm0);
              dm5=_mm_cvtps_pd(sm1);

              // iterate again in double to double the precision
              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_add_pd(dma,xmmd1_5);
              dmb=_mm_add_pd(dmb,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dma);
              dm5=_mm_mul_pd(dm5,dmb);
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for (;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);

          {
              __m128 sm0=_mm_cvtpd_ps(dm0);
              __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
              sm0=_mm_rsqrt_ss(sm0); // compute 1/sqrt(a)
              dm0=_mm_cvtps_pd(sm0);
              // Newton iteration to improve the rsqrt precision
              dm1=_mm_mul_sd(dm0,dm0);
              dm1=_mm_mul_sd(dm1,dma);
              dm1=_mm_add_sd(dm1,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dm1);
              // iterate once more
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_add_sd(dma,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

This code improves speed slightly, but the results are only accurate to 5 or 6 decimal places (lowering the working precision too early lets the later computation amplify the error). One more Newton iteration would recover the lost precision, but then the speed advantage is gone, so this scheme had to be abandoned.


Since the hardware conversion between float and double is slow, would a hand-written conversion do better? (An admittedly odd experiment:)
See the code below.
(This function exploits the IEEE floating-point encoding of double/float.)

const __m64   _mmd2f_esub =_mm_set_pi32((1023-127)<<(52-32), 0);
const __m128i _xmmd2f_esub=_mm_set_epi64(_mmd2f_esub,_mmd2f_esub);
void computePotPart_forj_d2f(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // approximate 1/sqrt(a) with SSE's rsqrt, then refine with Newton iterations
          // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
          {
            __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
            __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);

            // convert the two double pairs to four floats by hand:
            // rebias the exponent, then shift the mantissa into float position
            (*(__m128i*)&dm0)=_mm_sub_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_sub_epi64((*(__m128i*)&dm5),_xmmd2f_esub);

            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&dm0,32-3);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,32-3);
            (*(__m128i*)&dm0)=_mm_slli_epi64(*(__m128i*)&dm0,32);
            __m128 sm0;
            (*(__m128i*)&sm0)=_mm_xor_si128(*(__m128i*)&dm0,*(__m128i*)&dm5);

            __m128 sma=_mm_mul_ps(sm0,xmms_0_5); //a*(-0.5)
            sm0=_mm_rsqrt_ps(sm0); // compute 1/sqrt(a)
            // Newton iteration in float to improve the rsqrt precision
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_add_ps(sma,xmms1_5);
            sm0=_mm_mul_ps(sm0,sma);

            // convert the four floats back to two double pairs, again by hand
            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&sm0,3);
            (*(__m128i*)&dm5)=_mm_slli_epi64(*(__m128i*)&sm0,32);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,3);
            (*(__m128i*)&dm0)=_mm_add_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_add_epi64((*(__m128i*)&dm5),_xmmd2f_esub);

            // iterate again in double to double the precision
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_add_pd(dma,xmmd1_5);
            dmb=_mm_add_pd(dmb,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dma);
            dm5=_mm_mul_pd(dm5,dmb);
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for (;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);

          {
              __m128 sm0=_mm_cvtpd_ps(dm0);
              __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
              sm0=_mm_rsqrt_ss(sm0); // compute 1/sqrt(a)
              dm0=_mm_cvtps_pd(sm0);
              // Newton iteration to improve the rsqrt precision
              dm1=_mm_mul_sd(dm0,dm0);
              dm1=_mm_mul_sd(dm1,dma);
              dm1=_mm_add_sd(dm1,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dm1);
              // iterate once more
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_add_sd(dma,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

This function turned out slightly slower than the original code :)  so I dropped it.
(I also tried a variant (code not saved :( ) built on much the same principle: exploit the IEEE floating-point format to force the doubles into floats (by rebalancing the exponent and shifting), then run _mm_rsqrt_ps and the Newton iterations in two parallel streams; that was also slow, so I abandoned it too.)
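For clarity, here is the same exponent-rebias trick in scalar form (my reconstruction; it only handles positive, normal values within float range, which is all this inner loop ever sees). The "32-3" shifts in the SSE code above are this same 52-23=29-bit shift:

#include <cstdint>
#include <cstdio>
#include <cstring>

// double bits: 1 sign, 11 exponent (bias 1023), 52 mantissa;
// float bits:  1 sign,  8 exponent (bias  127), 23 mantissa.
// So: subtract (1023-127)<<52 to rebias, shift right 52-23=29 to align.
static float double_to_float_bits(double d)
{
    uint64_t bits;
    memcpy(&bits, &d, sizeof(bits));
    bits -= (uint64_t)(1023-127) << 52;           // rebias the exponent
    uint32_t fbits = (uint32_t)(bits >> (52-23)); // truncate the mantissa
    float f;
    memcpy(&f, &fbits, sizeof(f));
    return f;
}

int main()
{
    double d = 3.14159265358979;
    printf("hardware: %.7f  bit trick: %.7f\n", (float)d, double_to_float_bits(d));
    return 0;
}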

 

If the hardware conversion between float and double is slow, what about an implementation that avoids converting at all?
Instead of SSE's _mm_rsqrt_ps, exploit the IEEE floating-point format to produce a coarse initial guess directly in double, then iterate (three Newton iterations here).
(A lookup table could also supply the initial guess, but inside the SSE pipeline that would very likely cost more than it saves, so I didn't implement it.) This function exploits the IEEE encoding of double. The result: some speed gain, but the precision still falls slightly short (about 6 decimal places), and one more iteration would throw away the speed advantage.
I wonder whether there is a better magic number than 0x5fe6ec85,0xe7de30da :)
Abandoned as well.
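In scalar form the scheme is the classic "fast inverse square root", just with a 64-bit constant. A minimal sketch of what the SSE2 code below computes per lane (my illustration, assuming positive normal inputs):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cmath>

// seed x ~= 1/sqrt(a) from the IEEE bits, then apply Newton steps
// x = ( 1.5 + (a*(-0.5))*x*x ) * x, three times as in the code below
static double rsqrt_magic(double a, int steps)
{
    uint64_t bits;
    memcpy(&bits, &a, sizeof(bits));
    bits = 0x5fe6ec85e7de30daULL - (bits >> 1); // coarse initial guess
    double x;
    memcpy(&x, &bits, sizeof(x));
    double na2 = a * -0.5;
    for (int k = 0; k < steps; ++k)
        x = (1.5 + na2*x*x) * x;
    return x;
}

int main()
{
    double a = 2.0;
    printf("exact:   %.15f\n", 1.0/sqrt(a));
    printf("3 steps: %.15f\n", rsqrt_magic(a, 3));
    return 0;
}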

const __m64   _mmi_mn=_mm_set_pi32(0x5fe6ec85,0xe7de30da);
const __m128i xmmi64_mn=_mm_set1_epi64(_mmi_mn);
void computePotPart_forj_int(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // use the IEEE double encoding to build an approximate 1/sqrt(a),
          // then refine it with Newton iterations
          // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
          {
              __m128i xmmi0=xmmi64_mn;
              __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); //a*(-0.5)
              __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);
              *(__m128i*)&dm0=_mm_srli_epi64(*(__m128i*)&dm0,1);
              *(__m128i*)&dm5=_mm_srli_epi64(*(__m128i*)&dm5,1);
              *(__m128i*)&dm0=_mm_sub_epi64(xmmi0,*(__m128i*)&dm0);
              *(__m128i*)&dm5=_mm_sub_epi64(xmmi0,*(__m128i*)&dm5);

              // first Newton iteration, doubling the precision
              dm1=_mm_mul_pd(dma,dm0);
              dm2=_mm_mul_pd(dmb,dm5);
              dm1=_mm_mul_pd(dm1,dm0);
              dm2=_mm_mul_pd(dm2,dm5);
              dm1=_mm_add_pd(dm1,xmmd1_5);
              dm2=_mm_add_pd(dm2,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dm1);
              dm5=_mm_mul_pd(dm5,dm2);

              // second iteration
              dm1=_mm_mul_pd(dma,dm0);
              dm2=_mm_mul_pd(dmb,dm5);
              dm1=_mm_mul_pd(dm1,dm0);
              dm2=_mm_mul_pd(dm2,dm5);
              dm1=_mm_add_pd(dm1,xmmd1_5);
              dm2=_mm_add_pd(dm2,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dm1);
              dm5=_mm_mul_pd(dm5,dm2);

              // third iteration
              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_add_pd(dma,xmmd1_5);
              dmb=_mm_add_pd(dmb,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dma);
              dm5=_mm_mul_pd(dm5,dmb);
              // precision is still a bit short here, but one more iteration
              // would lose the speed advantage
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for (;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);
          {
              __m128 sm0=_mm_cvtpd_ps(dm0);
              __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); //a*(-0.5)
              sm0=_mm_rsqrt_ss(sm0); // compute 1/sqrt(a)
              dm0=_mm_cvtps_pd(sm0);
              // Newton iteration to improve the rsqrt precision
              dm1=_mm_mul_sd(dm0,dm0);
              dm1=_mm_mul_sd(dm1,dma);
              dm1=_mm_add_sd(dm1,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dm1);
              // iterate once more
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_add_sd(dma,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

 

One more scheme I didn't have time to implement: since the SSE unit and the x87 FPU are independent pieces of hardware, they could run in parallel. Keep the SSE code as it is, but unroll the loop further and let the x87 carry one or two of the lanes, as sketched below.
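A rough sketch of the shape this could take: per unrolled iteration, four squared distances go through the rsqrtps path while a fifth is handled with scalar double math, which a 32-bit x86 build lowers to x87 instructions that can overlap with the SSE stream (untested; the function and array names are made up for illustration):

#include <cmath>
#include <xmmintrin.h>

// a2[] holds squared distances; returns the sum of their inverse square roots
double sum_invsqrt_mixed(const float* a2, int n)
{
    __m128 acc = _mm_setzero_ps();
    double acc87 = 0.0;
    int j = 0;
    for (; j+5 <= n; j += 5)
    {
        acc = _mm_add_ps(acc, _mm_rsqrt_ps(_mm_loadu_ps(&a2[j]))); // 4 lanes: SSE
        acc87 += 1.0 / sqrt((double)a2[j+4]);                      // 5th lane: x87
    }
    for (; j < n; ++j)
        acc87 += 1.0 / sqrt((double)a2[j]);                        // tail
    float t[4];
    _mm_storeu_ps(t, acc);
    return acc87 + t[0] + t[1] + t[2] + t[3];
}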

 
