StellerSim 是一个基于离散单元数值计算的物理演化程序,该算法最早从天体动力学星云演化计算中产生,被广泛用于游戏开发引擎中的物理演化、碰撞事件、特效生成等等模块。该程序采用C++语言,完整计算了一个粒子发生器引擎模块膨胀与坍缩效果,粒子发生器被广泛用于unity、 Unreal等著名游戏物理引擎软件。通过天气物理学领域的数值模拟科学计算软件,来熟悉对串行
软件进行并行性能优化的基本方法与步骤。
代码目录如下:
headers 目录
- global.h
- initial.h
- sphkernel.h
global.h 代码如下:
#ifndef _GLOBAL_H
#define _GLOBAL_H
extern int N;
extern double n;
extern double t;
extern double tEnd;
extern double dt;
extern double M;
extern double R;
extern double** pos;
extern double** vel;
extern double** acc;
extern double* rho;
extern double* P;
extern double lmbda;
extern double m;
extern double** dx;
extern double** dy;
extern double** dz;
extern double** r;
extern double** W;
extern double** wx;
extern double** wy;
extern double** wz;
extern double** gradPara;
const double h = 0.1;
const double k = 0.1;
const double nu = 1.0;
const bool plotRealTime = true;
const double pi = 3.1415926;
#endif
initial.h 代码如下:
#ifndef _INITIAL_H
#define _INITIAL_H
extern double* allocMEM();
extern void initialize();
#endif
sphkernel.h 代码如下:
#ifndef _SPHKERNEL_H
#define _SPHKERNEL_H
extern void getPairwiseSeparations(double** &pos);
extern void getW(double** &dx, double** &dy, double** &dz, const double h);
extern void getGradW(double** &dx, double** &dy, double** &dz, const double h);
extern void getDensity(double** &pos, double &m, const double h);
extern void getPressure(double* &rho, const double k, double &n);
extern void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu);
#endif
src 目录
- global.cpp
- initial.cpp
- main.cpp
- sphkernel.cpp
global.cpp 代码如下:
#include <stdlib.h>
int N = 10;
double n = 1.0;
double t;
double tEnd;
double dt;
double M = 2.0;
double R = 0.75;
double m = 0.0;
double** pos = NULL;
double** vel = NULL;
double** acc = NULL;
double* rho = NULL;
double* P = NULL;
double lmbda = 0.0;
double** dx = NULL;
double** dy = NULL;
double** dz = NULL;
double** r = NULL;
double** W = NULL;
double** wx = NULL;
double** wy = NULL;
double** wz = NULL;
double** gradPara = NULL;
initial.cpp 代码如下:
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <assert.h>
#include "global.h"
using namespace std;
double* assignParticleMem(int Nparticle)
{
double* ptr = new double[Nparticle];
return ptr;
}
void allocMEM()
{
pos = new double*[3]; assert(pos!=NULL);
vel = new double*[3]; assert(vel!=NULL);
acc = new double*[3]; assert(acc!=NULL);
rho = new double [N]; assert(rho!=NULL);
P = new double [N]; assert( P!=NULL);
dx = new double*[N]; assert( dx!=NULL);
dy = new double*[N]; assert( dy!=NULL);
dz = new double*[N]; assert( dz!=NULL);
r = new double*[N]; assert( r!=NULL);
W = new double*[N]; assert( W!=NULL);
wx = new double*[N]; assert( wx!=NULL);
wy = new double*[N]; assert( wy!=NULL);
wz = new double*[N]; assert( wz!=NULL);
gradPara = new double*[N]; assert( gradPara!=NULL);
for (int i = 0; i < 3; i++)
{
pos[i] = assignParticleMem(N); assert(pos[i]!=NULL);
vel[i] = assignParticleMem(N); assert(vel[i]!=NULL);
acc[i] = assignParticleMem(N); assert(acc[i]!=NULL);
}
for (int i = 0; i < N; i++)
{
dx[i] = assignParticleMem(N); assert( dx[i]!=NULL);
dy[i] = assignParticleMem(N); assert( dy[i]!=NULL);
dz[i] = assignParticleMem(N); assert( dz[i]!=NULL);
r[i] = assignParticleMem(N); assert( r[i]!=NULL);
W[i] = assignParticleMem(N); assert( W[i]!=NULL);
wx[i] = assignParticleMem(N); assert( wx[i]!=NULL);
wy[i] = assignParticleMem(N); assert( wy[i]!=NULL);
wz[i] = assignParticleMem(N); assert( wz[i]!=NULL);
gradPara[i] = assignParticleMem(N); assert( gradPara[i]!=NULL);
}
}
void initialize()
{
srand(42);
int randS, randE;
randS = 0; randE = 2;
for (int i = 0; i < N; i++)
{
pos[0][i] = pow(-1, (rand()%(randS - randE) + randS +1))*rand() / double(RAND_MAX);
pos[1][i] = pow(-1, (rand()%(randS - randE) + randS +1))*rand() / double(RAND_MAX);
pos[2][i] = pow(-1, (rand()%(randS - randE) + randS +1))*rand() / double(RAND_MAX);
vel[0][i] = 0.0;
vel[1][i] = 0.0;
vel[2][i] = 0.0;
rho[i] = 0.0;
P[i] = 0.0;
for (int j = 0; j < N; j++)
{
dx[i][j] = 0.0;
dy[i][j] = 0.0;
dz[i][j] = 0.0;
r[i][j] = 0.0;
W[i][j] = 0.0;
wx[i][j] = 0.0;
wy[i][j] = 0.0;
wz[i][j] = 0.0;
gradPara[i][j] = 0.0;
}
}
}
main.cpp 代码如下:
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <math.h>
#include <cmath>
#include "global.h"
#include "initial.h"
#include "sphkernel.h"
#include <sys/time.h>
#include <fstream>
#include <iomanip>
#include <omp.h>
using namespace std;
int main(int argc, char *argv[])
{
if (argc < 2)
{
cout << "SPHexe [number of particles:int]" << endl;
exit(0);
}
N = atoi(argv[1]);
cout << "StellarSim performs on " << N << " particles" << endl;
allocMEM();
lmbda = 2.0 * k * (1.0 + n) * pow(pi, (-3.0 / (2.0 * n))) * (M * tgamma(5.0 / 2.0 + n) / pow(R, 3.0) / pow(tgamma(1.0 + n), (1.0 / n))) / pow(R, 2.0);
cout << "lmbda is " << lmbda << endl;
m = M / N;
initialize();
#pragma omp parallel
getAcc(pos, vel, m, h, k, n, lmbda, nu);
t = 0.0;
tEnd = 12;
dt = 0.04;
int Nt = int(ceil(tEnd / dt));
struct timeval tpstart, tpend;
gettimeofday(&tpstart, NULL);
for (int i = 0; i <= Nt; i++)
{
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < N; j++)
{
vel[0][j] += acc[0][j] * dt / 2;
vel[1][j] += acc[1][j] * dt / 2;
vel[2][j] += acc[2][j] * dt / 2;
pos[0][j] += vel[0][j] * dt;
pos[1][j] += vel[1][j] * dt;
pos[2][j] += vel[2][j] * dt;
}
getAcc(pos, vel, m, h, k, n, lmbda, nu);
#pragma omp for
for (int j = 0; j < N; j++)
{
vel[0][j] += acc[0][j] * dt / 2;
vel[1][j] += acc[1][j] * dt / 2;
vel[2][j] += acc[2][j] * dt / 2;
}
#pragma omp master
{
// cout << "Time Step is: " << setw(4) << t << endl;
t += dt;
}
getDensity(pos, m, h);
}
}
gettimeofday(&tpend, NULL);
double timeuse = 1000000 * (tpend.tv_sec - tpstart.tv_sec) + tpend.tv_usec - tpstart.tv_usec;
timeuse /= 1000000;
cout << "elapsed time: " << setprecision(3) << setw(6) << timeuse << " s" << endl;
ofstream file("acc_value.txt");
file.setf(ios::scientific);
for (int j = 0; j < N; j++)
{
file << setprecision(8) << setw(18) << acc[0][j] << ","
<< setw(18) << acc[1][j] << "," << setw(18) << acc[2][j] << endl;
}
file.close();
return 0;
}
sphkernel.cpp 代码如下:
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <arm_neon.h>
#include "global.h"
#include <omp.h>
using namespace std;
void getPairwiseSeparations(double** &pos)
{
#pragma omp for
for (int i = 0; i < N; i++)
{
float64x2_t vi0 = vdupq_n_f64(pos[0][i]);
float64x2_t vi1 = vdupq_n_f64(pos[1][i]);
float64x2_t vi2 = vdupq_n_f64(pos[2][i]);
for (int j = 0; j < N / 2 * 2; j += 2)
{
float64x2_t vj0 = vld1q_f64(&pos[0][j]);
float64x2_t vj1 = vld1q_f64(&pos[1][j]);
float64x2_t vj2 = vld1q_f64(&pos[2][j]);
vst1q_f64(&dx[i][j], vsubq_f64(vj0, vi0));
vst1q_f64(&dy[i][j], vsubq_f64(vj1, vi1));
vst1q_f64(&dz[i][j], vsubq_f64(vj2, vi2));
}
for (int j = N / 2 * 2; j < N; j++)
{
dx[i][j] = pos[0][j] - pos[0][i];
dy[i][j] = pos[1][j] - pos[1][i];
dz[i][j] = pos[2][j] - pos[2][i];
}
}
}
void getW(double** &dx, double** &dy, double** &dz, const double h)
{
double a = (1.0 / (h * sqrt(pi)));
double b = pow(a, 3.0);
double c = -1.0 * pow(h, 2);
#pragma omp for
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N / 2 * 2; j += 2)
{
float64x2_t vdx = vld1q_f64(&dx[i][j]);
float64x2_t vdy = vld1q_f64(&dy[i][j]);
float64x2_t vdz = vld1q_f64(&dz[i][j]);
float64x2_t vdxyz = vmulq_f64(vdx,vdx) + vmulq_f64(vdy,vdy) + vmulq_f64(vdz,vdz);
vdxyz = vsqrtq_f64(vdxyz);
vdxyz = vmulq_f64(vdxyz, vdxyz);
vst1q_f64(&r[i][j], vdxyz);
W[i][j] = b * exp((r[i][j] / c));
W[i][j + 1] = b * exp((r[i][j + 1] / c));
}
for (int j = N / 2 * 2; j < N; j++)
{
r[i][j] = sqrt(pow(dx[i][j], 2.0) + pow(dy[i][j], 2.0) + pow(dz[i][j], 2.0));
W[i][j] = pow((1.0 / (h * sqrt(pi))), 3.0) * exp((-pow(r[i][j], 2) / pow(h, 2)));
}
}
}
void getGradW(double** &dx, double** &dy, double** &dz, const double h)
{
double a = -1.0 * pow(h, 2);
double b = pow(h, 5);
double c = pow(pi, 3 / 2);
#pragma omp for
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N / 2 * 2; j += 2)
{
float64x2_t vdx = vld1q_f64(&dx[i][j]);
float64x2_t vdy = vld1q_f64(&dy[i][j]);
float64x2_t vdz = vld1q_f64(&dz[i][j]);
float64x2_t vdxyz = vmulq_f64(vdx,vdx) + vmulq_f64(vdy,vdy) + vmulq_f64(vdz,vdz);
vdxyz = vsqrtq_f64(vdxyz);
vdxyz = vmulq_f64(vdxyz, vdxyz);
vst1q_f64(&r[i][j], vdxyz);
gradPara[i][j] = -2 * exp(r[i][j] / a) / b / c;
wx[i][j] = gradPara[i][j] * dx[i][j];
wy[i][j] = gradPara[i][j] * dy[i][j];
wz[i][j] = gradPara[i][j] * dz[i][j];
gradPara[i][j + 1] = -2 * exp(r[i][j + 1] / a) / b / c;
wx[i][j + 1] = gradPara[i][j + 1] * dx[i][j + 1];
wy[i][j + 1] = gradPara[i][j + 1] * dy[i][j + 1];
wz[i][j + 1] = gradPara[i][j + 1] * dz[i][j + 1];
}
for (int j = N / 2 * 2; j < N; j++)
{
r[i][j] = sqrt(pow(dx[i][j], 2.0) + pow(dy[i][j], 2.0) + pow(dz[i][j], 2.0));
gradPara[i][j] = -2 * exp(-pow(r[i][j], 2) / pow(h, 2)) / pow(h, 5) / pow(pi, 3 / 2);
wx[i][j] = gradPara[i][j] * dx[i][j];
wy[i][j] = gradPara[i][j] * dy[i][j];
wz[i][j] = gradPara[i][j] * dz[i][j];
}
}
}
void getDensity(double** &pos, double &m, const double h)
{
getPairwiseSeparations(pos);
getW(dx, dy, dz, h);
#pragma omp for
for (int i = 0; i < N; i++)
{
double sum = 0.0;
float64x2_t vec0 = vdupq_n_f64(0.0);
float64x2_t vec1 = vdupq_n_f64(0.0);
float64x2_t temp = vdupq_n_f64(0.0);
for (int j = 0; j < N / 4 * 4; j += 4)
{
vec0 = vld1q_f64(&W[i][j + 0]);
vec1 = vld1q_f64(&W[i][j + 2]);
temp = vaddq_f64(temp, vec0);
temp = vaddq_f64(temp, vec1);
}
sum = vaddvq_f64(temp);
for ( int j = N / 4 * 4; j < N; j++)
{
sum += W[i][j];
}
rho[i] += m * sum;
}
}
void getPressure(double* &rho, const double k, double &n)
{
#pragma omp for
for (int j = 0; j < N; j++)
P[j] = k * pow(rho[j], (1 + 1 / n));
}
void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu)
{
getDensity(pos, m, h);
getPressure(rho, k, n);
getPairwiseSeparations(pos);
getGradW(dx, dy, dz, h);
#pragma omp for
for (int i = 0; i < N; i++)
{
float64x2_t vm = vdupq_n_f64(m);
float64x2_t vtmp1 = vdupq_n_f64(P[i] / pow(rho[i], 2));
float64x2_t vacc0 = vdupq_n_f64(0.0);
float64x2_t vacc1 = vdupq_n_f64(0.0);
float64x2_t vacc2 = vdupq_n_f64(0.0);
float64x2_t vwx, vwy, vwz;
float64_t acc0, acc1, acc2;
for (int j = 0; j < N / 2 * 2; j += 2)
{
float64x2_t vP = vld1q_f64(&P[j]);
float64x2_t vrho = vld1q_f64(&rho[j]);
float64x2_t vtmp2 = vdivq_f64(vmulq_f64(vP, vP), vmulq_f64(vrho, vrho));
vwx = vld1q_f64(&wx[i][j]);
vwy = vld1q_f64(&wy[i][j]);
vwz = vld1q_f64(&wz[i][j]);
vacc0 = vfmaq_f64(vacc0, vmulq_f64(vm,vwx), vaddq_f64(vtmp1, vtmp2));
vacc1 = vfmaq_f64(vacc1, vmulq_f64(vm,vwy), vaddq_f64(vtmp1, vtmp2));
vacc2 = vfmaq_f64(vacc2, vmulq_f64(vm,vwz), vaddq_f64(vtmp1, vtmp2));
}
acc0 = vaddvq_f64(vacc0);
acc1 = vaddvq_f64(vacc1);
acc2 = vaddvq_f64(vacc2);
for (int j = N / 2 * 2; j < N; j++)
{
acc0 += m * (P[i] / pow(rho[i], 2) + pow(P[j] / rho[j],2) ) * wx[i][j];
acc1 += m * (P[i] / pow(rho[i], 2) + pow(P[j] / rho[j],2) ) * wy[i][j];
acc2 += m * (P[i] / pow(rho[i], 2) + pow(P[j] / rho[j],2) ) * wz[i][j];
}
acc[0][i] += acc0;
acc[1][i] += acc1;
acc[2][i] += acc2;
}
#pragma omp for
for (int i = 0; i < N; i++)
{
acc[0][i] -= lmbda * pos[0][i];
acc[1][i] -= lmbda * pos[1][i];
acc[2][i] -= lmbda * pos[2][i];
}
#pragma omp for
for (int i = 0; i < N; i++)
{
acc[0][i] -= nu * vel[0][i];
acc[1][i] -= nu * vel[1][i];
acc[2][i] -= nu * vel[2][i];
}
}
Makefile 代码如下:
CXX = g++
TARGET = main
CFLAGS = -fopenmp
SRCDIR = src
OBJDIR = obj
ALLSRC = $(foreach dir, $(SRCDIR), $(wildcard $(dir)/*.cpp))
ALLOBJ = $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(ALLSRC))
INCLUDES = -I ./headers
$(TARGET): $(ALLOBJ)
$(CXX) $(CFLAGS) $^ -o $@
$(OBJDIR)/%.o: $(SRCDIR)/%.cpp
$(CXX) $(INCLUDES) $(CFLAGS) -c $< -o $@
PHONY: clean
clean:
rm -f $(ALLOBJ) $(TARGET)
使用64线程进行提交后,运行时间为 3.63 s,加速比为 136.9,效果显著。