1.下载解压StellarSim压缩文件,进入安装目录,编辑Makefile,在LDFLAGS参数后添加-pg
CC = g++
CFLAGS = -Wall -Wextra -O1 -Iheaders
SRC = $(wildcard src/*.cpp)
OBJ = $(SRC:.cpp=.o)
EXEC = stellarSim
LDFLAGS = -lm -pg
all: $(EXEC)
$(EXEC): $(OBJ)
$(CC) $^ -o $@ $(LDFLAGS)
$(CC) $(CFLAGS) $(OBJECTS) -o $@
%.o: %.cpp
$(CC) $(CFLAGS) -c $< -o $@
$(CC) $(CFLAGS) $(INC) -c $< -o $@
clean:
rm -f $(OBJ) $(EXEC)
2.运行make命令,编译StellarSim程序
3.使用如下命令运行程序。
yhrun -p thcp1 ./stellarSim 1024
4.使用gprof工具分析StellarSim程序,找到时间开销最大的核心子函数。
可以看到耗时最长的是getAcc函数,时间45.88秒,占比43.88%,getDensity第二,占比17.47....
5.找到这些函数
void getPairwiseSeparations(double** &pos)
{
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
dx[i][j] = pos[0][j] - pos[0][i];
dy[i][j] = pos[1][j] - pos[1][i];
dz[i][j] = pos[2][j] - pos[2][i];
//fprintf(stdout, "%12.6f", dz[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getW(double** &dx, double** &dy, double** &dz, const double h)
{
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
W[i][j] = pow((1.0 / (h*sqrt(pi))), 3.0) * exp((-pow(r[i][j],2) / pow(h,2)));
//fprintf(stdout, "%12.6f", r[i][j]);
//fprintf(stdout, "%12.6f", W[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getGradW(double** &dx, double** &dy, double** &dz, const double h)
{
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
gradPara[i][j] = -2 * exp(-pow(r[i][j],2) / pow(h,2)) / pow(h,5) / pow(pi,(3/2));
wx[i][j] = gradPara[i][j]*dx[i][j];
wy[i][j] = gradPara[i][j]*dy[i][j];
wz[i][j] = gradPara[i][j]*dz[i][j];
//fprintf(stdout, "%12.6f", wy[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getDensity(double** &pos, double &m, const double h)
{
getPairwiseSeparations(pos);
getW(dx, dy, dz, h);
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
rho[j] += m * W[i][j];
}
//fprintf(stdout, "%12.6f", rho[i]);
//fflush(stdout);
//fprintf(stdout,"\n");
}
}
void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu)
{
getDensity(pos, m, h);
getPressure(rho, k, n);
getPairwiseSeparations(pos);
getGradW(dx, dy, dz, h);
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
acc[0][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wx[i][j];
acc[1][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wy[i][j];
acc[2][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wz[i][j];
}
}
for (int j = 0; j < N; j++) {
acc[0][j] -= lmbda * pos[0][j];
acc[1][j] -= lmbda * pos[1][j];
acc[2][j] -= lmbda * pos[2][j];
}
for (int j = 0; j < N; j++) {
acc[0][j] -= nu * vel[0][j];
acc[1][j] -= nu * vel[1][j];
acc[2][j] -= nu * vel[2][j];
}
}
6.要优化getAcc函数的性能,可以考虑以下几个方面:
-
减少重复计算:在循环中存在大量重复的计算,例如重复计算 P[j]/pow(rho[j],2) 和 P[i]/rho[i]。您可以在循环外部进行这些计算,并将结果存储起来,以避免重复计算。
-
使用并行化:考虑循环的并行化处理,特别是第一个和第二个循环。您可以使用并行计算库(如OpenMP)来加速计算过程。
-
数据结构优化:检查数据结构的使用方式,确保内存访问模式是高效的。例如,可以尝试重新组织数据结构,以便更好地利用缓存和减少内存访问次数。
-
函数调用开销:如果
getDensity
、getPressure
、getPairwiseSeparations
、getGradW
等函数具有较大的开销,可以考虑将它们的计算结果缓存起来,避免重复调用。 -
参数传递优化:考虑对函数参数进行优化,避免不必要的参数传递,尤其是对于数组和指针类型的参数。
-
算法优化:根据具体的物理模型和计算需求,考虑是否存在更高效的数值计算算法。
-
编译器优化:确保使用适当的编译器选项进行优化,例如启用优化标志
-O3
7.接下来采用 OpenMP多线程并行优化
#include <omp.h>
void getPairwiseSeparations(double** &pos)
{
#pragma omp parallel for collapse(2)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
dx[i][j] = pos[0][j] - pos[0][i];
dy[i][j] = pos[1][j] - pos[1][i];
dz[i][j] = pos[2][j] - pos[2][i];
//fprintf(stdout, "%12.6f", dz[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getW(double** &dx, double** &dy, double** &dz, const double h)
{
#pragma omp parallel for collapse(2)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
W[i][j] = pow((1.0 / (h*sqrt(pi))), 3.0) * exp((-pow(r[i][j],2) / pow(h,2)));
//fprintf(stdout, "%12.6f", r[i][j]);
//fprintf(stdout, "%12.6f", W[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getGradW(double** &dx, double** &dy, double** &dz, const double h)
{
#pragma omp parallel for collapse(2)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
gradPara[i][j] = -2 * exp(-pow(r[i][j],2) / pow(h,2)) / pow(h,5) / pow(pi,(3/2));
wx[i][j] = gradPara[i][j]*dx[i][j];
wy[i][j] = gradPara[i][j]*dy[i][j];
wz[i][j] = gradPara[i][j]*dz[i][j];
//fprintf(stdout, "%12.6f", wy[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getDensity(double** &pos, double &m, const double h)
{
getPairwiseSeparations(pos);
getW(dx, dy, dz, h);
#pragma omp parallel
for (int i = 0; i < N; i++) {
#pragma omp for //reduction(+:rho[j])
for (int j = 0; j < N; j++) {
rho[j] += m * W[i][j];
}
//fprintf(stdout, "%12.6f", rho[j]);
// fflush(stdout);
//fprintf(stdout,"\n");
}
}
void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu)
{
getDensity(pos, m, h);
getPressure(rho, k, n);
getPairwiseSeparations(pos);
getGradW(dx, dy, dz, h);
#pragma omp parallel
{
for (int i = 0; i < N; i++) {
#pragma omp for
for (int j = 0; j < N; j++) {
acc[0][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wx[i][j];
acc[1][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wy[i][j];
acc[2][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wz[i][j];
}
}
#pragma omp barrier
#pragma omp for
for (int j = 0; j < N; j++) {
acc[0][j] -= lmbda * pos[0][j];
acc[1][j] -= lmbda * pos[1][j];
acc[2][j] -= lmbda * pos[2][j];
acc[0][j] -= nu * vel[0][j];
acc[1][j] -= nu * vel[1][j];
acc[2][j] -= nu * vel[2][j];
}
}
}
8.接下来采用 SIMD向量化并行计算优化
#include <arm_neon.h>
// 使用C/C++函数计算exp()函数
float64x2_t neon_exp(float64x2_t input)
{
// 计算指数值
float64x2_t result;
result[0] = exp(input[0]);
result[1] = exp(input[1]);
return result;
}
void getPairwiseSeparations(double** &pos)
{
float64x2_t dx_i_j,dy_i_j,dz_i_j;
float64x2_t pos_0_j, pos_1_j, pos_2_j;
float64x2_t pos_0_i, pos_1_i, pos_2_i;
for (int i = 0; i < N ; i ++) {
pos_0_i = vdupq_n_f64(pos[0][i]);
pos_1_i = vdupq_n_f64(pos[1][i]);
pos_2_i = vdupq_n_f64(pos[2][i]);
for (int j = 0; j < N/ 2 * 2; j += 2) {
pos_0_j = vld1q_f64(&pos[0][j]);
pos_1_j = vld1q_f64(&pos[1][j]);
pos_2_j = vld1q_f64(&pos[2][j]);
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
//dx[i][j] = pos[0][j] - pos[0][i];
//dy[i][j] = pos[1][j] - pos[1][i];
//dz[i][j] = pos[2][j] - pos[2][i];
dx_i_j = vsubq_f64(pos_0_j,pos_0_i);
dy_i_j = vsubq_f64(pos_1_j,pos_1_i);
dz_i_j = vsubq_f64(pos_2_j,pos_2_i);
vst1q_f64(&dx[i][j], dx_i_j);
vst1q_f64(&dy[i][j], dy_i_j);
vst1q_f64(&dz[i][j], dz_i_j);
//fprintf(stdout, "%12.6f", dz[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getW(double** &dx, double** &dy, double** &dz, const double h)
{
float tmp1 = pow((1.0 / (h*sqrt(pi))), 3.0);
float tmp2 = -1 / pow(h,2);
float64x2_t tmp1Vec = vdupq_n_f64(tmp1);
float64x2_t tmp2Vec = vdupq_n_f64(tmp2);
float64x2_t r_i_j,W_i_j;
float64x2_t dx_i_j,dy_i_j,dz_i_j;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
r_i_j = vld1q_f64(&r[i][j]);
W_i_j = vld1q_f64(&W[i][j]);
//r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
r_i_j = vsqrtq_f64(vaddq_f64(vmulq_f64(dx_i_j, dx_i_j),vaddq_f64(vmulq_f64(dy_i_j, dy_i_j),vmulq_f64(dz_i_j, dz_i_j))));
//W[i][j] = pow((1.0 / (h*sqrt(pi))), 3.0) * exp((-pow(r[i][j],2) / pow(h,2)));
W_i_j = vmulq_f64(tmp1Vec,neon_exp(vmulq_f64(vmulq_f64(r_i_j,r_i_j),tmp2Vec)));
vst1q_f64(&r[i][j], r_i_j);
vst1q_f64(&W[i][j], W_i_j);
//fprintf(stdout, "%12.6f", r[i][j]);
//fprintf(stdout, "%12.6f", W[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getGradW(double** &dx, double** &dy, double** &dz, const double h)
{
float tmp1 = -1 / pow(h,2);
float tmp2 = -2 / pow(h,5) / pow(pi,(3/2));
float64x2_t tmp1Vec = vdupq_n_f64(tmp1);
float64x2_t tmp2Vec = vdupq_n_f64(tmp2);
float64x2_t dx_i_j,dy_i_j,dz_i_j;
float64x2_t r_i_j,gradPara_i_j;
float64x2_t wx_i_j,wy_i_j,wz_i_j;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
r_i_j = vld1q_f64(&r[i][j]);
gradPara_i_j = vld1q_f64(&gradPara[i][j]);
wx_i_j = vld1q_f64(&wx[i][j]);
wy_i_j = vld1q_f64(&wy[i][j]);
wz_i_j = vld1q_f64(&wz[i][j]);
//r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
r_i_j = vsqrtq_f64(vaddq_f64(vmulq_f64(dx_i_j, dx_i_j),vaddq_f64(vmulq_f64(dy_i_j, dy_i_j),vmulq_f64(dz_i_j, dz_i_j))));
//gradPara[i][j] = -2 * exp(-pow(r[i][j],2) / pow(h,2)) / pow(h,5) / pow(pi,(3/2));
gradPara_i_j = vmulq_f64(neon_exp(vmulq_f64(vmulq_f64(r_i_j,r_i_j),tmp1Vec)),tmp2Vec);
//wx[i][j] = gradPara[i][j]*dx[i][j];
//wy[i][j] = gradPara[i][j]*dy[i][j];
//wz[i][j] = gradPara[i][j]*dz[i][j];
wx_i_j = vmulq_f64(gradPara_i_j,dx_i_j);
wy_i_j = vmulq_f64(gradPara_i_j,dy_i_j);
wz_i_j = vmulq_f64(gradPara_i_j,dz_i_j);
vst1q_f64(&wx[i][j], wx_i_j);
vst1q_f64(&wy[i][j], wy_i_j);
vst1q_f64(&wz[i][j], wz_i_j);
//fprintf(stdout, "%12.6f", wy[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getDensity(double** &pos, double &m, const double h)
{
float64x2_t mVec = vdupq_n_f64(m);
float64x2_t W_i_j,rho_j;
getPairwiseSeparations(pos);
getW(dx, dy, dz, h);
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
rho_j = vld1q_f64(&rho[j]);
W_i_j = vld1q_f64(&W[i][j]);
rho_j = vfmaq_f64(rho_j,mVec,W_i_j);
vst1q_f64(&rho[j], rho_j);
}
//fprintf(stdout, "%12.6f", rho[i]);
//fflush(stdout);
//fprintf(stdout,"\n");
}
}
void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu)
{
getDensity(pos, m, h);
getPressure(rho, k, n);
getPairwiseSeparations(pos);
getGradW(dx, dy, dz, h);
float64x2_t mVec = vdupq_n_f64(m);
float64x2_t lmbdaVec = vdupq_n_f64(lmbda);
float64x2_t nuVec = vdupq_n_f64(nu);
float64x2_t acc_0_j, acc_1_j, acc_2_j;
float64x2_t P_i,rho_i,P_i_div_rho_i_sq;
float64x2_t P_j,rho_j,P_j_div_rho_j_sq;
float64x2_t wx_i_j,wy_i_j,wz_i_j;
for (int i = 0; i < N; i++) {
P_i = vdupq_n_f64(P[i]);
rho_i = vdupq_n_f64(rho[i]);
P_i_div_rho_i_sq = vmulq_f64(vmulq_f64(P_i, P_i), vrecpeq_f64(vmulq_f64(rho_i, rho_i)));
for (int j = 0; j < N/ 2 * 2; j += 2) {
P_j = vld1q_f64(&P[j]);
rho_j = vld1q_f64(&rho[j]);
P_j_div_rho_j_sq = vmulq_f64(P_j, vrecpeq_f64(vmulq_f64(rho_j, rho_j)));
acc_0_j = vld1q_f64(&acc[0][j]);
acc_1_j = vld1q_f64(&acc[1][j]);
acc_2_j = vld1q_f64(&acc[2][j]);
wx_i_j = vld1q_f64(&wx[i][j]);
wy_i_j = vld1q_f64(&wy[i][j]);
wz_i_j = vld1q_f64(&wz[i][j]);
//acc[0][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wx[i][j];
//acc[1][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wy[i][j];
//acc[2][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wz[i][j];
acc_0_j = vfmsq_f64(acc_0_j, vmulq_f64(mVec,wx_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
acc_1_j = vfmsq_f64(acc_1_j, vmulq_f64(mVec,wy_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
acc_2_j = vfmsq_f64(acc_2_j, vmulq_f64(mVec,wz_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
vst1q_f64(&acc[0][j], acc_0_j);
vst1q_f64(&acc[1][j], acc_1_j);
vst1q_f64(&acc[2][j], acc_2_j);
}
}
for (int j = 0; j < N /2 * 2; j += 2) {
acc_0_j = vld1q_f64(&acc[0][j]);
acc_1_j = vld1q_f64(&acc[1][j]);
acc_2_j = vld1q_f64(&acc[2][j]);
//acc[0][j] -= lmbda * pos[0][j];
//acc[1][j] -= lmbda * pos[1][j];
//acc[2][j] -= lmbda * pos[2][j];
acc_0_j = vfmsq_f64(acc_0_j, lmbdaVec, vld1q_f64(&pos[0][j]));
acc_1_j = vfmsq_f64(acc_1_j, lmbdaVec, vld1q_f64(&pos[1][j]));
acc_2_j = vfmsq_f64(acc_2_j, lmbdaVec, vld1q_f64(&pos[2][j]));
//acc[0][j] -= nu * vel[0][j];
//acc[1][j] -= nu * vel[1][j];
//acc[2][j] -= nu * vel[2][j];
acc_0_j = vfmsq_f64(acc_0_j, nuVec, vld1q_f64(&vel[0][j]));
acc_1_j = vfmsq_f64(acc_1_j, nuVec, vld1q_f64(&vel[1][j]));
acc_2_j = vfmsq_f64(acc_2_j, nuVec, vld1q_f64(&vel[2][j]));
vst1q_f64(&acc[0][j], acc_0_j);
vst1q_f64(&acc[1][j], acc_1_j);
vst1q_f64(&acc[2][j], acc_2_j);
}
}
9.最后我们将两种优化方式合并
#include <arm_neon.h>
#include <omp.h>
// 使用C/C++函数计算exp()函数
float64x2_t neon_exp(float64x2_t input)
{
// 计算指数值
float64x2_t result;
result[0] = exp(input[0]);
result[1] = exp(input[1]);
return result;
}
void getPairwiseSeparations(double** &pos)
{
float64x2_t dx_i_j,dy_i_j,dz_i_j;
float64x2_t pos_0_j, pos_1_j, pos_2_j;
float64x2_t pos_0_i, pos_1_i, pos_2_i;
#pragma omp parallel for
for (int i = 0; i < N ; i ++) {
pos_0_i = vdupq_n_f64(pos[0][i]);
pos_1_i = vdupq_n_f64(pos[1][i]);
pos_2_i = vdupq_n_f64(pos[2][i]);
for (int j = 0; j < N/ 2 * 2; j += 2) {
pos_0_j = vld1q_f64(&pos[0][j]);
pos_1_j = vld1q_f64(&pos[1][j]);
pos_2_j = vld1q_f64(&pos[2][j]);
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
//dx[i][j] = pos[0][j] - pos[0][i];
//dy[i][j] = pos[1][j] - pos[1][i];
//dz[i][j] = pos[2][j] - pos[2][i];
dx_i_j = vsubq_f64(pos_0_j,pos_0_i);
dy_i_j = vsubq_f64(pos_1_j,pos_1_i);
dz_i_j = vsubq_f64(pos_2_j,pos_2_i);
vst1q_f64(&dx[i][j], dx_i_j);
vst1q_f64(&dy[i][j], dy_i_j);
vst1q_f64(&dz[i][j], dz_i_j);
//fprintf(stdout, "%12.6f", dz[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getW(double** &dx, double** &dy, double** &dz, const double h)
{
float tmp1 = pow((1.0 / (h*sqrt(pi))), 3.0);
float tmp2 = -1 / pow(h,2);
float64x2_t tmp1Vec = vdupq_n_f64(tmp1);
float64x2_t tmp2Vec = vdupq_n_f64(tmp2);
float64x2_t r_i_j,W_i_j;
float64x2_t dx_i_j,dy_i_j,dz_i_j;
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
r_i_j = vld1q_f64(&r[i][j]);
W_i_j = vld1q_f64(&W[i][j]);
//r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
r_i_j = vsqrtq_f64(vaddq_f64(vmulq_f64(dx_i_j, dx_i_j),vaddq_f64(vmulq_f64(dy_i_j, dy_i_j),vmulq_f64(dz_i_j, dz_i_j))));
//W[i][j] = pow((1.0 / (h*sqrt(pi))), 3.0) * exp((-pow(r[i][j],2) / pow(h,2)));
W_i_j = vmulq_f64(tmp1Vec,neon_exp(vmulq_f64(vmulq_f64(r_i_j,r_i_j),tmp2Vec)));
vst1q_f64(&r[i][j], r_i_j);
vst1q_f64(&W[i][j], W_i_j);
//fprintf(stdout, "%12.6f", r[i][j]);
//fprintf(stdout, "%12.6f", W[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getGradW(double** &dx, double** &dy, double** &dz, const double h)
{
float tmp1 = -1 / pow(h,2);
float tmp2 = -2 / pow(h,5) / pow(pi,(3/2));
float64x2_t tmp1Vec = vdupq_n_f64(tmp1);
float64x2_t tmp2Vec = vdupq_n_f64(tmp2);
float64x2_t dx_i_j,dy_i_j,dz_i_j;
float64x2_t r_i_j,gradPara_i_j;
float64x2_t wx_i_j,wy_i_j,wz_i_j;
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
dx_i_j = vld1q_f64(&dx[i][j]);
dy_i_j = vld1q_f64(&dy[i][j]);
dz_i_j = vld1q_f64(&dz[i][j]);
r_i_j = vld1q_f64(&r[i][j]);
gradPara_i_j = vld1q_f64(&gradPara[i][j]);
wx_i_j = vld1q_f64(&wx[i][j]);
wy_i_j = vld1q_f64(&wy[i][j]);
wz_i_j = vld1q_f64(&wz[i][j]);
//r[i][j] = sqrt(pow(dx[i][j],2.0) + pow(dy[i][j],2.0) + pow(dz[i][j],2.0));
r_i_j = vsqrtq_f64(vaddq_f64(vmulq_f64(dx_i_j, dx_i_j),vaddq_f64(vmulq_f64(dy_i_j, dy_i_j),vmulq_f64(dz_i_j, dz_i_j))));
//gradPara[i][j] = -2 * exp(-pow(r[i][j],2) / pow(h,2)) / pow(h,5) / pow(pi,(3/2));
gradPara_i_j = vmulq_f64(neon_exp(vmulq_f64(vmulq_f64(r_i_j,r_i_j),tmp1Vec)),tmp2Vec);
//wx[i][j] = gradPara[i][j]*dx[i][j];
//wy[i][j] = gradPara[i][j]*dy[i][j];
//wz[i][j] = gradPara[i][j]*dz[i][j];
wx_i_j = vmulq_f64(gradPara_i_j,dx_i_j);
wy_i_j = vmulq_f64(gradPara_i_j,dy_i_j);
wz_i_j = vmulq_f64(gradPara_i_j,dz_i_j);
vst1q_f64(&wx[i][j], wx_i_j);
vst1q_f64(&wy[i][j], wy_i_j);
vst1q_f64(&wz[i][j], wz_i_j);
//fprintf(stdout, "%12.6f", wy[i][j]);
//fflush(stdout);
}
//fprintf(stdout,"\n");
}
}
void getDensity(double** &pos, double &m, const double h)
{
float64x2_t mVec = vdupq_n_f64(m);
float64x2_t W_i_j,rho_j;
getPairwiseSeparations(pos);
getW(dx, dy, dz, h);
//#pragma omp parallel for 这里不能进行omp优化会影响结果
for (int i = 0; i < N; i++) {
for (int j = 0; j < N/ 2 * 2; j += 2) {
rho_j = vld1q_f64(&rho[j]);
W_i_j = vld1q_f64(&W[i][j]);
rho_j = vfmaq_f64(rho_j,mVec,W_i_j);
vst1q_f64(&rho[j], rho_j);
}
//fprintf(stdout, "%12.6f", rho[i]);
//fflush(stdout);
//fprintf(stdout,"\n");
}
}
void getAcc(double** &pos, double** &vel, double &m, const double h, const double k, double &n, double lmbda, const double nu)
{
getDensity(pos, m, h);
getPressure(rho, k, n);
getPairwiseSeparations(pos);
getGradW(dx, dy, dz, h);
float64x2_t mVec = vdupq_n_f64(m);
float64x2_t lmbdaVec = vdupq_n_f64(lmbda);
float64x2_t nuVec = vdupq_n_f64(nu);
float64x2_t acc_0_j, acc_1_j, acc_2_j;
float64x2_t P_i,rho_i,P_i_div_rho_i_sq;
float64x2_t P_j,rho_j,P_j_div_rho_j_sq;
float64x2_t wx_i_j,wy_i_j,wz_i_j;
//#pragma omp parallel for 这里不能进行omp优化会影响结果
for (int i = 0; i < N; i++) {
P_i = vdupq_n_f64(P[i]);
rho_i = vdupq_n_f64(rho[i]);
P_i_div_rho_i_sq = vmulq_f64(vmulq_f64(P_i, P_i), vrecpeq_f64(vmulq_f64(rho_i, rho_i)));
for (int j = 0; j < N/ 2 * 2; j += 2) {
P_j = vld1q_f64(&P[j]);
rho_j = vld1q_f64(&rho[j]);
P_j_div_rho_j_sq = vmulq_f64(P_j, vrecpeq_f64(vmulq_f64(rho_j, rho_j)));
acc_0_j = vld1q_f64(&acc[0][j]);
acc_1_j = vld1q_f64(&acc[1][j]);
acc_2_j = vld1q_f64(&acc[2][j]);
wx_i_j = vld1q_f64(&wx[i][j]);
wy_i_j = vld1q_f64(&wy[i][j]);
wz_i_j = vld1q_f64(&wz[i][j]);
//acc[0][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wx[i][j];
//acc[1][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wy[i][j];
//acc[2][j] -= m * ( P[j]/pow(rho[j],2) + pow(P[i]/rho[i],2) ) * wz[i][j];
acc_0_j = vfmsq_f64(acc_0_j, vmulq_f64(mVec,wx_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
acc_1_j = vfmsq_f64(acc_1_j, vmulq_f64(mVec,wy_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
acc_2_j = vfmsq_f64(acc_2_j, vmulq_f64(mVec,wz_i_j), vaddq_f64(P_j_div_rho_j_sq, P_i_div_rho_i_sq));
vst1q_f64(&acc[0][j], acc_0_j);
vst1q_f64(&acc[1][j], acc_1_j);
vst1q_f64(&acc[2][j], acc_2_j);
}
}
for (int j = 0; j < N /2 * 2; j += 2) {
acc_0_j = vld1q_f64(&acc[0][j]);
acc_1_j = vld1q_f64(&acc[1][j]);
acc_2_j = vld1q_f64(&acc[2][j]);
//acc[0][j] -= lmbda * pos[0][j];
//acc[1][j] -= lmbda * pos[1][j];
//acc[2][j] -= lmbda * pos[2][j];
acc_0_j = vfmsq_f64(acc_0_j, lmbdaVec, vld1q_f64(&pos[0][j]));
acc_1_j = vfmsq_f64(acc_1_j, lmbdaVec, vld1q_f64(&pos[1][j]));
acc_2_j = vfmsq_f64(acc_2_j, lmbdaVec, vld1q_f64(&pos[2][j]));
//acc[0][j] -= nu * vel[0][j];
//acc[1][j] -= nu * vel[1][j];
//acc[2][j] -= nu * vel[2][j];
acc_0_j = vfmsq_f64(acc_0_j, nuVec, vld1q_f64(&vel[0][j]));
acc_1_j = vfmsq_f64(acc_1_j, nuVec, vld1q_f64(&vel[1][j]));
acc_2_j = vfmsq_f64(acc_2_j, nuVec, vld1q_f64(&vel[2][j]));
vst1q_f64(&acc[0][j], acc_0_j);
vst1q_f64(&acc[1][j], acc_1_j);
vst1q_f64(&acc[2][j], acc_2_j);
}
}