GPU编程 CUDA C++ 分子动力学模拟【GPU加速版】迷你代码

ZZXDX11

已于 2023-04-08 11:02:29 修改

阅读量962

点赞数 5

文章标签： c++ 开发语言 c语言算法

于 2023-04-06 17:00:48 首次发布

本文链接：https://blog.csdn.net/qq_42536162/article/details/129989541

版权

分子动力学模拟对一个具有一定初始条件和边界条件且具有相互作用（分子力场molecular force feild）的多粒子系统的运动方程进行数值积分，得到系统在相空间（phase space）中的一条离散的轨迹（trajectory），并用统计力学的方法从这条相轨迹中提取出有用的物理结果（例如：温度、压强、自由能、构像转变机理）。

N粒子系统（3N个坐标，3N个动量构成一个6N维相空间）

初始化：

坐标初始化（使用固态氩晶体结构加上超近原子距离预警）氩晶体为单原子且只有LJ势

速度初始化（使用随机速度分布并加上0总动量约束和温度缩放校正，不用Maxwell分布）

边界条件：使用正交盒子加最小镜像约定

相互作用的LJ势：势能表达式，力表达式，两体势的xyz分解，力的xyz分解

数值积分：采用“速度-Verlet”积分方法

原子单位制：四个基本量：电子伏特(eV)、埃(A)、原子单位质量(amu)、温度(K)

推导单位制：力(eV*A^-1)，速度(eV^1/2)*(amu^-1/2)，时间(A*amu^1/2)*(eV^-1/2)飞秒时间步长，波尔兹曼常数 KB=8.617343x10^-5 eV*(K^-1)

编译与运行：

$ make                # 双精度版需要在Makefile文件中的CFLAGS选项后加上 -DUSE_DP
$ ./ljmd nx Ne        # 体系晶胞原子数量为 nx*nx*nx*4 ，Ne 代表平衡过程的步数

程序架构：

主函数（main.cu）

--- 常量、自定义结构体、浮点数类型（common.h）

--- 内存分配与释放（memory.cuh 和 memory.cu）

--- 模拟系统的初始化（initialize.h 和 initialize.cu）

--- 构建邻居列表（neighbor.h 和 neighbor.cu）

--------- 使用最小镜像（mic.h）

--- 运动方程的积分（integrate.h 和 integrate.cu）【在GPU中计算】

--------- 求力和势能（force.h 和 force.cu）【在GPU中计算】

--------------- 使用最小镜像（mic.h）

main.cu：

#include "common.h"       //单双精度，波尔兹曼常量，时间校正常量，Atom结构体，Box结构体
#include "memory.h"       //内存分配与释放
#include "initialize.h"   //坐标与速度初始化
#include "neighbor.h"     //GPU计算各原子的邻居列表
#include "integrate.h"    //GPU积分引擎
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int nx = 5;      //晶胞数量
    int Ne = 2000;   //平衡步数
    int Np = 2000;   //生产步数

    if (argc != 3)   //必须以如下格式的命令运行：   $ ./ljmd nx Ne
    { 
        printf("Usage: %s nx Ne\n", argv[0]);
        exit(1);
    }
    else
    {
        nx = atoi(argv[1]);
        Ne = atoi(argv[2]);
        Np = Ne;    //默认生产步数和平衡步数一样多，也可以自己修改源码重新编译自己想要的步数
    }

    int N = 4 * nx * nx * nx;   //总的体系原子数
    int Ns = 100;   //生产时轨迹写入间隔步数
    int MN = 200;   //Max Neighbor 最多邻居数
    real T_0 = 60.0;  //目标温度K
    real ax = 5.385;  //初始相邻两原子间距A
    real time_step = 5.0 / TIME_UNIT_CONVERSION;   //5飞秒时间步长转换为自然单位时间
    Atom atom;
    allocate_memory(N, MN, &atom);  //memory.cu中一个自定义内存分配函数，分配给atom对象各参数内存
    for (int n = 0; n < N; ++n) { atom.m[n] = 40.0; }   //原子质量
    initialize_position(nx, ax, &atom);   //位置初始化
    initialize_velocity(N, T_0, &atom);   //速度初始化
    find_neighbor(N, MN, &atom);          //计算邻居列表
    equilibration(Ne, N, MN, T_0, time_step, &atom);   //平衡过程
    production(Np, Ns, N, MN, T_0, time_step, &atom);  //生产过程
    deallocate_memory(&atom);   //memory.cu中一个自定义内存释放函数，释放atom对象各参数内存
    return 0;
}

common.h：

#pragma once

#ifdef DOUBLE_PRECISION
    typedef double real;         //使用双精度浮点数编译
#else
    typedef float real;          //使用单精度浮点数编译（默认）
#endif

#define K_B                   8.617343e-5         //波尔兹曼常量
#define TIME_UNIT_CONVERSION  1.018051e+1         //原子单位制时间 = 1fs/1.018051e+1

struct Atom       //关键数据存储两遍，两个指针，CPU内存中一遍，GPU显存中一遍
{
    real *m;
    real *x;
    real *y;
    real *z;
    real *vx;
    real *vy;
    real *vz;
    real *fx;
    real *fy;
    real *fz;
    real *pe;
    real *ke;
    real *box;

    int *g_NN;         //Neighbor Number
    int *g_NL;         //Neighbor List
    real *g_m;         //GPU中存储的质量
    real *g_x;
    real *g_y;
    real *g_z;
    real *g_vx;
    real *g_vy;
    real *g_vz;
    real *g_fx;
    real *g_fy;
    real *g_fz;
    real *g_pe;        //GPU中存储的动能
    real *g_ke;        //GPU中存储的势能
};

struct Box
{
    real lx;        //盒子x方向长度
    real ly;
    real lz;
    real lx2;       //盒子x方向长度的一半，为了方便mic变换
    real ly2;
    real lz2;
};

memory.cu：

#include "error.cuh"
#include "memory.h"
#include <stdlib.h>

void allocate_memory(int N, int MN, Atom *atom)    //自定义内存分配函数
{                                                  //分配CPU主存
    atom->m  = (real*) malloc(N * sizeof(real));   //原子质量列表
    atom->x  = (real*) malloc(N * sizeof(real));
    atom->y  = (real*) malloc(N * sizeof(real));
    atom->z  = (real*) malloc(N * sizeof(real));
    atom->vx = (real*) malloc(N * sizeof(real));
    atom->vy = (real*) malloc(N * sizeof(real));
    atom->vz = (real*) malloc(N * sizeof(real));
    atom->fx = (real*) malloc(N * sizeof(real));
    atom->fy = (real*) malloc(N * sizeof(real));
    atom->fz = (real*) malloc(N * sizeof(real));
    atom->pe = (real*) malloc(N * sizeof(real));
    atom->ke = (real*) malloc(N * sizeof(real));
    atom->box = (real*) malloc(6 * sizeof(real));   //盒子大小 x, y, z, x/2, y/2, z/2
                                                            
                                                              //分配GPU显存 
    CHECK(cudaMalloc((void**)&atom->g_NN, sizeof(int) * N));   //Neighbor Number
    CHECK(cudaMalloc((void**)&atom->g_NL, sizeof(int) * N * MN));  //Neighbor List
    CHECK(cudaMalloc((void**)&atom->g_m, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_x, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_y, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_z, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vx, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vy, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vz, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fx, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fy, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fz, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_pe, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_ke, sizeof(real) * N));
}

void deallocate_memory(Atom *atom)      //自定义内存释放函数
{
    free(atom->m);
    free(atom->x);
    free(atom->y);
    free(atom->z);
    free(atom->vx);
    free(atom->vy);
    free(atom->vz);
    free(atom->fx);
    free(atom->fy);
    free(atom->fz);
    free(atom->pe);
    free(atom->ke);
    free(atom->box);

    CHECK(cudaFree(atom->g_NN));
    CHECK(cudaFree(atom->g_NL));
    CHECK(cudaFree(atom->g_m));
    CHECK(cudaFree(atom->g_x));
    CHECK(cudaFree(atom->g_y));
    CHECK(cudaFree(atom->g_z));
    CHECK(cudaFree(atom->g_vx));
    CHECK(cudaFree(atom->g_vy));
    CHECK(cudaFree(atom->g_vz));
    CHECK(cudaFree(atom->g_fx));
    CHECK(cudaFree(atom->g_fy));
    CHECK(cudaFree(atom->g_fz));
    CHECK(cudaFree(atom->g_pe));
    CHECK(cudaFree(atom->g_ke));
}

memory.h：

#pragma once
#include "common.h"

void allocate_memory(int N, int MN, Atom *atom);
void deallocate_memory(Atom *atom);

initialize.cu：

#include "initialize.h"
#include "error.cuh"
#include <stdlib.h>
#include <math.h>

static void scale_velocity(int N, real T_0, Atom *atom)   //校正温度到目标T_0温度
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real temperature = 0.0;
    for (int n = 0; n < N; ++n) 
    {
        real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];     
        temperature += m[n] * v2; 
    }
    temperature /= 3.0 * K_B * N;    //根据“能均分定理“确定当前速度下体系的温度
    real scale_factor = sqrt(T_0 / temperature);    //确定校正到T_0温度所需的乘子
    for (int n = 0; n < N; ++n)
    { 
        vx[n] *= scale_factor;
        vy[n] *= scale_factor;
        vz[n] *= scale_factor;
    }
}

void initialize_position(int nx, real ax, Atom *atom)   //初始化坐标
{
    atom->box[0] = ax * nx;               //盒子在x轴方向上的长度
    atom->box[1] = ax * nx;
    atom->box[2] = ax * nx;
    atom->box[3] = atom->box[0] * 0.5;    //盒子在x轴方向上一半的长度，方便用于mic变换
    atom->box[4] = atom->box[1] * 0.5;
    atom->box[5] = atom->box[2] * 0.5;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;                    //以某点(ix, iy, iz)为中心，射出如下4个向量：
    real x0[4] = {0.0, 0.0, 0.5, 0.5};    //    (0.0, 0.0, 0.0), (0.0, 0.5, 0.5), 
    real y0[4] = {0.0, 0.5, 0.0, 0.5};    //    (0.5, 0.0, 0.5), (0.5, 0.5, 0.0)
    real z0[4] = {0.0, 0.5, 0.5, 0.0};    //以生成4个原子，最后再平移中心点(ix, iy, iz)进行晶胞复制
    int n = 0;
    for (int ix = 0; ix < nx; ++ix)        //nx是晶胞中原子个数
    {
        for (int iy = 0; iy < nx; ++iy)
        {
            for (int iz = 0; iz < nx; ++iz)
            {
                for (int i = 0; i < 4; ++i)
                {
                    x[n] = (ix + x0[i]) * ax;
                    y[n] = (iy + y0[i]) * ax;
                    z[n] = (iz + z0[i]) * ax;
                    n++;
                }
            }
        }
    }

    int m1 = sizeof(real) * 4 * nx * nx * nx;    //内存大小
    CHECK(cudaMemcpy(atom->g_x, atom->x, m1, cudaMemcpyHostToDevice));   //用法cudaMemcpy(device_array, host_array, size, cudaMemcpyHostToDevice)
    CHECK(cudaMemcpy(atom->g_y, atom->y, m1, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_z, atom->z, m1, cudaMemcpyHostToDevice));
}

void initialize_velocity(int N, real T_0, Atom *atom)   //分配初始速度
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real momentum_average[3] = {0.0, 0.0, 0.0};
    for (int n = 0; n < N; ++n)
    { 
        vx[n] = -1.0 + (rand() * 2.0) / RAND_MAX; //调用该函数会返回一个介于0 ~ RAND_MAX（可以通过stdlib.h中的宏定义来获取）
        vy[n] = -1.0 + (rand() * 2.0) / RAND_MAX; // -1.0 是为了把均值调为0
        vz[n] = -1.0 + (rand() * 2.0) / RAND_MAX;    
        
        momentum_average[0] += m[n] * vx[n] / N;     //动量平均
        momentum_average[1] += m[n] * vy[n] / N;
        momentum_average[2] += m[n] * vz[n] / N;
    } 
    for (int n = 0; n < N; ++n)                  //消除xyz各方向上的总平动能
    { 
        vx[n] -= momentum_average[0] / m[n];     //vx[n]是第n个原子的x方向的速度
        vy[n] -= momentum_average[1] / m[n];
        vz[n] -= momentum_average[2] / m[n]; 
    }
    scale_velocity(N, T_0, atom);    //调用之前写的缩放函数，对当前速度分布进行缩放

    CHECK(cudaMemcpy(atom->g_m, atom->m, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vx, atom->vx, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vy, atom->vy, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vz, atom->vz, sizeof(real) * N, cudaMemcpyHostToDevice));
}

initialize.h：

#pragma once
#include "common.h"

void initialize_position(int nx, real ax, Atom *atom);
void initialize_velocity(int N, real T_0, Atom *atom);

neighbor.cu：

#include "neighbor.h"
#include "mic.h"
#include <stdio.h>
#include <stdlib.h>

static void __global__ gpu_find_neighbor    //静态GPU内联核函数
(
    int N, int MN, int *g_NN, int *g_NL, Box box, 
    real *g_x, real *g_y, real *g_z, real cutoff2
)
{
    int n1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (n1 < N)
    {
        int count = 0;
        real x1 = g_x[n1];
        real y1 = g_y[n1];
        real z1 = g_z[n1];
        for (int n2 = 0; n2 < N; n2++)
        {
            real x12 = g_x[n2] - x1;
            real y12 = g_y[n2] - y1;
            real z12 = g_z[n2] - z1;
            apply_mic(box, &x12, &y12, &z12);  //盒子box是6元数组
            real d12_square = x12*x12 + y12*y12 + z12*z12;
            if ((n2 != n1) && (d12_square < cutoff2))
            {
                g_NL[count++ * N + n1] = n2;    // g_NL[]生成邻居列表 Neighbor List
            }
        }
        g_NN[n1] = count;    // g_NN[] Neighbor number 各原子邻居数
    }
}

void find_neighbor(int N, int MN, Atom *atom)   //包装函数
{
    real cutoff = 11.0;
    real cutoff2 = cutoff * cutoff;

    Box box;                    //盒子有6个元素
    box.lx = atom->box[0];      //盒子x方向的长度
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];     //盒子x轴方向上一半的长度，方便用于mic变换
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    int block_size = 128;     // RTX 3080Ti  是 1024 1024 64
    int grid_size = (N - 1) / block_size + 1;    //向上取整
    gpu_find_neighbor<<<grid_size, block_size>>>    //调用GPU核函数
    (
        N, MN, atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z, cutoff2
    );
}

neighbor.h：

#pragma once
#include "common.h"

void find_neighbor(int N, int MN, Atom *atom);

mic.h：

#pragma once

static void __device__ apply_mic   // 静态内联函数（高性能），进行最小镜像变换
(
    Box box, real *x12, real *y12, real *z12
)
{
    if      (*x12 < - box.lx2) { *x12 += box.lx; }      //box.lx2代表盒子在x轴方向上一半的长度
    else if (*x12 > + box.lx2) { *x12 -= box.lx; }
    if      (*y12 < - box.ly2) { *y12 += box.ly; }      //box.ly2代表盒子在y轴方向上一半的长度
    else if (*y12 > + box.ly2) { *y12 -= box.ly; }
    if      (*z12 < - box.lz2) { *z12 += box.lz; }      //box.lz2代表盒子在z轴方向上一半的长度
    else if (*z12 > + box.lz2) { *z12 -= box.lz; }
}

integrate.cu：

#include "integrate.h"
#include "error.cuh"
#include "force.h"
#include "reduce.h"
#include <stdio.h>
#include <math.h>
#include <time.h>

static void __global__ gpu_scale_velocity    // GPU核函数：速度缩放
(
    int N, real scale_factor, 
    real *g_vx, real *g_vy, real *g_vz
)
{
    int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    { 
        g_vx[n] *= scale_factor;
        g_vy[n] *= scale_factor;
        g_vz[n] *= scale_factor;
    }
}

static void scale_velocity(int N, real T_0, Atom *atom)    //包装函数
{
    real temperature = sum(N, atom->g_ke) / (1.5 * K_B * N);   //通过当前动能计算当前温度
    real scale_factor = sqrt(T_0 / temperature);   //和目标温度T_0之间的缩放因子

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_scale_velocity<<<grid_size, block_size>>>     // 调用核函数
    (N, scale_factor, atom->g_vx, atom->g_vy, atom->g_vz);
}

static void __global__ gpu_integrate
(
    int N, real time_step, real time_step_half,
    real *g_m, real *g_x, real *g_y, real *g_z,
    real *g_vx, real *g_vy, real *g_vz,
    real *g_fx, real *g_fy, real *g_fz, 
    real *g_ke, int flag
)
{
    int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        real mass = g_m[n];
        real mass_inv = 1.0 / mass;    //使用乘法比除法计算效率高
        real ax = g_fx[n] * mass_inv;     //加速度
        real ay = g_fy[n] * mass_inv;
        real az = g_fz[n] * mass_inv;
        real vx = g_vx[n];
        real vy = g_vy[n];
        real vz = g_vz[n];

        vx += ax * time_step_half; //更新v(t) --> v(t+dt/2) 或者 v(t+dt/2) --> v(t+dt)
        vy += ay * time_step_half;
        vz += az * time_step_half;
        g_vx[n] = vx;
        g_vy[n] = vy;
        g_vz[n] = vz;

        if (flag == 1)      //上半轮循环计算 v(t+dt/2), r(t+dt), F(t+dt)
        { 
            g_x[n] += vx * time_step; 
            g_y[n] += vy * time_step; 
            g_z[n] += vz * time_step; 
        }
        else                //下半轮循环计算 v(t+dt) 和 动能
        {
            g_ke[n] = (vx*vx + vy*vy + vz*vz) * mass * 0.5;
        }
    }
}

static void integrate(int N, real time_step, Atom *atom, int flag)   //包装函数
{
    real time_step_half = time_step * 0.5;

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_integrate<<<grid_size, block_size>>>    //调用核函数
    (
        N, time_step, time_step_half,
        atom->g_m, atom->g_x, atom->g_y, atom->g_z,
        atom->g_vx, atom->g_vy, atom->g_vz,
        atom->g_fx, atom->g_fy, atom->g_fz, 
        atom->g_ke, flag
    );
}

void equilibration
(
    int Ne, int N, int MN, real T_0, 
    real time_step, Atom *atom
)
{
    find_force(N, MN, atom);
    for (int step = 0; step < Ne; ++step)
    { 
        integrate(N, time_step, atom, 1);   //模式1，上半轮积分，v(t+dt/2)，r(t+dt)
        find_force(N, MN, atom);   //调用函数，计算力, F(t+dt)
        integrate(N, time_step, atom, 2);   //模式2，下半轮积分，v(t+dt)，总动能
        scale_velocity(N, T_0, atom);   //热交换控温，缩放速度
    } 
}

void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
)
{
    float t_force = 0.0f;
    CHECK(cudaDeviceSynchronize());
    clock_t t_total_start = clock();   //总计时开始

    FILE *fid_e = fopen("energy.txt", "w");     //产生一个写模式文件流句柄fid
    for (int step = 0; step < Np; ++step)
    {  
        integrate(N, time_step, atom, 1);   //模式1，上半轮积分，v(t+dt/2)，r(t+dt)

        CHECK(cudaDeviceSynchronize());
        clock_t t_force_start = clock();   //力计算部分计时开始

        find_force(N, MN, atom);   //调用函数，计算力, F(t+dt)

        CHECK(cudaDeviceSynchronize());
        clock_t t_force_stop = clock();      //力计算部分计时结束

        t_force += float(t_force_stop - t_force_start) / CLOCKS_PER_SEC;  //力计时差

        integrate(N, time_step, atom, 2);   //模式2，下半轮积分，v(t+dt)，总动能

        if (0 == step % Ns)   //Ns：文件写入间隔
        {
            fprintf(fid_e, "%g %g\n", sum(N, atom->g_ke), sum(N, atom->g_pe));  //写入总动能，总势能
        }
    }
    fclose(fid_e);   //关闭文件流

    clock_t t_total_stop = clock();    //总计时结束

    float t_total = float(t_total_stop - t_total_start) / CLOCKS_PER_SEC;   //总计时差
    printf("Time used for production = %g s\n", t_total);
    printf("Time used for force part = %g s\n", t_force);
}

integrate.h：

#pragma once
#include "common.h"

void equilibration
(
    int Ne, int N, int MN, real T_0, 
    real time_step, Atom *atom
);

void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
);

force.cu：

#include "error.cuh"
#include "force.h"
#include "mic.h"

struct LJ             // LJ 势结构体，储存一些常量，避免反复计算
{
    real cutoff2;
    real e24s6; 
    real e48s12;
    real e4s6;
    real e4s12;
};

static void __global__ gpu_find_force
(
    LJ lj, int N, int *g_NN, int *g_NL, Box box,
    real *g_x, 
    real *g_y, 
    real *g_z,
    real *g_fx, real *g_fy, real *g_fz, real *g_pe
)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;    //将GPU线程一维序列化
    if (i < N)         //一个原子i开一个线程
    {
        real fx = 0.0;
        real fy = 0.0;
        real fz = 0.0;
        real potential = 0.0;
        int NN = g_NN[i];
        real x_i = g_x[i];
        real y_i = g_y[i];
        real z_i = g_z[i];
        for (int k = 0; k < NN; ++k)    //（单线程内）逃脱不了的对第二个原子j的循环
        {   
            int j = g_NL[i + N * k];
            real x_ij  = g_x[j] - x_i;
            real y_ij  = g_y[j] - y_i;
            real z_ij  = g_z[j] - z_i;
            apply_mic(box, &x_ij, &y_ij, &z_ij);       //需要box信息
            real r2 = x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;
            if (r2 > lj.cutoff2) { continue; }

            real r2inv = 1.0 / r2;
            real r4inv = r2inv * r2inv;       //用乘法代替除法，提高计算效率
            real r6inv = r2inv * r4inv;
            real r8inv = r4inv * r4inv;
            real r12inv = r4inv * r8inv;
            real r14inv = r6inv * r8inv;
            real f_ij = lj.e24s6 * r8inv - lj.e48s12 * r14inv;     //ij之间的力
            potential += lj.e4s12 * r12inv - lj.e4s6 * r6inv;      //ij之间的势
            fx += f_ij * x_ij;           //i原子对j求和，LJ势受力的xyz方向分解
            fy += f_ij * y_ij;
            fz += f_ij * z_ij;
        }
        g_fx[i] = fx;      // i原子x方向上的合力
        g_fy[i] = fy; 
        g_fz[i] = fz; 
        g_pe[i] = potential * 0.5;   // i原子 LJ势
    }
}

void find_force(int N, int MN, Atom *atom)       //包装函数
{
    const real epsilon = 1.032e-2;               //计算存储常量
    const real sigma = 3.405;
    const real cutoff = 10.0;
    const real cutoff2 = cutoff * cutoff;
    const real sigma_3 = sigma * sigma * sigma;
    const real sigma_6 = sigma_3 * sigma_3;
    const real sigma_12 = sigma_6 * sigma_6;
    const real e24s6 = 24.0 * epsilon * sigma_6; 
    const real e48s12 = 48.0 * epsilon * sigma_12;
    const real e4s6 = 4.0 * epsilon * sigma_6;
    const real e4s12 = 4.0 * epsilon * sigma_12;
    LJ lj;                   //给LJ结构体赋值
    lj.cutoff2 = cutoff2;
    lj.e24s6 = e24s6;
    lj.e48s12 = e48s12;
    lj.e4s6 = e4s6;
    lj.e4s12 = e4s12;

    Box box;
    box.lx = atom->box[0];
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_find_force<<<grid_size, block_size>>>        //调用GPU的核函数
    (
        lj, N,  atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z,
        atom->g_fx, atom->g_fy, atom->g_fz, atom->g_pe
    );
}

force.h：

#pragma once
#include "common.h"

void find_force(int N, int MN, Atom *atom);

reduce.cu：

#include "reduce.h"
#include "error.cuh"
#include <cooperative_groups.h>         //使用线程协作组提高性能
using namespace cooperative_groups;
const int block_size = 128;

void __global__ reduce_cp(const real *d_x, real *d_y, const int N)     //高性能数组归约求和函数
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < N; n += stride)
    {
        y += d_x[n];
    }
    s_y[tid] = y;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}

__device__ real static_y[block_size];

real sum(const int N, const real *d_x)     //包装后的求和函数sum
{
    real *d_y;
    CHECK(cudaGetSymbolAddress((void**)&d_y, static_y));
    
    const int smem = sizeof(real) * block_size;

    reduce_cp<<<block_size, block_size, smem>>>(d_x, d_y, N);
    reduce_cp<<<1, block_size, smem>>>(d_y, d_y, block_size);

    real h_y[1] = {0};
    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));

    return h_y[0];
}

reduce.h：

#include "common.h"

real sum(const int N, const real *d_x);

error.cuh：

#pragma once         //错误检查宏函数
#include <stdio.h>

#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)