Cell Linked List
概念
🌟cell linked list算法是一种高效的邻域搜索算法,广泛应用于分子动力学(Molecular Dynamics, MD)模拟中,用于计算原子之间的相互作用力。该算法通过将模拟区域划分成多个小的立方体单元(cells),并使用链表来组织每个单元中的原子数据,从而加速邻域搜索过程。
⭐️过程:通常在有限的截止距离𝑟𝑐内计算相互作用。一个原子只与该截止距离内的原子相互作用,这极大地减少了需要计算的相互作用数量。Cell Linked List通过将模拟盒子分割成更小的单元,并且仅考虑每个单元及其相邻单元内的相互作用。
Cell Linked List步骤
-
划分模拟区域:将整个模拟区域划分为一系列小的立方体单元(cells)。这些单元的边长通常选择为原子相互作用力的截断半径(cutoff radius) 𝑟𝑐 的值,这样可以确保每个单元只需要与其周围的单元进行相互作用计算。
-
链表构建:
-
定义数据结构:
// 数据结构定义 float4* coords; // 存储原子坐标的数组 int* cell_starts; // 存储每个单元链表起始索引的数组 int* cell_ends; // 存储每个单元链表结束索引的数组 int* cell_list; // 存储链表中每个原子索引的数组
-
构建细胞链表:每个线程处理一个原子,计算其所在的单元索引。使用原子操作更新单元结束索引,记录原子在细胞链表中的位置。
__global__ void build_cell_list(int n_atoms, float4* coords, int* cell_starts, int* cell_ends, int* cell_list, int3 n_cells, float3 box_size, float cell_size) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_atoms) { float4 coord = coords[i]; int3 cell_idx; cell_idx.x = floorf(coord.x / cell_size); cell_idx.y = floorf(coord.y / cell_size); cell_idx.z = floorf(coord.z / cell_size); cell_idx.x = (cell_idx.x + n_cells.x) % n_cells.x; cell_idx.y = (cell_idx.y + n_cells.y) % n_cells.y; cell_idx.z = (cell_idx.z + n_cells.z) % n_cells.z; int cell_idx_flat = cell_idx.x + cell_idx.y * n_cells.x + cell_idx.z * n_cells.x * n_cells.y; int cell_start = atomicAdd(&cell_ends[cell_idx_flat], 1); cell_list[cell_start] = i; } }
-
-
计算相互作用:每个线程处理一个原子,遍历其所在单元及相邻单元中的原子。如果两个原子之间的距离小于截止距离,则计算并更新它们之间的相互作用力。
__global__ void compute_interactions(int n_atoms, float4* coords, float4* forces, int* cell_starts, int* cell_ends, int* cell_list, int3 n_cells, float3 box_size, float cell_size, float cutoff_squared) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_atoms) { float4 coord_i = coords[i]; float4 force_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f); int3 cell_idx; cell_idx.x = floorf(coord_i.x / cell_size); cell_idx.y = floorf(coord_i.y / cell_size); cell_idx.z = floorf(coord_i.z / cell_size); cell_idx.x = (cell_idx.x + n_cells.x) % n_cells.x; cell_idx.y = (cell_idx.y + n_cells.y) % n_cells.y; cell_idx.z = (cell_idx.z + n_cells.z) % n_cells.z; for (int dx = -1; dx <= 1; ++dx) { for (int dy = -1; dy <= 1; ++dy) { for (int dz = -1; dz <= 1; ++dz) { int3 neighbor_idx = make_int3( (cell_idx.x + dx + n_cells.x) % n_cells.x, (cell_idx.y + dy + n_cells.y) % n_cells.y, (cell_idx.z + dz + n_cells.z) % n_cells.z ); int neighbor_idx_flat = neighbor_idx.x + neighbor_idx.y * n_cells.x + neighbor_idx.z * n_cells.x * n_cells.y; for (int j = cell_starts[neighbor_idx_flat]; j < cell_ends[neighbor_idx_flat]; ++j) { int atom_j = cell_list[j]; if (i < atom_j) { float4 coord_j = coords[atom_j]; float3 r_ij = make_float3(coord_i.x - coord_j.x, coord_i.y - coord_j.y, coord_i.z - coord_j.z); if (r_ij.x > box_size.x * 0.5f) r_ij.x -= box_size.x; if (r_ij.y > box_size.y * 0.5f) r_ij.y -= box_size.y; if (r_ij.z > box_size.z * 0.5f) r_ij.z -= box_size.z; if (r_ij.x < -box_size.x * 0.5f) r_ij.x += box_size.x; if (r_ij.y < -box_size.y * 0.5f) r_ij.y += box_size.y; if (r_ij.z < -box_size.z * 0.5f) r_ij.z += box_size.z; float r_squared = r_ij.x * r_ij.x + r_ij.y * r_ij.y + r_ij.z * r_ij.z; if (r_squared < cutoff_squared) { float force_magnitude = ...; // 根据相互作用势能计算力的大小 float3 force_ij = make_float3( force_magnitude * r_ij.x, force_magnitude * r_ij.y, force_magnitude * r_ij.z ); atomicAdd(&forces[i].x, force_ij.x); atomicAdd(&forces[i].y, force_ij.y); atomicAdd(&forces[i].z, force_ij.z); atomicAdd(&forces[atom_j].x, -force_ij.x); atomicAdd(&forces[atom_j].y, -force_ij.y); atomicAdd(&forces[atom_j].z, -force_ij.z); } } } } } } } }
-
main调用:初始化原子坐标数组和其他辅助数组,分配设备内存。调用 build_cell_list 内核函数构建细胞链表, 调用 compute_interactions 内核函数计算相互作用。
// 初始化和分配内存 int n_atoms = ...; int3 n_cells = ...; float3 box_size = ...; float cell_size = ...; float cutoff_squared = ...; float4* h_coords = ...; // 初始化原子坐标 float4* d_coords; int* d_cell_starts; int* d_cell_ends; int* d_cell_list; float4* d_forces; cudaMalloc(&d_coords, n_atoms * sizeof(float4)); cudaMalloc(&d_cell_starts, n_cells.x * n_cells.y * n_cells.z * sizeof(int)); cudaMalloc(&d_cell_ends, n_cells.x * n_cells.y * n_cells.z * sizeof(int)); cudaMalloc(&d_cell_list, n_atoms * sizeof(int)); cudaMalloc(&d_forces, n_atoms * sizeof(float4)); cudaMemcpy(d_coords, h_coords, n_atoms * sizeof(float4), cudaMemcpyHostToDevice); cudaMemset(d_cell_starts, 0, n_cells.x * n_cells.y * n_cells.z * sizeof(int)); cudaMemset(d_cell_ends, 0, n_cells.x * n_cells.y * n_cells.z * sizeof(int)); cudaMemset(d_forces, 0, n_atoms * sizeof(float4)); // 调用内核函数 int blockSize = 256; int numBlocks = (n_atoms + blockSize - 1) / blockSize; build_cell_list<<<numBlocks, blockSize>>>(n_atoms, d_coords, d_cell_starts, d_cell_ends, d_cell_list, n_cells, box_size, cell_size); compute_interactions<<<numBlocks, blockSize>>>(n_atoms, d_coords, d_forces, d_cell_starts, d_cell_ends, d_cell_list, n_cells, box_size, cell_size, cutoff_squared); // 释放内存 cudaFree(d_coords); cudaFree(d_cell_starts); cudaFree(d_cell_ends); cudaFree(d_cell_list); cudaFree(d_forces);
示例代码
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
// 内核函数:构建细胞链表
__global__ void build_cell_list(int n_atoms, float4* coords, int* cell_starts, int* cell_ends, int* cell_list, int3 n_cells, float3 box_size, float cell_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n_atoms) {
float4 coord = coords[i];
int3 cell_idx;
cell_idx.x = floorf(coord.x / cell_size);
cell_idx.y = floorf(coord.y / cell_size);
cell_idx.z = floorf(coord.z / cell_size);
cell_idx.x = (cell_idx.x + n_cells.x) % n_cells.x;
cell_idx.y = (cell_idx.y + n_cells.y) % n_cells.y;
cell_idx.z = (cell_idx.z + n_cells.z) % n_cells.z;
int cell_idx_flat = cell_idx.x + cell_idx.y * n_cells.x + cell_idx.z * n_cells.x * n_cells.y;
int cell_start = atomicAdd(&cell_ends[cell_idx_flat], 1);
cell_list[cell_start] = i;
}
}
// 内核函数:计算原子之间的相互作用
__global__ void compute_interactions(int n_atoms, float4* coords, float4* forces, int* cell_starts, int* cell_ends, int* cell_list, int3 n_cells, float3 box_size, float cell_size, float cutoff_squared) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n_atoms) {
float4 coord_i = coords[i];
float4 force_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
int3 cell_idx;
cell_idx.x = floorf(coord_i.x / cell_size);
cell_idx.y = floorf(coord_i.y / cell_size);
cell_idx.z = floorf(coord_i.z / cell_size);
cell_idx.x = (cell_idx.x + n_cells.x) % n_cells.x;
cell_idx.y = (cell_idx.y + n_cells.y) % n_cells.y;
cell_idx.z = (cell_idx.z + n_cells.z) % n_cells.z;
for (int dx = -1; dx <= 1; ++dx) {
for (int dy = -1; dy <= 1; ++dy) {
for (int dz = -1; dz <= 1; ++dz) {
int3 neighbor_idx = make_int3(
(cell_idx.x + dx + n_cells.x) % n_cells.x,
(cell_idx.y + dy + n_cells.y) % n_cells.y,
(cell_idx.z + dz + n_cells.z) % n_cells.z
);
int neighbor_idx_flat = neighbor_idx.x + neighbor_idx.y * n_cells.x + neighbor_idx.z * n_cells.x * n_cells.y;
for (int j = cell_starts[neighbor_idx_flat]; j < cell_ends[neighbor_idx_flat]; ++j) {
int atom_j = cell_list[j];
if (i < atom_j) {
float4 coord_j = coords[atom_j];
float3 r_ij = make_float3(coord_i.x - coord_j.x, coord_i.y - coord_j.y, coord_i.z - coord_j.z);
if (r_ij.x > box_size.x * 0.5f) r_ij.x -= box_size.x;
if (r_ij.y > box_size.y * 0.5f) r_ij.y -= box_size.y;
if (r_ij.z > box_size.z * 0.5f) r_ij.z -= box_size.z;
if (r_ij.x < -box_size.x * 0.5f) r_ij.x += box_size.x;
if (r_ij.y < -box_size.y * 0.5f) r_ij.y += box_size.y;
if (r_ij.z < -box_size.z * 0.5f) r_ij.z += box_size.z;
float r_squared = r_ij.x * r_ij.x + r_ij.y * r_ij.y + r_ij.z * r_ij.z;
if (r_squared < cutoff_squared) {
float force_magnitude = 10; // 根据相互作用势能计算力的大小
float3 force_ij = make_float3(
force_magnitude * r_ij.x,
force_magnitude * r_ij.y,
force_magnitude * r_ij.z
);
atomicAdd(&forces[i].x, force_ij.x);
atomicAdd(&forces[i].y, force_ij.y);
atomicAdd(&forces[i].z, force_ij.z);
atomicAdd(&forces[atom_j].x, -force_ij.x);
atomicAdd(&forces[atom_j].y, -force_ij.y);
atomicAdd(&forces[atom_j].z, -force_ij.z);
}
}
}
}
}
}
}
}
// 内核函数:更新原子位置和速度
__global__ void update_positions(int n_atoms, float4* coords, float4* velocities, float4* forces, float dt) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n_atoms) {
float4 force = forces[i];
float3 acceleration = make_float3(force.x, force.y, force.z);
velocities[i].x += acceleration.x * dt;
velocities[i].y += acceleration.y * dt;
velocities[i].z += acceleration.z * dt;
coords[i].x += velocities[i].x * dt;
coords[i].y += velocities[i].y * dt;
coords[i].z += velocities[i].z * dt;
}
}
int main() {
int n_atoms = 1000;
int3 n_cells = make_int3(10, 10, 10);
float3 box_size = make_float3(10.0f, 10.0f, 10.0f);
float cell_size = box_size.x / n_cells.x;
float cutoff = 2.5f;
float cutoff_squared = cutoff * cutoff;
int n_steps = 1000;
float dt = 0.001f; // 时间步长
float4* h_coords = (float4*)malloc(n_atoms * sizeof(float4));
float4* h_velocities = (float4*)malloc(n_atoms * sizeof(float4));
float4* d_coords;
float4* d_velocities;
float4* d_forces;
int* d_cell_starts;
int* d_cell_ends;
int* d_cell_list;
// 初始化示例数据
for (int i = 0; i < n_atoms; ++i) {
h_coords[i] = make_float4((float)rand() / RAND_MAX * box_size.x,
(float)rand() / RAND_MAX * box_size.y,
(float)rand() / RAND_MAX * box_size.z,
0.0f);
h_velocities[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
cudaMalloc(&d_coords, n_atoms * sizeof(float4));
cudaMalloc(&d_velocities, n_atoms * sizeof(float4));
cudaMalloc(&d_forces, n_atoms * sizeof(float4));
cudaMalloc(&d_cell_starts, n_cells.x * n_cells.y * n_cells.z * sizeof(int));
cudaMalloc(&d_cell_ends, n_cells.x * n_cells.y * n_cells.z * sizeof(int));
cudaMalloc(&d_cell_list, n_atoms * sizeof(int));
cudaMemcpy(d_coords, h_coords, n_atoms * sizeof(float4), cudaMemcpyHostToDevice);
cudaMemcpy(d_velocities, h_velocities, n_atoms * sizeof(float4), cudaMemcpyHostToDevice);
cudaMemset(d_cell_starts, 0, n_cells.x * n_cells.y * n_cells.z * sizeof(int));
cudaMemset(d_cell_ends, 0, n_cells.x * n_cells.y * n_cells.z * sizeof(int));
cudaMemset(d_forces, 0, n_atoms * sizeof(float4));
int blockSize = 256;
int numBlocks = (n_atoms + blockSize - 1) / blockSize;
for (int step = 0; step < n_steps; ++step) {
cudaMemset(d_forces, 0, n_atoms * sizeof(float4)); // 重置力数组
build_cell_list<<<numBlocks, blockSize>>>(n_atoms, d_coords, d_cell_starts, d_cell_ends, d_cell_list, n_cells, box_size, cell_size);
compute_interactions<<<numBlocks, blockSize>>>(n_atoms, d_coords, d_forces, d_cell_starts, d_cell_ends, d_cell_list, n_cells, box_size, cell_size, cutoff_squared);
update_positions<<<numBlocks, blockSize>>>(n_atoms, d_coords, d_velocities, d_forces, dt);
// 检查原子是否越界,如果是则应用周期性边界条件
cudaMemcpy(h_coords, d_coords, n_atoms * sizeof(float4), cudaMemcpyDeviceToHost);
for (int i = 0; i < n_atoms; ++i) {
h_coords[i].x = fmodf(h_coords[i].x, box_size.x);
h_coords[i].y = fmodf(h_coords[i].y, box_size.y);
h_coords[i].z = fmodf(h_coords[i].z, box_size.z);
}
cudaMemcpy(d_coords, h_coords, n_atoms * sizeof(float4), cudaMemcpyHostToDevice);
// 输出当前步的统计信息
if (step % 100 == 0) {
float4 total_momentum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
cudaMemcpy(&total_momentum, d_velocities, sizeof(float4), cudaMemcpyDeviceToHost);
for (int i = 1; i < n_atoms; ++i) {
float4 velocity;
cudaMemcpy(&velocity, d_velocities + i, sizeof(float4), cudaMemcpyDeviceToHost);
total_momentum.x += velocity.x;
total_momentum.y += velocity.y;
total_momentum.z += velocity.z;
}
std::cout << "Step " << step << ": Total Momentum = (" << total_momentum.x << ", " << total_momentum.y << ", " << total_momentum.z << ")" << std::endl;
}
}
cudaFree(d_coords);
cudaFree(d_velocities);
cudaFree(d_forces);
cudaFree(d_cell_starts);
cudaFree(d_cell_ends);
cudaFree(d_cell_list);
free(h_coords);
free(h_velocities);
return 0;
}