之前写一道题将多重for循环改编成并行,由于没有使用同步函数,导致CPU端主线程和GPU端的数据传输出现了问题,下面是最终成功的样例代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define G 9.8f //gravitational constant 重力
#define dt 0.01f //time step 时间片的具体时间
#define SOFTENING 2.0f //softening parameter to help with numerical instability 变量(空气的影响参数?
struct nbody {
float x, y, vx, vy, m;//m=质量
};
typedef struct nbody nbody;
//void print_help();
void step(void);
void d_step(void);
__global__ void d_step3(int* k, float* x_total, float* y_total,const nbody*input_data);
int N = 1000; // Number of Nbody
int D = 3; // dimension 尺寸?体积?
int Iter = 1000; // iteration number 迭代数
nbody* input_data;
float* den_arr; //dencity array 密度
__device__ float d_x_total = 0;
__device__ float d_y_total = 0;
int main() {
input_data = (nbody*)malloc(sizeof(nbody) * N);
den_arr = (float*)malloc(sizeof(float) * D * D);
if (!input_data || !den_arr) {
printf("malloc failed!\n");
exit(0);
}
for (int i = 0; i < N; i++) {
input_data[i].m = 1.0 / N;
input_data[i].x = (float)rand() / (float)RAND_MAX;
input_data[i].y = (float)rand() / (float)RAND_MAX;
input_data[i].vx = 0;
input_data[i].vy = 0;
}
unsigned long long start1, end1;
//这里是global代码:
//创建新的device数组保存用于计算:
start1 = clock();
//进行device端计算:
d_step();
end1 = clock();
//释放device内存数据
printf("t2= %lf\n", (double)(end1 - start1) / CLOCKS_PER_SEC);
//host端代码:
unsigned long long start, end;
start = clock();
//step();
end = clock();
printf("t1 = %lf\n", (double)(end - start) / CLOCKS_PER_SEC);
int flag = 1;
for (int i = 0; i < D * D; i++)
{
//if (d2[i] != den_arr[i]) { printf("就散错误!\n"); flag = 0; break; }
}
double t1 = (double)(end - start) / CLOCKS_PER_SEC;
double t2 = (double)(end1 - start1) / CLOCKS_PER_SEC;
if (flag)printf("6666加速比是:%lf", t1 / t2);
//Free memmory
free(input_data);
free(den_arr);
return 0;
}
void d_step(void) {
float* ax_arr = NULL; // Array to store a
float* ay_arr = NULL;
//Initialize density array 初始化
for (int i = 0; i < D * D; i++) {
den_arr[i] = 0;
}
//Allocate memory to acceleration arrary 分配内存到加速阵列
ax_arr = (float*)malloc(N * sizeof(float));
ay_arr = (float*)malloc(N * sizeof(float));
int i;
float* x_temp;
float* y_temp;
float* x_total[1] = { 0 };
float* y_total[1] = { 0 };
cudaMalloc((void**)&x_temp, sizeof(float) * 2);
cudaMalloc((void**)&y_temp, sizeof(float) * 2);
for (i = 0; i < Iter; i++) {
int k;
//printf("i=%d\n",i);
int* d_hape;
nbody* d_input_data;
cudaMalloc((void**)&d_hape, sizeof(int) * 2);
// Use for calculate a
for (k = 0; k < N; k++) {
cudaMalloc((void**)&d_input_data, sizeof(nbody) * N);
int hape[1] = { 1 };
cudaMemcpy(d_input_data, input_data, sizeof(nbody) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_hape, hape, sizeof(int)*2, cudaMemcpyHostToDevice);
//printf("%lf %lf\n", input_data[k].x, input_data[k].y);
d_step3 << <1,1000>> > (d_hape, x_temp, y_temp, d_input_data);
cudaThreadSynchronize();
cudaMemcpy(input_data, d_input_data, sizeof(nbody) * N, cudaMemcpyDeviceToHost);
cudaMemcpy(x_total, x_temp, sizeof(float) * 2, cudaMemcpyDeviceToHost);
cudaMemcpy( y_total, y_temp, sizeof(float) * 2, cudaMemcpyDeviceToHost);
//printf("k=%d x_total=%lf y_total=%lf\n", k, x_total[0], y_total[0]);
//printf("k=%d x_total=%lf y_total=%lf\n", k, x_total, y_total);
//printf("k=%d x_total=%lf y_total=%lf\n",k, x_total[0], y_total[0]);
ax_arr[k] = G * x_total[0];
ay_arr[k] = G * y_total[0];
input_data[k].vx += ax_arr[k] * dt; //velocity 加速度*时间=在这个时间片中的平均速度
input_data[k].vy += ay_arr[k] * dt;
input_data[k].x += input_data[k].vx * dt; // location 新的位置=这个时间片中的位移量+原来的位置
input_data[k].y += input_data[k].vy * dt;
//当x和y的位移量用时大于并且0小于1时,
if (input_data[k].x < 1 && input_data[k].y < 1 && input_data[k].x>0 && input_data[k].y>0) {
int x_index = floor(input_data[k].x / (1.0 / D));//位移*体积=密度?
int y_index = floor(input_data[k].y / (1.0 / D));//floor:向左取整floor(1.7)=1,floor(-2.3)=-3
den_arr[x_index * D + y_index] += (float)1.0 / N;//x轴的密度*体积+y轴的密度=最终密度?
}
cudaFree(d_input_data);
cudaFree(y_total);
cudaFree(x_total);
cudaFree(d_hape);
}
}
for (int p = 0; p < 9; p++) {
printf("%lf ", den_arr[p]);
}
printf("\n");
free(ax_arr);
free(ay_arr);
}
void step(void)
{
float* ax_arr = NULL; // Array to store a
float* ay_arr = NULL;
//Initialize density array 初始化
for (int i = 0; i < D * D; i++) {
den_arr[i] = 0;
}
//Allocate memory to acceleration arrary 分配内存到加速阵列
ax_arr = (float*)malloc(N * sizeof(float));
ay_arr = (float*)malloc(N * sizeof(float));
int i;
for (i = 0; i < Iter; i++) {
int k;
for (k = 0; k < N; k++) {
float x_total = 0;
float y_total = 0; // Use for calculate a
for (int j = 0; j < N; j++) {
if (k == j) continue;
float x_below, y_below;
float x_up, y_up;
x_below = pow(pow(input_data[j].x - input_data[k].x, 2) + pow(SOFTENING, 2), 3.0 / 2);
x_up = input_data[j].m * (input_data[j].x - input_data[k].x);
x_total += x_up / x_below;
y_below = pow(pow(input_data[j].y - input_data[k].y, 2) + pow(SOFTENING, 2), 3.0 / 2);
y_up = input_data[j].m * (input_data[j].y - input_data[k].y);
y_total += y_up / y_below;
}
ax_arr[k] = G * x_total;
ay_arr[k] = G * y_total;
input_data[k].vx += ax_arr[k] * dt; //velocity 加速度*时间=在这个时间片中的平均速度
input_data[k].vy += ay_arr[k] * dt;
input_data[k].x += input_data[k].vx * dt; // location 新的位置=这个时间片中的位移量+原来的位置
input_data[k].y += input_data[k].vy * dt;
//当x和y的位移量用时大于并且0小于1时,
if (input_data[k].x < 1 && input_data[k].y < 1 && input_data[k].x>0 && input_data[k].y>0) {
//printf("yeah!\n");
int x_index = floor(input_data[k].x / (1.0 / D));//位移*体积=密度?
int y_index = floor(input_data[k].y / (1.0 / D));//floor:向左取整floor(1.7)=1,floor(-2.3)=-3
den_arr[x_index * D + y_index] += (float)1.0 / N;//x轴的密度*体积+y轴的密度=最终密度?
}
}
}
for (int p = 0; p < 9; p++) {
printf("%lf ", den_arr[p]);
}
printf("\n");
free(ax_arr);
free(ay_arr);
}
__global__ void d_step3( int* k, float* x_total, float* y_total, const nbody* input_data)
{
int j = threadIdx.x;
//printf("%d:k=%d x_total=%lf y_total=%lf\n", j,k, x_total[0], y_total[0]);
d_x_total = 0;
d_y_total = 0;
__syncthreads();
if (k[0]!= j)
{
float x_below, y_below;
float x_up, y_up;
x_below = powf(powf(input_data[j].x - input_data[k[0]].x, 2) + powf(SOFTENING, 2), 3.0 / 2);
//printf("%lf %lf\n", input_data[j].x, input_data[k].x);
x_up = input_data[j].m * (input_data[j].x - input_data[k[0]].x);
atomicAdd(&x_total[0], x_up / x_below);
y_below = powf(powf(input_data[j].y - input_data[k[0]].y, 2) + powf(SOFTENING, 2), 3.0 / 2);
y_up = input_data[j].m * (input_data[j].y - input_data[k[0]].y);
atomicAdd(&y_total[0], y_up / y_below);
//printf("%d:k=%d x_total=%lf y_total=%lf\n", j, k, x, y);
}
__syncthreads();
x_total[0] = d_x_total;
y_total[0] = d_y_total;
//printf("k=%d x_total=%lf y_total=%lf\n", k[0], x_total[0], y_total[0]);
}
为了方便说明我将使用同步函数的代码块单独拿出来:
for (k = 0; k < N; k++) {
cudaMalloc((void**)&d_input_data, sizeof(nbody) * N);
int hape[1] = { 1 };
cudaMemcpy(d_input_data, input_data, sizeof(nbody) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_hape, hape, sizeof(int)*2, cudaMemcpyHostToDevice);
//printf("%lf %lf\n", input_data[k].x, input_data[k].y);
d_step3 << <1,1000>> > (d_hape, x_temp, y_temp, d_input_data);
cudaThreadSynchronize();
在d_step函数调用核函数的过程中,我使用了两个for循环调用核函数,倘若这里不使用cudaThreadSynchronize();函数让CPU端代码同步等待GPU端线程计算完毕,CPU端传给核函数的数据k就会出现错误,导致只能传入k=0或者k的值只传入了一部分,并没有全部传入的情况。
加入cudaThreadSynchronize();函数后,CPU端的主线程就会等待核函数里的线程全部计算完毕后再继续进行。