// error.cuh
#pragma once
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#define CHECK(call) \
do \
{ \
const cudaError_t error_code = call; \
if (error_code != cudaSuccess) \
{ \
printf("CUDA Error:\n"); \
printf(" File: %s\n", __FILE__); \
printf(" Line: %d\n", __LINE__); \
printf(" Error code: %d\n", error_code); \
printf(" Error text: %s\n", \
cudaGetErrorString(error_code)); \
exit(1); \
} \
} while (0)
// reduce2gpu.cu
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include "error.cuh"
#include <stdio.h>
#include <cmath>
const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(float) * N;
const int BLOCK_SIZE = 128;
void timing(float* h_x, float* d_x, const int method);
int main()
{
float* h_x = (float*)malloc(M);
for (int n = 0; n < N; ++n) {
h_x[n] = 1.23;
}
float* d_x;
CHECK(cudaMalloc(&d_x, M));
printf("\nUsing global memory only:\n");
timing(h_x, d_x, 0);
printf("\nUsing static shared memory:\n");
timing(h_x, d_x, 1);
printf("\nUsing static shared memory:\n");
timing(h_x, d_x, 2);
free(h_x);
CHECK(cudaFree(d_x));
return 0;
}
/*
* blockSize中的所有数据按照元素相加,然后对每个gridSize进行再求和
*/
void __global__ reduce_global(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
float* x = d_x + blockDim.x * blockIdx.x; // blockIdx.x 对应的是 gridDim
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
x[tid] += x[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[blockIdx.x] = x[0];
}
}
void __global__ reduce_shared(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
__shared__ float s_y[128];
s_y[tid] = (n < N) ? d_x[n] : 0.;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[bid] = s_y[0];
}
}
void __global__ reduce_dynamic(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ float s_y[];
s_y[tid] = (n < N) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[bid] = s_y[0];
}
}
float reduce(float* d_x, const int method)
{
int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int ymem = sizeof(float) * grid_size;
const int smem = sizeof(float) * BLOCK_SIZE;
float* d_y;
CHECK(cudaMalloc(&d_y, ymem));
float* h_y = (float*)malloc(ymem);
switch (method)
{
case 0:
reduce_global << <grid_size, BLOCK_SIZE >> > (d_x, d_y);
break;
case 1:
reduce_shared << <grid_size, BLOCK_SIZE >> > (d_x, d_y);
break;
case 2:
reduce_dynamic << <grid_size, BLOCK_SIZE, smem >> > (d_x, d_y);
break;
default:
printf("Error: wrong method\n");
exit(1);
break;
}
CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));
float result = 0.;
for (int n = 0; n < grid_size; ++n)
{
result += h_y[n];
}
free(h_y);
CHECK(cudaFree(d_y));
return result;
}
void timing(float* h_x, float* d_x, const int method)
{
float sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x, method);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("sum = %f.\n", sum);
}
代码简单讲解:
定义一个长度为100000000的数组
将数据分为cile(100000000/128)个分区, 然后对每个分区计算一个区域求和,然后对总的分区进行求和。
三个做法都是如此
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include "error.cuh"
#include <stdio.h>
#include <cmath>
const int NUM_REPEATS = 1;
const int N = 100000000;
const int M = sizeof(float) * N;
const int BLOCK_SIZE = 128;
void timing(float* h_x, float* d_x, const int method);
int main()
{
float* h_x = (float*)malloc(M);
for (int n = 0; n < N; ++n) {
h_x[n] = 1.23;
}
float* d_x;
CHECK(cudaMalloc(&d_x, M));
/*printf("\nUsing global memory only:\n");
timing(h_x, d_x, 0);
printf("\nUsing static shared memory:\n");
timing(h_x, d_x, 1);
printf("\nUsing static shared memory:\n");
timing(h_x, d_x, 2);*/
printf("\nUsing atomicAdd operation:\n");
timing(h_x, d_x, 3);
free(h_x);
CHECK(cudaFree(d_x));
return 0;
}
/*
* blockSize中的所有数据按照元素相加,然后对每个gridSize进行再求和
*/
void __global__ reduce_global(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
float* x = d_x + blockDim.x * blockIdx.x; // blockIdx.x 对应的是 gridDim
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
x[tid] += x[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[blockIdx.x] = x[0];
}
}
void __global__ reduce_shared(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
__shared__ float s_y[128];
s_y[tid] = (n < N) ? d_x[n] : 0.;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[bid] = s_y[0];
}
}
void __global__ reduce_dynamic(float* d_x, float* d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ float s_y[];
s_y[tid] = (n < N) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
d_y[bid] = s_y[0];
}
}
void __global__ reduce_atomic(const float* d_x, float* d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + threadIdx.x;
__shared__ float s_y[128];
s_y[tid] = (n < N) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
if (tid == 0)
{
atomicAdd(d_y, s_y[0]);
}
}
float reduce(float* d_x, const int method)
{
int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int ymem = sizeof(float) * grid_size;
const int smem = sizeof(float) * BLOCK_SIZE;
float* d_y;
CHECK(cudaMalloc(&d_y, ymem));
float* h_y = (float*)malloc(ymem);
switch (method)
{
case 0:
reduce_global << <grid_size, BLOCK_SIZE >> > (d_x, d_y);
break;
case 1:
reduce_shared << <grid_size, BLOCK_SIZE >> > (d_x, d_y);
break;
case 2:
reduce_dynamic << <grid_size, BLOCK_SIZE, smem >> > (d_x, d_y);
break;
case 3:
reduce_atomic << <grid_size, BLOCK_SIZE>> > (d_x, d_y);
break;
default:
printf("Error: wrong method\n");
exit(1);
break;
}
CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));
float result = 0.;
for (int n = 0; n < grid_size; ++n)
{
result += h_y[n];
if (n < 10)
printf("grid_size:%d, 当前值: %f\n", n, h_y[n]);
}
free(h_y);
CHECK(cudaFree(d_y));
return result;
}
void timing(float* h_x, float* d_x, const int method)
{
float sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x, method);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("sum = %f.\n", sum);
}
此处添加一个原子操作:
我们通过打印,发现在d_y处已经进行了求和, 在atomicAdd下,只对d_y[0]进行累加