Preface
Understanding GPU memory usage through array reduction, i.e., summing the N elements of an array.
1. CPU computation
// Serial reduction on the CPU: accumulate all N elements into one sum.
real reduce(const real *x, const int N)
{
    real sum = 0.0;
    for (int n = 0; n < N; ++n)
    {
        sum += x[n];
    }
    return sum;
}
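A caveat before moving to the GPU: with real defined as float and N on the order of 10^8 (as in the complete code below), this naive accumulation loses accuracy badly, because once sum grows large each addend of 1.23 is mostly or entirely rounded away. A common CPU-side remedy is compensated (Kahan) summation; the following is a minimal illustrative sketch, not part of the original program:

// Kahan (compensated) summation: track the rounding error of each
// addition in c and feed it back into the next addend.
real reduce_kahan(const real *x, const int N)
{
    real sum = 0.0;
    real c = 0.0; // running compensation for lost low-order bits
    for (int n = 0; n < N; ++n)
    {
        const real y = x[n] - c; // corrected addend
        const real t = sum + y;  // low-order bits of y may be lost here
        c = (t - sum) - y;       // recover what was lost
        sum = t;
    }
    return sum;
}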
2. Global memory
void __global__ reduce_global(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    // Each block reduces its own contiguous slice of d_x, in place.
    // Note: the input array is overwritten, so it must be restored
    // before the kernel can be run again on the same data.
    real *x = d_x + blockDim.x * blockIdx.x;

    // Tree reduction: the number of active threads halves every round.
    // This assumes blockDim.x is a power of two and N is a multiple of it.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            x[tid] += x[tid + offset];
        }
        __syncthreads(); // every round must finish before the next starts
    }

    if (tid == 0)
    {
        d_y[blockIdx.x] = x[0]; // one partial sum per block
    }
}
3. Shared memory
Shared memory is on-chip memory that the programmer controls directly. Using it here reduces the number of global-memory accesses and enables efficient communication among the threads of a block; staging data in shared memory can also improve the coalescing of the remaining global-memory accesses.
void __global__ reduce_shared(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;

    // Static shared-memory array; its size (128) must match the block size.
    __shared__ real s_y[128];
    s_y[tid] = (n < N) ? d_x[n] : 0.0; // pad the last block with zeros
    __syncthreads();

    // Same tree reduction as before, but entirely in shared memory,
    // so d_x is no longer modified.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}
4. Dynamic shared memory
void __global__ reduce_dynamic(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;

    // Dynamic shared memory: the array length is not fixed at compile
    // time but set by the third parameter of the execution configuration.
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}
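Dynamic shared memory removes the hard-coded array size of the previous kernel: the byte count is supplied as the third parameter of the <<<...>>> execution configuration at launch time, exactly as the complete code below does:

reduce_dynamic<<<grid_size, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE>>>(d_x, d_y);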
5. Complete code
#include "error.cuh"
#include <stdio.h>
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
void timing(real *h_x, real *d_x, const int method);
int main(void)
{
real *h_x = (real *) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = 1.23;
}
real *d_x;
CHECK(cudaMalloc(&d_x, M));
printf("\nUsing global memory only:\n");
timing(h_x, d_x, 0);
printf("\nUsing static shared memory:\n");
timing(h_x, d_x, 1);
printf("\nUsing dynamic shared memory:\n");
timing(h_x, d_x, 2);
free(h_x);
CHECK(cudaFree(d_x));
return 0;
}
void __global__ reduce_global(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    real *x = d_x + blockDim.x * blockIdx.x;

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            x[tid] += x[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[blockIdx.x] = x[0];
    }
}

void __global__ reduce_shared(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[128];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}

void __global__ reduce_dynamic(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}
real reduce(real *d_x, const int method)
{
    int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int ymem = sizeof(real) * grid_size;
    const int smem = sizeof(real) * BLOCK_SIZE;
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));
    real *h_y = (real *) malloc(ymem);

    switch (method)
    {
        case 0:
            reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
            break;
        case 1:
            reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
            break;
        case 2:
            reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
            break;
        default:
            printf("Error: wrong method\n");
            exit(1);
            break;
    }

    CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

    real result = 0.0;
    for (int n = 0; n < grid_size; ++n)
    {
        result += h_y[n]; // final pass over the per-block partial sums on the host
    }

    free(h_y);
    CHECK(cudaFree(d_y));
    return result;
}
void timing(real *h_x, real *d_x, const int method)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        // Restore d_x before every repeat: reduce_global modifies it in place.
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start); // not wrapped in CHECK: may legitimately return cudaErrorNotReady

        sum = reduce(d_x, method);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("sum = %f.\n", sum);
}
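For reference, a typical way to build and run this program (the file name reduce.cu and the -arch value are assumptions; pick the architecture of your own GPU, and define USE_DP to switch real to double):

nvcc -O3 -arch=sm_75 reduce.cu -o reduce
nvcc -O3 -arch=sm_75 -DUSE_DP reduce.cu -o reduce_dp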
6. Atomic functions
void __global__ reduce(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        // Accumulate this block's partial sum directly into *d_y.
        // atomicAdd makes the read-modify-write safe across blocks,
        // so no host-side pass over partial sums is needed.
        atomicAdd(d_y, s_y[0]);
    }
}
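Two caveats about atomicAdd. First, the order in which blocks accumulate into *d_y is nondeterministic, so floating-point results may differ slightly from run to run. Second, atomicAdd on double is only provided in hardware for compute capability 6.0 and above; on older GPUs it can be emulated with atomicCAS. The sketch below is the standard emulation from the CUDA C Programming Guide, not part of the original program:

#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double *address, double val)
{
    unsigned long long *address_as_ull = (unsigned long long *) address;
    unsigned long long old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        // Swap in the new sum only if *address has not changed meanwhile.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif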
Complete code
#include "error.cuh"
#include<stdio.h>
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEAT = 100;
const int N = 10000000;
const int M = sizeof(real) * N ;
const int BLOCK_SIZE = 128;
void timing(const real *d_x);
int main(void){
real *h_x = (real *) malloc(M);
for(int n=0;n<N; ++n){
h_x[n] = 1.23;
}
real *d_x;
CHECK(cudaMalloc(&d_x,M));
CHECK(cudaMemcpy(d_x,h_x,M,cudaMemcpyHostToDevice));
printf("\nusing atomicAdd:\n");
timing(d_x);
free(h_x);
CHECK(cudaFree(d_x));
return 0;
}
void __global__ reduce(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}
real reduce(const real *d_x)
{
    const int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, sizeof(real)));
    // Zero the accumulator on the device before the launch.
    CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));

    reduce<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, N);

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));
    return h_y[0];
}
void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEAT; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("sum = %f.\n", sum);
}
7. Summary
1. Moving the computation from the CPU to the GPU speeds up the reduction.
2. Progressing from global memory to static shared memory, dynamic shared memory, and finally atomic functions keeps accelerating the computation while reducing memory traffic.
3. Array reduction thus serves as a compact worked example of these acceleration techniques.
4. As a further step, the reduction can also be carried out at the warp level; see the sketch below.
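As a taste of that warp-level direction, here is a minimal sketch (the helper name warp_reduce is ours, not from the text above). Threads within a warp can exchange register values directly via the shuffle intrinsic __shfl_down_sync (available since CUDA 9; 0xffffffff is the full-warp mask), so the last five rounds of the shared-memory loop need neither shared memory nor __syncthreads():

__device__ real warp_reduce(real val)
{
    // Fold the upper half of the warp onto the lower half, five times.
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val; // lane 0 ends up holding the warp's total
}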