文章目录
1. 设定GPU
// Error-checking wrapper for CUDA runtime calls: on failure, report the
// source location and the decoded error string, then terminate.
// Fix: the reason string used "%S" (not a valid printf conversion for
// char*); it must be "%s" to print cudaGetErrorString's result.
#define CHECK(call) \
{ \
    const cudaError_t error = call; \
    if(error!=cudaSuccess) \
    { \
        printf("error: %s: %d, ", __FILE__, __LINE__); \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(1);\
    }\
}
#include <stdio.h>
#include <cuda_runtime.h>
int main() {
    // Query device 0, report its name, select it, then tear down cleanly.
    int dev = 0;
    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop, dev));
    printf("Using device %d: %s\n",dev, prop.name);
    CHECK(cudaSetDevice(dev));
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
2. 简单的向量相加
#include <stdio.h>
#include <cuda_runtime.h>
// Fill p[0..size-1] with pseudo-random floats in [0, 25.5]
// (low byte of rand(), scaled by 1/10).
void initdata(float *p, int size)
{
    int i = 0;
    while (i < size)
    {
        p[i] = (float)(rand() & 0xFF) / 10.0f;
        i++;
    }
}
// One thread per element: c[i] = a[i] + b[i]; threads past n do nothing.
__global__ void sumarray(float *a, float *b, float *c, const int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= n) return;
    c[i] = a[i] + b[i];
}
// Host reference: element-wise c = a + b over n elements.
void sumOnCpu(float *a, float *b, float *c, const int n) {
    int i = 0;
    while (i < n) {
        c[i] = a[i] + b[i];
        ++i;
    }
}
// Debug helper: print each triple as "a + b = c", one line per element.
void sumprintf(float *a, float *b, float *c, const int n) {
    int i = 0;
    for (; i < n; ++i) {
        printf("%f + %f = %f\n",a[i], b[i], c[i]);
    }
}
// Compare the GPU result (a) against the host reference (b); report the
// first element whose absolute difference exceeds the tolerance, or
// print success if all n elements agree.
void check(float *a, float *b, const int n)
{
    double e = 1.0E-8;
    for(int i = 0; i < n; i++)
    {
        // Use fabs, not abs: depending on which headers are in scope,
        // abs() can resolve to the integer overload and truncate the
        // float difference to 0, masking real mismatches.
        if(fabs(a[i] - b[i]) > e)
        {
            printf("\ndo not match");
            printf("host %5.2f gpu %5.2f at current %d\n", a[i],
            b[i], i);
            return;
        }
    }
    printf("\nsuccess\n");
    return;
}
int main() {
    // Vector-add demo: run a+b on the GPU and on the CPU, then compare.
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cudaSetDevice(dev);
    printf("device %d: %s",dev, deviceProp.name);
    int nElem = 1 << 4;
    int nbytes = sizeof(float) * nElem;
    // Host buffers: two inputs, GPU result copy, CPU reference.
    float *ha, *hb, *res_g, *hc;
    ha = (float *)malloc(nbytes);
    hb = (float *)malloc(nbytes);
    res_g = (float *)malloc(nbytes);
    hc = (float *)malloc(nbytes);
    initdata(ha, nElem);
    initdata(hb, nElem);
    memset(res_g, 0, nbytes);
    memset(hc, 0, nbytes);
    // Device buffers mirroring the host inputs/output.
    float *da, *db, *dc;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&db, nbytes);
    cudaMalloc((float**)&dc, nbytes);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, nbytes, cudaMemcpyHostToDevice);
    // One thread per element, grid rounded up to whole blocks.
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    sumarray<<<grid, block>>>(da, db, dc, nElem);
    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(res_g, dc, nbytes, cudaMemcpyDeviceToHost);
    sumOnCpu(ha, hb, hc, nElem);
    check(res_g, hc, nElem);
    // sumprintf(ha, hb, hc, nElem);
    // sumprintf(ha, hb, res_g, nElem);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(ha);
    free(hb);
    free(res_g);
    free(hc);           // fix: hc was allocated but never freed
    cudaDeviceReset();  // release the device context before exit
    return EXIT_SUCCESS;
}
3. 简单的矩阵相加
#include <stdio.h>
#include <cuda_runtime.h>
// Populate size floats with pseudo-random values in [0, 25.5].
void initdata(float *p, int size)
{
    for (float *q = p; q != p + size; ++q)
    {
        *q = (float)(rand() & 0xFF) / 10.0f;
    }
}
// 2D launch, one thread per matrix element (row-major, nx columns):
// c = a + b for in-range (column, row) pairs.
__global__ void summetrix(float *a, float *b, float *c, int nx, int ny)
{
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= nx || row >= ny) return;
    unsigned int k = row * nx + col;
    c[k] = a[k] + b[k];
}
// Host reference for matrix addition: C = A + B over an ny-by-nx
// row-major grid (nx columns per row).
void sumMatrixOnHost(float *A, float *B, float *C, const int nx,
                     const int ny)
{
    for (int iy = 0; iy < ny; iy++)
    {
        int base = iy * nx;
        for (int ix = 0; ix < nx; ix++)
        {
            C[base + ix] = A[base + ix] + B[base + ix];
        }
    }
}
// Compare host and GPU result arrays; print the first mismatching pair
// and a match/no-match verdict.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: the integer abs() overload would truncate the
        // float difference and could hide real mismatches.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}
int main() {
    // Matrix-add demo: 2D grid of 2D blocks vs. a CPU reference.
    int dev = 0;              // fix: dev was read uninitialized below
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cudaSetDevice(dev);
    int nx = 1<<8;
    int ny = 1<<8;
    int nElem = nx * ny;
    int nbytes = sizeof(float) * nElem;
    // Host buffers: inputs, CPU reference, GPU result copy.
    float *ha, *hb, *hc, *res_g;
    ha = (float*)malloc(nbytes);
    hb = (float*)malloc(nbytes);
    hc = (float*)malloc(nbytes);
    res_g = (float*)malloc(nbytes);
    initdata(ha, nElem);
    initdata(hb, nElem);
    memset(res_g, 0, nbytes);
    memset(hc, 0, nbytes);
    // Device buffers.
    float *da, *db, *dc;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&db, nbytes);
    cudaMalloc((float**)&dc, nbytes);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, nbytes, cudaMemcpyHostToDevice);
    // 32x32 threads per block; grid rounded up in both dimensions.
    dim3 block(32, 32);
    dim3 grid((nx + block.x -1)/block.x, (ny+block.y-1)/block.y);
    summetrix<<<grid, block>>>(da, db, dc, nx, ny);
    cudaMemcpy(res_g, dc, nbytes, cudaMemcpyDeviceToHost);
    sumMatrixOnHost(ha, hb, hc, nx, ny);
    checkResult(res_g, hc, nElem);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(ha);
    free(hb);
    free(res_g);
    free(hc);   // fix: hc was leaked in the original
    cudaDeviceReset();
    return 0;
}
4. 相邻归约,向量求和
#include <stdio.h>
// Neighbored-pair in-place reduction: each block sums its own
// blockDim.x-element slice of a (destroying it) and writes the partial
// sum to b[blockIdx.x]. The host finishes by summing b.
// Assumes n is a multiple of blockDim.x: the early return skips
// __syncthreads(), which is only safe when a whole block exits together.
__global__ void sumgpu(float *a, float *b, int n)
{
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int tid = threadIdx.x;
    float *idata = a + blockIdx.x * blockDim.x;
    // fix: >= (was >) — the original let thread idx == n proceed and
    // read one element past the end of a.
    if(idx >= n)return;
    for(int stride=1; stride<blockDim.x; stride*=2)
    {
        // Only threads whose id divides 2*stride work each round —
        // heavily divergent; later versions improve on this.
        if((tid%(stride*2))==0)
            idata[tid] += idata[tid+stride];
        __syncthreads();
    }
    if(tid==0)
        b[blockIdx.x] = idata[0];
}
// Fill a[0..n-1] with pseudo-random floats in [0, 2.55].
void initdata(float *a, int n)
{
    int i = 0;
    while (i < n)
    {
        a[i++] = (float)(rand() & 0xFF) / 100.0f;
    }
}
int main(){
    // Select device 0 and report it.
    int dev = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("gpu: %d, name: %s\n", dev, prop.name);
    cudaSetDevice(dev);
    // Problem size and launch shape: one block per 32-element slice.
    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    size_t partialBytes = sizeof(float) * grid.x;
    // Host input and per-block partial sums.
    float *h_a = (float *)malloc(nbytes);
    float *h_g = (float *)malloc(partialBytes);
    initdata(h_a, nElem);
    memset(h_g, 0, partialBytes);
    // Device copies.
    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, partialBytes);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, partialBytes);
    // Sequential reference sum on the host.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);
    // Per-block reduction on the GPU, finished on the host.
    sumgpu<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, partialBytes, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}
5. 相邻归约,改善分化
#include <stdio.h>
// Neighbored-pair reduction with packed workers: thread t operates on
// index 2*stride*t, so active threads stay contiguous in low warps and
// divergence is reduced versus sumgpu. Reduces a in place per block;
// partial sums go to b[blockIdx.x].
// Assumes n is a multiple of blockDim.x (whole-block early return).
__global__ void sumgpu2(float *a, float *b, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // fix: >= (was >) — thread idx == n must not run.
    if(idx >= n)return;
    float *idata = a + blockIdx.x * blockDim.x;
    for(int stride=1; stride<blockDim.x; stride*=2)
    {
        // Remap tid so the first blockDim.x/(2*stride) threads do the work.
        unsigned int index = 2 * stride * tid;
        if(index < blockDim.x)
            idata[index] += idata[index + stride];
        __syncthreads();
    }
    if(tid == 0)b[blockIdx.x] = idata[0];
}
// Populate n floats with pseudo-random values in [0, 2.55].
void initdata(float *a, int n)
{
    for (float *p = a; p != a + n; ++p)
    {
        *p = (float)(rand() & 0xFF) / 100.0f;
    }
}
int main(){
    // Device setup and banner.
    int dev = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("gpu: %d, name: %s\n", dev, prop.name);
    cudaSetDevice(dev);
    // Launch configuration: 32 threads per block.
    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    size_t partialBytes = sizeof(float) * grid.x;
    // Host input vector and space for per-block partials.
    float *h_a = (float *)malloc(nbytes);
    float *h_g = (float *)malloc(partialBytes);
    initdata(h_a, nElem);
    memset(h_g, 0, partialBytes);
    // Device-side input and partial-sum buffers.
    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, partialBytes);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, partialBytes);
    // Reference result on the CPU.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);
    // Reduced-divergence kernel, then host-side finish.
    sumgpu2<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, partialBytes, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}
6. 交错归约
#include <stdio.h>
// Interleaved-pair reduction: stride starts at blockDim.x/2 and halves,
// so each active thread adds an element half a block away. Active threads
// stay packed and global accesses stay contiguous. Reduces a in place per
// block; partial sums go to b[blockIdx.x].
// Assumes n is a multiple of blockDim.x (whole-block early return).
__global__ void sumgpu3(float *a, float *b, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // fix: >= (was >) — thread idx == n must not run.
    if(idx>=n)return;
    float *idata = a + blockDim.x * blockIdx.x;
    for(int stride=blockDim.x/2; stride>0; stride>>=1)
    {
        if(tid<stride)
            idata[tid] += idata[tid+stride];
        __syncthreads();
    }
    if(tid==0)b[blockIdx.x] = idata[0];
}
// Fill a with n pseudo-random floats in [0, 2.55].
void initdata(float *a, int n)
{
    int i = 0;
    while (i < n)
    {
        a[i] = (float)(rand() & 0xFF) / 100.0f;
        i++;
    }
}
int main(){
    // Pick device 0 and announce it.
    int dev = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("gpu: %d, name: %s\n", dev, prop.name);
    cudaSetDevice(dev);
    // 4096 elements, 32-thread blocks, one block per slice.
    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    size_t partialBytes = sizeof(float) * grid.x;
    // Host allocations: input and per-block partial sums.
    float *h_a = (float *)malloc(nbytes);
    float *h_g = (float *)malloc(partialBytes);
    initdata(h_a, nElem);
    memset(h_g, 0, partialBytes);
    // Device allocations and upload.
    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, partialBytes);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, partialBytes);
    // CPU reference.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);
    // Interleaved reduction on the device, final sum on the host.
    sumgpu3<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, partialBytes, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}
7. 展开归约
先将块叠加,即展开归约,然后再块内交错归约。提高内存吞吐量。内存延迟可以更好地被隐藏起来。
#include <stdio.h>
// Fill a[0..n-1] with pseudo-random floats in [0, 2.55].
void initdata(float *a, int n)
{
    for (float *p = a; p < a + n; ++p)
    {
        *p = (float)(rand() & 0xFF) / 100.f;
    }
}
// 8x-unrolled reduction: each block owns 8 consecutive blockDim.x-wide
// chunks of d. Phase 1 folds all 8 chunks into the first one in place;
// phase 2 runs an interleaved tree reduction on that chunk; thread 0
// publishes the block's partial sum to tmp[blockIdx.x].
// NOTE(review): a partial tail (n not a multiple of 8*blockDim.x) is
// never folded, so correctness assumes an exact multiple — confirm at
// the call site.
__global__ void sumInterleaveUnroll(float *d, float *tmp, int n)
{
unsigned int tid = threadIdx.x;
// First element owned by this thread; the block's window spans 8*blockDim.x.
unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
// Phase 1: sum 8 strided loads and store back into the first chunk.
if(idx + blockDim.x * 7 < n)
{
float a1 = d[idx];
float a2 = d[idx + blockDim.x];
float a3 = d[idx + blockDim.x * 2];
float a4 = d[idx + blockDim.x * 3];
float a5 = d[idx + blockDim.x * 4];
float a6 = d[idx + blockDim.x * 5];
float a7 = d[idx + blockDim.x * 6];
float a8 = d[idx + blockDim.x * 7];
d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
__syncthreads();
// Base of this block's window; only its first blockDim.x entries matter now.
float *idata = d + blockDim.x * blockIdx.x * 8;
// Phase 2: interleaved (stride-halving) tree reduction.
for(int stride=blockDim.x/2; stride>0; stride>>=1)
{
if(tid < stride)
idata[tid] += idata[tid + stride];
__syncthreads();
}
// One partial sum per block.
if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    // 8x-unrolled reduction driver: each block covers 8*block.x elements,
    // so only ceil(grid.x / 8) blocks and partial sums are needed.
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    int nElem = 1<<12;
    int nbytes = sizeof(float) * nElem;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1) / block.x);
    // fix: ceil-division — the original grid.x / 8 truncates and would
    // drop the tail whenever grid.x is not a multiple of 8 (here
    // 128 / 8 == 16, so the value is unchanged).
    unsigned int ugrid = (grid.x + 7) / 8;
    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float) * ugrid);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float) * ugrid);
    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float) * ugrid);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float) * ugrid);
    // CPU reference sum.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);
    sumInterleaveUnroll<<<ugrid, block>>>(da, dtmp, nElem);
    cudaMemcpy(htmp, dtmp, sizeof(float) * ugrid, cudaMemcpyDeviceToHost);
    // Finish the reduction over the per-block partials on the host.
    float res_d = 0.0;
    for(unsigned int i=0; i<ugrid; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);
    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
    cudaDeviceReset();  // added: release the device before exit
    return 0;
}
8. 展开线程的归约
用线程束内隐式同步代替部分__syncthreads()
#include <stdio.h>
// Populate n floats with pseudo-random values in [0, 2.55].
void initdata(float *a, int n)
{
    int i = 0;
    while (i < n)
    {
        a[i++] = (float)(rand() & 0xFF) / 100.f;
    }
}
// 8x-unrolled reduction that finishes the last 64->1 step with a
// barrier-free volatile loop over the final warp (tid < 32).
// NOTE(review): the volatile warp-synchronous idiom relies on implicit
// lockstep execution within a warp; on Volta+ (independent thread
// scheduling) it should be replaced with __syncwarp()/__shfl_down_sync —
// confirm the target architecture.
// Assumes n is a multiple of 8*blockDim.x and blockDim.x >= 64.
__global__ void sumInterleaveUnrollWarp(float *d, float *tmp, int n)
{
unsigned int tid = threadIdx.x;
unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
// Phase 1: fold the block's 8 chunks into its first chunk in place.
if(idx + 7 * blockDim.x<n)
{
float a1 = d[idx];
float a2 = d[idx + blockDim.x];
float a3 = d[idx + blockDim.x * 2];
float a4 = d[idx + blockDim.x * 3];
float a5 = d[idx + blockDim.x * 4];
float a6 = d[idx + blockDim.x * 5];
float a7 = d[idx + blockDim.x * 6];
float a8 = d[idx + blockDim.x * 7];
d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
__syncthreads();
// Base of this block's 8*blockDim.x window.
float *idata = d + 8 * blockDim.x * blockIdx.x;
// Tree reduction down to 64 surviving elements. With blockDim.x == 64
// the initial stride is already 32, so this loop body never executes.
for(int stride=blockDim.x/2; stride>32; stride>>=1)
{
if(tid < stride)
idata[tid] += idata[tid + stride];
__syncthreads();
}
// Final warp: fully unrolled, no barriers; volatile forces each store
// to memory so later reads in other lanes observe it.
if(tid<32)
{
volatile float *vmem = idata;
vmem[tid] += vmem[tid + 32];
vmem[tid] += vmem[tid + 16];
vmem[tid] += vmem[tid + 8];
vmem[tid] += vmem[tid + 4];
vmem[tid] += vmem[tid + 2];
vmem[tid] += vmem[tid + 1];
}
// One partial sum per block.
if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    // Driver for the warp-unrolled reduction: blocks cover 8*block.x
    // elements each, so ceil(grid.x / 8) blocks are launched.
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);
    // fix: ceil-division — grid.x / 8 truncates and would drop the tail
    // when grid.x is not a multiple of 8 (here 256 / 8 == 32, unchanged).
    unsigned int ugrid = (grid.x + 7) / 8;
    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float) * ugrid);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float) * ugrid);
    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float) * ugrid);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float) * ugrid);
    // CPU reference sum.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);
    sumInterleaveUnrollWarp<<<ugrid, block>>>(da, dtmp, nElem);
    cudaMemcpy(htmp, dtmp, sizeof(float) * ugrid, cudaMemcpyDeviceToHost);
    // Host-side finish over the per-block partials.
    float res_d = 0.0;
    for(unsigned int i=0; i<ugrid; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);
    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
    cudaDeviceReset();  // added: release the device before exit
    return 0;
}
9. 完全展开的归约
#include <stdio.h>
// Fill a with n pseudo-random floats in [0, 2.55].
void initdata(float *a, int n)
{
    for (float *p = a; p != a + n; ++p)
    {
        *p = (float)(rand() & 0xFF) / 100.f;
    }
}
// Completely unrolled reduction: the tree phase is written out as a
// fixed cascade of guarded steps (512/256/128/64) instead of a loop,
// then the final warp finishes barrier-free with volatile accesses.
// NOTE(review): the volatile warp-sync tail assumes implicit warp
// lockstep; on Volta+ use __syncwarp()/__shfl_down_sync — confirm
// target architecture. Assumes n is a multiple of 8*blockDim.x,
// blockDim.x is a power of two >= 64, and blockDim.x <= 1024.
__global__ void sumCompleteUnroll(float *d, float *tmp, int n)
{
unsigned int tid = threadIdx.x;
unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
// Phase 1: fold the block's 8 chunks into the first chunk in place.
if(idx + 7 * blockDim.x<n)
{
float a1 = d[idx];
float a2 = d[idx + blockDim.x];
float a3 = d[idx + blockDim.x * 2];
float a4 = d[idx + blockDim.x * 3];
float a5 = d[idx + blockDim.x * 4];
float a6 = d[idx + blockDim.x * 5];
float a7 = d[idx + blockDim.x * 6];
float a8 = d[idx + blockDim.x * 7];
d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
__syncthreads();
// Base of this block's 8*blockDim.x window.
float *idata = d + 8 * blockDim.x * blockIdx.x;
// Phase 2: unrolled cascade; each step only fires for block sizes
// large enough to need it. Barriers sit outside the conditionals so
// every thread reaches them.
if(blockDim.x>=1024 && tid <512) idata[tid] += idata[tid + 512];
__syncthreads();
if(blockDim.x>=512 && tid <256) idata[tid] += idata[tid + 256];
__syncthreads();
if(blockDim.x>=256 && tid <128) idata[tid] += idata[tid + 128];
__syncthreads();
if(blockDim.x>=128 && tid <64) idata[tid] += idata[tid + 64];
__syncthreads();
// Final warp: unrolled, no barriers.
if(tid<32)
{
volatile float *vmem = idata;
vmem[tid] += vmem[tid + 32];
vmem[tid] += vmem[tid + 16];
vmem[tid] += vmem[tid + 8];
vmem[tid] += vmem[tid + 4];
vmem[tid] += vmem[tid + 2];
vmem[tid] += vmem[tid + 1];
}
// One partial sum per block.
if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    // Driver for the completely unrolled reduction; blocks cover
    // 8*block.x elements each.
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);
    // fix: ceil-division — grid.x / 8 truncates and would drop the tail
    // when grid.x is not a multiple of 8 (here 256 / 8 == 32, unchanged).
    unsigned int ugrid = (grid.x + 7) / 8;
    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float) * ugrid);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float) * ugrid);
    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float) * ugrid);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float) * ugrid);
    // CPU reference sum.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);
    sumCompleteUnroll<<<ugrid, block>>>(da, dtmp, nElem);
    cudaMemcpy(htmp, dtmp, sizeof(float) * ugrid, cudaMemcpyDeviceToHost);
    // Host-side finish over the per-block partials.
    float res_d = 0.0;
    for(unsigned int i=0; i<ugrid; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);
    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
    cudaDeviceReset();  // added: release the device before exit
    return 0;
}
10.模板函数的归约
blockDim.x 自内核启动时一旦确定,就不能更改了,所以模板函数解决了这个问题,当编译时编译器会去检查,blockDim.x 是否固定,如果固定,直接可以删除掉内核中不可能的部分也就是上半部分,下半部分是要执行的。
#include <stdio.h>
// Populate n floats with pseudo-random values in [0, 2.55].
void initdata(float *a, int n)
{
    int i = 0;
    while (i < n)
    {
        a[i] = (float)(rand() & 0xFF) / 100.f;
        ++i;
    }
}
// Templated, completely unrolled reduction. iBlockSize is the block size
// as a compile-time constant, so the guards below (iBlockSize >= 1024,
// ...) are evaluated at compile time and dead branches are removed.
// The caller must instantiate with iBlockSize equal to the actual
// launch-time blockDim.x.
// NOTE(review): the volatile warp-sync tail assumes implicit warp
// lockstep; on Volta+ use __syncwarp()/__shfl_down_sync — confirm
// target architecture. Assumes n is a multiple of 8*blockDim.x.
template <unsigned int iBlockSize>
__global__ void sumCompleteUnrolltemplate(float *d, float *tmp, int n)
{
unsigned int tid = threadIdx.x;
unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
// Phase 1: fold the block's 8 chunks into the first chunk in place.
if(idx + 7 * blockDim.x<n)
{
float a1 = d[idx];
float a2 = d[idx + blockDim.x];
float a3 = d[idx + blockDim.x * 2];
float a4 = d[idx + blockDim.x * 3];
float a5 = d[idx + blockDim.x * 4];
float a6 = d[idx + blockDim.x * 5];
float a7 = d[idx + blockDim.x * 6];
float a8 = d[idx + blockDim.x * 7];
d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
__syncthreads();
// Base of this block's 8*blockDim.x window.
float *idata = d + 8 * blockDim.x * blockIdx.x;
// Phase 2: compile-time-pruned cascade of reduction steps.
if(iBlockSize>=1024 && tid <512) idata[tid] += idata[tid + 512];
__syncthreads();
if(iBlockSize>=512 && tid <256) idata[tid] += idata[tid + 256];
__syncthreads();
if(iBlockSize>=256 && tid <128) idata[tid] += idata[tid + 128];
__syncthreads();
if(iBlockSize>=128 && tid <64) idata[tid] += idata[tid + 64];
__syncthreads();
// Final warp: unrolled, barrier-free, volatile accesses.
if(tid<32)
{
volatile float *vmem = idata;
vmem[tid] += vmem[tid + 32];
vmem[tid] += vmem[tid + 16];
vmem[tid] += vmem[tid + 8];
vmem[tid] += vmem[tid + 4];
vmem[tid] += vmem[tid + 2];
vmem[tid] += vmem[tid + 1];
}
// One partial sum per block.
if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    // Driver for the templated unrolled reduction. The template argument
    // <64> must match block.x below.
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);
    // fix: ceil-division — grid.x / 8 truncates and would drop the tail
    // when grid.x is not a multiple of 8 (here 256 / 8 == 32, unchanged).
    unsigned int ugrid = (grid.x + 7) / 8;
    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float) * ugrid);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float) * ugrid);
    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float) * ugrid);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float) * ugrid);
    // CPU reference sum.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);
    sumCompleteUnrolltemplate<64><<<ugrid, block>>>(da, dtmp, nElem);
    cudaMemcpy(htmp, dtmp, sizeof(float) * ugrid, cudaMemcpyDeviceToHost);
    // Host-side finish over the per-block partials.
    float res_d = 0.0;
    for(unsigned int i=0; i<ugrid; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);
    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
    cudaDeviceReset();  // added: release the device before exit
    return 0;
}
11. 使用共享内存
共享内存,减少全局内存访问
#include <stdio.h>
// Fill a[0..n-1] with pseudo-random floats in [0, 2.55].
void initdata(float *a, int n)
{
    for (float *p = a; p < a + n; p++)
    {
        *p = (float)(rand() & 0xFF) / 100.f;
    }
}
// Shared-memory variant of the templated unrolled reduction: the 8-way
// fold accumulates into a register, the per-thread sums are staged in a
// statically sized __shared__ array (iBlockSize floats), and the tree
// reduction runs entirely in shared memory — the input d is no longer
// modified. iBlockSize must equal the launch-time blockDim.x.
// NOTE(review): the volatile warp-sync tail assumes implicit warp
// lockstep; on Volta+ use __syncwarp()/__shfl_down_sync — confirm
// target architecture. Tail elements (n not a multiple of 8*blockDim.x)
// contribute 0 via temsum, so a partial tail is dropped, not summed.
template <unsigned int iBlockSize>
__global__ void sumCompleteUnrolltemplateSMEM(float *d, float *tmp, int n)
{
__shared__ float smem[iBlockSize];
unsigned int tid = threadIdx.x;
unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
// Per-thread partial: 0 when the thread's 8-element window runs past n.
float temsum = 0.0;
if(idx + 7 * blockDim.x<n)
{
float a1 = d[idx];
float a2 = d[idx + blockDim.x];
float a3 = d[idx + blockDim.x * 2];
float a4 = d[idx + blockDim.x * 3];
float a5 = d[idx + blockDim.x * 4];
float a6 = d[idx + blockDim.x * 5];
float a7 = d[idx + blockDim.x * 6];
float a8 = d[idx + blockDim.x * 7];
temsum = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
// Stage partials in shared memory; barrier before any cross-thread read.
smem[tid] = temsum;
__syncthreads();
// Compile-time-pruned cascade of reduction steps, all in shared memory.
if(iBlockSize>=1024 && tid <512) smem[tid] += smem[tid + 512];
__syncthreads();
if(iBlockSize>=512 && tid <256) smem[tid] += smem[tid + 256];
__syncthreads();
if(iBlockSize>=256 && tid <128) smem[tid] += smem[tid + 128];
__syncthreads();
if(iBlockSize>=128 && tid <64) smem[tid] += smem[tid + 64];
__syncthreads();
// Final warp: unrolled, barrier-free, volatile accesses.
if(tid<32)
{
volatile float *vmem = smem;
vmem[tid] += vmem[tid + 32];
vmem[tid] += vmem[tid + 16];
vmem[tid] += vmem[tid + 8];
vmem[tid] += vmem[tid + 4];
vmem[tid] += vmem[tid + 2];
vmem[tid] += vmem[tid + 1];
}
// One partial sum per block.
if(tid==0)tmp[blockIdx.x] = smem[0];
}
int main()
{
    // Driver for the shared-memory templated reduction. The template
    // argument <64> must match block.x below.
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);
    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);
    // fix: ceil-division — grid.x / 8 truncates and would drop the tail
    // when grid.x is not a multiple of 8 (here 256 / 8 == 32, unchanged).
    unsigned int ugrid = (grid.x + 7) / 8;
    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float) * ugrid);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float) * ugrid);
    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float) * ugrid);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float) * ugrid);
    // CPU reference sum.
    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);
    sumCompleteUnrolltemplateSMEM<64><<<ugrid, block>>>(da, dtmp, nElem);
    cudaMemcpy(htmp, dtmp, sizeof(float) * ugrid, cudaMemcpyDeviceToHost);
    // Host-side finish over the per-block partials.
    float res_d = 0.0;
    for(unsigned int i=0; i<ugrid; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);
    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
    cudaDeviceReset();  // added: release the device before exit
    return 0;
}