CUDA编程练习

1. 设定GPU

#define CHECK(call) \
{                   \
    const cudaError_t error = call; \
    if(error!=cudaSuccess)          \
    {               \
    printf("error: %s: %d, ", __FILE__, __LINE__); \
    printf("code:%d, reason: %S\n", error, cudaGetErrorString(error)); \
    exit(1);\
    }\
}
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using device %d: %s\n",dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}

2. 简单的向量相加

#include <stdio.h>
#include <cuda_runtime.h>

void initdata(float *p, int size)
{
    for(int i=0; i<size; i++)
    {
        p[i] = (float)(rand()&0xFF) / 10.0f;
    }
}
__global__ void sumarray(float *a, float *b, float *c, const int n)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx<n)
        c[idx] = a[idx] + b[idx];
}
void sumOnCpu(float *a, float *b, float *c, const int n) {
    for (int i = 0; i < n; i++) {

        c[i] = a[i] + b[i];
    }
}
void sumprintf(float *a, float *b, float *c, const int n) {
    for (int i = 0; i < n; i++) {
        printf("%f + %f = %f\n",a[i], b[i], c[i]);
    }
}
void check(float *a, float *b, const int n)
{
    double e = 1.0E-8;
    for(int i = 0; i < n; i++)
    {
        if(abs(a[i] - b[i]) > e)
        {
            printf("\ndo not match");
            printf("host %5.2f gpu %5.2f at current %d\n", a[i],
                   b[i], i);
            return;
        }
    }
    printf("\nsuccess\n");
    return;
}

int main() {
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cudaSetDevice(dev);
    printf("device %d: %s",dev, deviceProp.name);

    int nElem = 1 << 4;
    int nbytes = sizeof(float) * nElem;

    float *ha, *hb, *res_g, *hc;
    ha = (float *)malloc(nbytes);
    hb = (float *)malloc(nbytes);
    res_g = (float *)malloc(nbytes);
    hc = (float *)malloc(nbytes);


    initdata(ha, nElem);
    initdata(hb, nElem);
    memset(res_g, 0, nbytes);
    memset(hc, 0, nbytes);

    float *da, *db, *dc;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&db, nbytes);
    cudaMalloc((float**)&dc, nbytes);

    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, nbytes, cudaMemcpyHostToDevice);


    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);

    sumarray<<<grid, block>>>(da, db, dc, nElem);
    cudaMemcpy(res_g, dc, nbytes, cudaMemcpyDeviceToHost);

    sumOnCpu(ha, hb, hc, nElem);
    check(res_g, hc, nElem);

//    sumprintf(ha, hb, hc, nElem);
//    sumprintf(ha, hb, res_g, nElem);

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(ha);
    free(hb);
    free(res_g);

    return EXIT_SUCCESS;
}

3. 简单的矩阵相加

#include <stdio.h>
#include <cuda_runtime.h>

void initdata(float *p, int size)
{
    for(int i=0; i<size; i++)
    {
        p[i] = (float)(rand()&0xFF) / 10.0f;
    }
}

__global__ void summetrix(float *a, float *b, float *c, int nx, int ny)
{
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int idx = ix + iy * nx;
    if(ix<nx&&iy<ny)
        c[idx] = a[idx] + b[idx];
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx,
                     const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];

        }

        ia += nx;
        ib += nx;
        ic += nx;
    }

    return;
}


void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
            break;
        }
    }

    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

int main() {
    int dev;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cudaSetDevice(dev);

    int nx = 1<<8;
    int ny = 1<<8;
    int nElem = nx * ny;
    int nbytes = sizeof(float) * nElem;

    float *ha, *hb, *hc, *res_g;
    ha = (float*)malloc(nbytes);
    hb = (float*)malloc(nbytes);
    hc = (float*)malloc(nbytes);
    res_g = (float*)malloc(nbytes);

    initdata(ha, nElem);
    initdata(hb, nElem);
    memset(res_g, 0, nbytes);
    memset(hc, 0, nbytes);

    float *da, *db, *dc;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&db, nbytes);
    cudaMalloc((float**)&dc, nbytes);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, nbytes, cudaMemcpyHostToDevice);

    dim3 block(32, 32);
    dim3 grid((nx + block.x -1)/block.x, (ny+block.y-1)/block.y);
    summetrix<<<grid, block>>>(da, db, dc, nx, ny);

    cudaMemcpy(res_g, dc, nbytes, cudaMemcpyDeviceToHost);

    sumMatrixOnHost(ha, hb, hc, nx, ny);
    checkResult(res_g, hc, nElem);


    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(ha);
    free(hb);
    free(res_g);

    cudaDeviceReset();
    return 0;
}

4. 相邻归约,向量求和

#include <stdio.h>

__global__ void sumgpu(float *a, float *b, int n)
{
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int tid = threadIdx.x;


    float *idata = a + blockIdx.x * blockDim.x;
    if(idx > n)return;
    for(int stride=1; stride<blockDim.x; stride*=2)
    {
        if((tid%(stride*2))==0)
            idata[tid] += idata[tid+stride];
        __syncthreads();
    }
    if(tid==0)
        b[blockIdx.x] = idata[0];
}


void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.0f;
    }
}
int main(){
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s\n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    float *h_a, *h_g;
    h_a = (float *)malloc(nbytes);
    h_g = (float *)malloc(sizeof(float) * grid.x);
    initdata(h_a, nElem);
    memset(h_g, 0, sizeof(float) * grid.x);

    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, sizeof(float) * grid.x);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, sizeof(float) * grid.x);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);

    sumgpu<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, sizeof(float) * grid.x, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);


    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}

5. 相邻归约,改善分化

#include <stdio.h>

__global__ void sumgpu2(float *a, float *b, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(idx > n)return;

    float *idata = a + blockIdx.x * blockDim.x;
    for(int stride=1; stride<blockDim.x; stride*=2)
    {
        unsigned int index = 2 * stride * tid;
        if(index < blockDim.x)
            idata[index] += idata[index + stride];
        __syncthreads();
    }
    if(tid == 0)b[blockIdx.x] = idata[0];
}

void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.0f;
    }
}
int main(){
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s\n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    float *h_a, *h_g;
    h_a = (float *)malloc(nbytes);
    h_g = (float *)malloc(sizeof(float) * grid.x);
    initdata(h_a, nElem);
    memset(h_g, 0, sizeof(float) * grid.x);

    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, sizeof(float) * grid.x);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, sizeof(float) * grid.x);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);

    sumgpu2<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, sizeof(float) * grid.x, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);


    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}

6. 交错归约

#include <stdio.h>

__global__ void sumgpu3(float *a, float *b, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;

    if(idx>n)return;
    float *idata = a + blockDim.x * blockIdx.x;
    for(int stride=blockDim.x/2; stride>0; stride>>=1)
    {
        if(tid<stride)
            idata[tid] += idata[tid+stride];
        __syncthreads();
    }
    if(tid==0)b[blockIdx.x] = idata[0];
}

void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.0f;
    }
}
int main(){
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s\n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<12;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1)/block.x);
    int nbytes = sizeof(float) * nElem;
    float *h_a, *h_g;
    h_a = (float *)malloc(nbytes);
    h_g = (float *)malloc(sizeof(float) * grid.x);
    initdata(h_a, nElem);
    memset(h_g, 0, sizeof(float) * grid.x);

    float *d_a, *d_b;
    cudaMalloc((float **)&d_a, nbytes);
    cudaMalloc((float **)&d_b, sizeof(float) * grid.x);
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(d_b, 0, sizeof(float) * grid.x);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += h_a[i];
    printf("\nsum on cpu:%f\n", res);

    sumgpu3<<<grid, block>>>(d_a, d_b, nElem);
    cudaMemcpy(h_g, d_b, sizeof(float) * grid.x, cudaMemcpyDeviceToHost);
    float h_res = 0.0;
    for(int i=0; i<grid.x; i++)
        h_res += h_g[i];
    printf("\nsum on gpu:%f\n", h_res);


    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_g);
    cudaDeviceReset();
}

7. 展开归约

先将块叠加,即展开归约,然后再块内交错归约。提高内存吞吐量。内存延迟可以更好地被隐藏起来。

#include <stdio.h>
void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.f;
    }
}

__global__ void sumInterleaveUnroll(float *d, float *tmp, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;


    if(idx + blockDim.x * 7 < n)
    {
        float a1 = d[idx];
        float a2 = d[idx + blockDim.x];
        float a3 = d[idx + blockDim.x * 2];
        float a4 = d[idx + blockDim.x * 3];
        float a5 = d[idx + blockDim.x * 4];
        float a6 = d[idx + blockDim.x * 5];
        float a7 = d[idx + blockDim.x * 6];
        float a8 = d[idx + blockDim.x * 7];
        d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
    }
    __syncthreads();
    float *idata = d + blockDim.x * blockIdx.x * 8;
    for(int stride=blockDim.x/2; stride>0; stride>>=1)
    {
        if(tid < stride)
            idata[tid] += idata[tid + stride];
        __syncthreads();
    }
    if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<12;
    int nbytes = sizeof(float) * nElem;
    dim3 block(32);
    dim3 grid((nElem + block.x - 1) / block.x);

    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float)*grid.x / 8);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float)*grid.x / 8);

    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float)*grid.x / 8);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float)*grid.x / 8);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);

    sumInterleaveUnroll<<<grid.x / 8, block>>>(da, dtmp, nElem);

    cudaMemcpy(htmp, dtmp, sizeof(float)*grid.x / 8, cudaMemcpyDeviceToHost);
    float res_d = 0.0;
    for(int i=0; i<grid.x / 8; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);


    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
}

8. 展开线程的归约

用线程束内隐式同步代替部分__syncthreads()

#include <stdio.h>
void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.f;
    }
}


__global__ void sumInterleaveUnrollWarp(float *d, float *tmp, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
    if(idx + 7 * blockDim.x<n)
    {
        float a1 = d[idx];
        float a2 = d[idx + blockDim.x];
        float a3 = d[idx + blockDim.x * 2];
        float a4 = d[idx + blockDim.x * 3];
        float a5 = d[idx + blockDim.x * 4];
        float a6 = d[idx + blockDim.x * 5];
        float a7 = d[idx + blockDim.x * 6];
        float a8 = d[idx + blockDim.x * 7];
        d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
    }
    __syncthreads();
    float *idata = d + 8 * blockDim.x * blockIdx.x;
    for(int stride=blockDim.x/2; stride>32; stride>>=1)
    {
        if(tid < stride)
            idata[tid] += idata[tid + stride];
        __syncthreads();
    }
    if(tid<32)
    {
        volatile float *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }
    if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);

    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float)*grid.x / 8);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float)*grid.x / 8);

    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float)*grid.x / 8);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float)*grid.x / 8);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);

    sumInterleaveUnrollWarp<<<grid.x / 8, block>>>(da, dtmp, nElem);

    cudaMemcpy(htmp, dtmp, sizeof(float)*grid.x / 8, cudaMemcpyDeviceToHost);
    float res_d = 0.0;
    for(int i=0; i<grid.x / 8; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);


    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
}

9. 完全展开的归约

#include <stdio.h>
void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.f;
    }
}


__global__ void sumCompleteUnroll(float *d, float *tmp, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
    if(idx + 7 * blockDim.x<n)
    {
        float a1 = d[idx];
        float a2 = d[idx + blockDim.x];
        float a3 = d[idx + blockDim.x * 2];
        float a4 = d[idx + blockDim.x * 3];
        float a5 = d[idx + blockDim.x * 4];
        float a6 = d[idx + blockDim.x * 5];
        float a7 = d[idx + blockDim.x * 6];
        float a8 = d[idx + blockDim.x * 7];
        d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
    }
    __syncthreads();
    float *idata = d + 8 * blockDim.x * blockIdx.x;

    if(blockDim.x>=1024 && tid <512) idata[tid] += idata[tid + 512];
    __syncthreads();
    if(blockDim.x>=512 && tid <256) idata[tid] += idata[tid + 256];
    __syncthreads();
    if(blockDim.x>=256 && tid <128) idata[tid] += idata[tid + 128];
    __syncthreads();
    if(blockDim.x>=128 && tid <64) idata[tid] += idata[tid + 64];
    __syncthreads();
    if(tid<32)
    {
        volatile float *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }
    if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);

    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float)*grid.x / 8);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float)*grid.x / 8);

    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float)*grid.x / 8);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float)*grid.x / 8);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);


    sumCompleteUnroll<<<grid.x / 8, block>>>(da, dtmp, nElem);

    cudaMemcpy(htmp, dtmp, sizeof(float)*grid.x / 8, cudaMemcpyDeviceToHost);
    float res_d = 0.0;
    for(int i=0; i<grid.x / 8; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);


    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
}

10.模板函数的归约

blockDim.x 自内核启动时一旦确定,就不能更改了,所以模板函数解决了这个问题,当编译时编译器会去检查,blockDim.x 是否固定,如果固定,直接可以删除掉内核中不可能的部分也就是上半部分,下半部分是要执行的。

#include <stdio.h>
void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.f;
    }
}


template <unsigned int iBlockSize>
__global__ void sumCompleteUnrolltemplate(float *d, float *tmp, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
    if(idx + 7 * blockDim.x<n)
    {
        float a1 = d[idx];
        float a2 = d[idx + blockDim.x];
        float a3 = d[idx + blockDim.x * 2];
        float a4 = d[idx + blockDim.x * 3];
        float a5 = d[idx + blockDim.x * 4];
        float a6 = d[idx + blockDim.x * 5];
        float a7 = d[idx + blockDim.x * 6];
        float a8 = d[idx + blockDim.x * 7];
        d[idx] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
    }
    __syncthreads();
    float *idata = d + 8 * blockDim.x * blockIdx.x;

    if(iBlockSize>=1024 && tid <512) idata[tid] += idata[tid + 512];
    __syncthreads();
    if(iBlockSize>=512 && tid <256) idata[tid] += idata[tid + 256];
    __syncthreads();
    if(iBlockSize>=256 && tid <128) idata[tid] += idata[tid + 128];
    __syncthreads();
    if(iBlockSize>=128 && tid <64) idata[tid] += idata[tid + 64];
    __syncthreads();
    if(tid<32)
    {
        volatile float *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }
    if(tid==0)tmp[blockIdx.x] = idata[0];
}
int main()
{
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);

    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float)*grid.x / 8);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float)*grid.x / 8);

    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float)*grid.x / 8);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float)*grid.x / 8);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);


    sumCompleteUnrolltemplate<64><<<grid.x / 8, block>>>(da, dtmp, nElem);

    cudaMemcpy(htmp, dtmp, sizeof(float)*grid.x / 8, cudaMemcpyDeviceToHost);
    float res_d = 0.0;
    for(int i=0; i<grid.x / 8; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);


    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
}

11. 使用共享内存

共享内存,减少全局内存访问

#include <stdio.h>
void initdata(float *a, int n)
{
    for(int i=0; i<n; i++)
    {
        a[i] = (float)(rand()&0xFF) / 100.f;
    }
}

template <unsigned int iBlockSize>
__global__ void sumCompleteUnrolltemplateSMEM(float *d, float *tmp, int n)
{
    __shared__ float smem[iBlockSize];
    unsigned int tid = threadIdx.x;
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x * 8;
    float temsum = 0.0;
    if(idx + 7 * blockDim.x<n)
    {
        float a1 = d[idx];
        float a2 = d[idx + blockDim.x];
        float a3 = d[idx + blockDim.x * 2];
        float a4 = d[idx + blockDim.x * 3];
        float a5 = d[idx + blockDim.x * 4];
        float a6 = d[idx + blockDim.x * 5];
        float a7 = d[idx + blockDim.x * 6];
        float a8 = d[idx + blockDim.x * 7];
        temsum = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
    }
    smem[tid] = temsum;
    
    __syncthreads();

    if(iBlockSize>=1024 && tid <512) smem[tid] += smem[tid + 512];
    __syncthreads();
    if(iBlockSize>=512 && tid <256) smem[tid] += smem[tid + 256];
    __syncthreads();
    if(iBlockSize>=256 && tid <128) smem[tid] += smem[tid + 128];
    __syncthreads();
    if(iBlockSize>=128 && tid <64) smem[tid] += smem[tid + 64];
    __syncthreads();
    if(tid<32)
    {
        volatile float *vmem = smem;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }
    if(tid==0)tmp[blockIdx.x] = smem[0];
}

int main()
{
    int dev=0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("gpu: %d, name: %s \n", dev, deviceProp.name);
    cudaSetDevice(dev);

    int nElem = 1<<14;
    int nbytes = sizeof(float) * nElem;
    dim3 block(64);
    dim3 grid((nElem + block.x - 1) / block.x);

    float *ha, *htmp;
    ha = (float*)malloc(nbytes);
    htmp = (float*)malloc(sizeof(float)*grid.x / 8);
    initdata(ha, nElem);
    memset(htmp, 0, sizeof(float)*grid.x / 8);

    float *da, *dtmp;
    cudaMalloc((float**)&da, nbytes);
    cudaMalloc((float**)&dtmp, sizeof(float)*grid.x / 8);
    cudaMemcpy(da, ha, nbytes, cudaMemcpyHostToDevice);
    cudaMemset(dtmp, 0, sizeof(float)*grid.x / 8);

    float res = 0.0;
    for(int i=0; i<nElem; i++)
        res += ha[i];
    printf("correct sum: %f \n", res);

    sumCompleteUnrolltemplateSMEM<64><<<grid.x / 8, block>>>(da, dtmp, nElem);

    cudaMemcpy(htmp, dtmp, sizeof(float)*grid.x / 8, cudaMemcpyDeviceToHost);
    float res_d = 0.0;
    for(int i=0; i<grid.x / 8; i++)
        res_d += htmp[i];
    printf("gpu sum: %f \n", res_d);


    cudaFree(da);
    cudaFree(dtmp);
    free(ha);
    free(htmp);
}
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值