first.cu 关于CUDA的第一个程序，备注理解架构_could not allocate memory cudamalloc-CSDN博客

本文链接：https://blog.csdn.net/jaminwm/article/details/7566767

// includes, system
#include <stdio.h>
#include <stdlib.h>

#define RADIUS 1
#define BLOCKDIM 16
#define N 13

__global__ static void set_global_idx(int n, int *d_a)
{
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx < n)
       d_a[idx] = idx;

}

__global__ static void set_block_idx(int n, int *d_a)
{
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx < n)
       d_a[idx] = blockIdx.x;

}

__global__ static void set_thread_idx(int n, int *d_a)
{
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx < n)
       d_a[idx] = threadIdx.x;

}

__global__ static void stencil_naive(int n, int *in, int *out)
{
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   int i;
   int value = 0;
   if (idx < n) {
       for (i=-RADIUS; i<=RADIUS; ++i)
       {
           value += in[idx+i+RADIUS];
       }
   }
   out[idx] = value;

}

__global__ static void stencil(int n, int *in, int *out)
{
   __shared__ int shared[BLOCKDIM + 2*RADIUS];
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   int idx_local = threadIdx.x + RADIUS;
   shared[idx_local] = in[idx + RADIUS];
   int i;
   if (idx < n) {
       if( threadIdx.x < RADIUS)
       {
           shared[idx_local - RADIUS] = in[idx ];
           shared[idx_local + BLOCKDIM] = in[idx + BLOCKDIM + RADIUS];
       }
   }
   __syncthreads();
   int value = 0;
   for (i=-RADIUS; i<=RADIUS; i++)
   {
       value += shared[idx_local + i];
   }
   out[idx] = value;

}

__global__ static void matrix_multiply_naive(int n, int *in_a, int *in_b, int *out)
{
   int tmp = gridDim.x*blockDim.x;
   int row = threadIdx.x + blockIdx.x*blockDim.x;
   int col = threadIdx.y + blockIdx.y*blockDim.y;
   int idx = row*tmp + col;
   int sum = 0;
   if (idx < n)
   {
       for (int i=0; i<N; i++)
       {
           sum += in_a[row*N+i]*in_b[i*N+col];
       }
   }
   out[row*N+col] = sum;

   /*int row = threadIdx.x + blockIdx.x*blockDim.x;
   int col = threadIdx.y + blockIdx.y*blockDim.y;
   int idx = row*gridDim.x*blockDim.x + col;
   if (idx < n)
   {
       out[idx] = idx;
   }*/


}

// Start the main SDK sample here
int main(int argc, char** argv)
{
   printf(" ###########################\n");
    printf(" #test for cuda toolkit ...#\n");
   printf(" ###########################\n");

   int dim = 16;
   int mem_size = dim*sizeof(int);

   int *d_a, *h_a, *d_b;
   h_a = (int*)malloc(mem_size);
   cudaMalloc((void**)&d_a, mem_size);
   cudaMalloc((void**)&d_b, mem_size);

   if ( 0==h_a || 0==d_a)
   {
       printf("could not allocate memory\n");
       return 1;
   }
   cudaMemset(d_a, 0, mem_size);
   cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

   for (int i=0; i<dim; ++i)
   {
       printf("%2d ", h_a[i]);
   }
   printf("\n");

   int dim_block = 3;
   int dim_grid = (dim + dim_block -1)/dim_block;
   set_global_idx<<<dim_grid, dim_block>>> (dim, d_a);
   cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

   for (int i=0; i<dim; ++i)
   {
       printf("%2d ", h_a[i]);
   }
   printf("\n");

   set_block_idx<<<dim_grid, dim_block>>> (dim, d_a);
   cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

   for (int i=0; i<dim; ++i)
   {
       printf("%2d ", h_a[i]);
   }
   printf("\n");

   set_thread_idx<<<dim_grid, dim_block>>> (dim, d_a);
   cudaMemcpy(d_b, d_a, mem_size, cudaMemcpyDeviceToDevice);
   cudaMemcpy(h_a, d_b, mem_size, cudaMemcpyDeviceToHost);

   for (int i=0; i<dim; ++i)
   {
       printf("%2d ", h_a[i]);
   }
   printf("\n");

   free(h_a);
   cudaFree(d_a);
   cudaFree(d_b);

   ///
   printf("\n");
   printf(" #######################\n");
   printf(" #stencil test ...... #\n");
   printf(" #######################\n");
   dim = 80;
   int dim_pro = dim + RADIUS*2;
   int mem_size_pro = sizeof(int)*dim_pro;
   mem_size = sizeof(int)*dim;
   h_a = (int*)malloc(mem_size_pro);
   memset(h_a, 0, mem_size_pro);
   int *h_b;
   cudaMalloc((void**)&d_a, mem_size_pro);
   cudaMalloc((void**)&d_b, mem_size);
   for (int i=0; i<dim; i++)
   {
       h_a[i+RADIUS] = i;
   }
   h_b = &(h_a[RADIUS]);
   printf(" original vector ... \n");
   for (int i=0; i<dim; ++i)
   {
       if (i%10 == 0)
           printf("\n");
       printf("%5d ", h_b[i]);
   }
   printf("\n\n\n");
   cudaMemcpy(d_a, h_a, mem_size_pro, cudaMemcpyHostToDevice);
   dim_grid = (dim + BLOCKDIM -1)/BLOCKDIM;
   //cudaMemset(d_b, 0, mem_size);
   stencil<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
   //stencil_naive<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
   cudaMemcpy(h_b, d_b, mem_size, cudaMemcpyDeviceToHost);


   printf(" after stencil   ... \n");
   for (int i=0; i<dim; ++i)
   {
       if (i%10 == 0)
           printf("\n");
       printf("%5d ", h_b[i]);
   }
   printf("\n");

   free(h_a);
   cudaFree(d_a);
   cudaFree(d_b);

   ///
   printf("\n");
   printf(" ##########################\n");
   printf(" #matrix multiply ...... #\n");
   printf(" ##########################\n");

   int row = N;
   int col = N;
   dim = row*col;
   mem_size = dim*sizeof(int);
   h_a = (int*)malloc(mem_size);
   int *d_c;
   cudaMalloc((void**)&d_a, mem_size);
   cudaMalloc((void**)&d_b, mem_size);
   cudaMalloc((void**)&d_c, mem_size);
   printf(" matrix A & B : \n");
   for (int i=0; i<dim; ++i)
   {
       h_a[i] = 1;
       if (i%row == 0)
       {
           printf("\n");
       }
       printf("%5d ", h_a[i]);
   }
   cudaMemcpy(d_a, h_a, mem_size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_b, h_a, mem_size, cudaMemcpyHostToDevice);
   dim_block = 2;
   dim3 block(dim_block, dim_block);
   dim3 grid((row+dim_block-1)/dim_block, (col+dim_block-1)/dim_block);
   cudaMemset(d_c, 0, mem_size);
   matrix_multiply_naive<<<grid, block>>> (dim, d_a, d_b, d_c);

   //memset(h_a, 0,mem_size);
   cudaMemcpy(h_a, d_c, mem_size, cudaMemcpyDeviceToHost);
   printf("\n matrix C : \n");
   for (int i=0; i<dim; ++i)
   {
       if (i%row == 0)
       {
           printf("\n");
       }
       printf("%5d ", h_a[i]);
   }


   return 0;
}

——————————————————————————————————————————————————————————————————

#include <stdio.h>

#include <stdlib.h>
#define RADIUS 3
#define BLOCK_SIZE 16

__global__ void stencil(int* in, int* out)

{
__shared__ int shared[BLOCK_SIZE + 2 * RADIUS];
int globIdx = blockIdx.x * blockDim.x + threadIdx.x;
int locIdx = threadIdx.x + RADIUS;
shared[locIdx] = in[globIdx];

if (threadIdx.x < RADIUS)
{
    shared[locIdx – RADIUS] = in[globIdx – RADIUS];
    shared[locIdx + BLOCK_DIMX] = in[globIdx + BLOCK_SIZE];
}
__syncthreads();

int value = 0;
for (offset = - RADIUS; offset <= RADIUS; offset++)
    value += shared[locIdx + offset];
out[globIdx] = value;

}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(-1);
    }
}

int main( int argc, char** argv)
{
    int *h_a;
    int *d_a;
    int numBlocks = 16;
    int numThreadsPerBlock = 16;

    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void**)&d_a, memSize );

    if( 0==h_a || 0==d_a )
    {
        printf("couldn't allocate memory\n");
        return 1;
    }

    dim3 dimGrid(16,16);
    dim3 dimBlock(16);
    stencil<<< dimGrid,dimBlock>>>(RADIUS);

   cudaThreadSynchronize();

   checkCUDAError("kernel execution");

   cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

   checkCUDAError("cudaMemcpy");

   for (int i = 0; i < 16 ; i++)
    {
        for (int j = 0; j < 16 ; j++)
        {
            assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j);
        }
    }

    cudaFree(d_a);
   free(h_a);
   printf("Correct!\n");
   return 0;
}