#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
// Threads per block; also the per-block tile size the kernel stages in
// shared memory, so the two must stay equal.
const int numThreadsPerBlock = 256;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 2 of 2: implement the kernel
// Reverses an array block-wise: each block stages its slice of d_in in
// shared memory in reversed order, then writes that slice to the mirrored
// block position in d_out. Assumes a 1-D exact-fit launch where
// blockDim.x == numThreadsPerBlock and gridDim.x * blockDim.x == array size.
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
    __shared__ int s_data[numThreadsPerBlock];

    // Load this block's slice into shared memory, reversed within the block.
    int readIdx = blockDim.x * blockIdx.x + threadIdx.x;
    s_data[blockDim.x - 1 - threadIdx.x] = d_in[readIdx];

    // Every thread must finish its shared-memory write before any read.
    __syncthreads();

    // Emit the reversed slice at the mirrored block offset.
    int writeBase = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    d_out[writeBase + threadIdx.x] = s_data[threadIdx.x];
}
// Program main
// Program main: reverses a 1 MB int array on the GPU via reverseArrayBlock,
// times the round trip with CUDA events, and verifies the result on the host.
int main(int argc, char** argv)
{
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // Compute number of blocks needed based on array size and desired block size
    // (dimA is an exact multiple of numThreadsPerBlock, so no remainder block)
    int numBlocks = dimA / numThreadsPerBlock;

    // Bytes of shared memory per block for the kernel invocation below.
    // FIX: this was 256 (bytes); the kernel's tile is 256 ints, so the
    // correct footprint is numThreadsPerBlock * sizeof(int).
    int sharedMemSize = numThreadsPerBlock * (int)sizeof(int);

    // allocate host memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed.\n", memSize);
        exit(EXIT_FAILURE);
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Record the start time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // allocate device memory
    int *d_b, *d_a;
    cudaMalloc((void **) &d_a, memSize);
    cudaMalloc((void **) &d_b, memSize);
    checkCUDAError("cudaMalloc");

    // Copy host array to device array
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy host to device");

    // launch kernel: one thread per element
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock, sharedMemSize >>>(d_b, d_a);

    // Block until the device has completed.
    // FIX: cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
    cudaDeviceSynchronize();

    // Check for any CUDA errors from the launch or execution
    checkCUDAError("kernel invocation");

    // device to host copy
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // Record the stop time and report the elapsed time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Time is: %3.1f ms\n", elapsedTime);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");
    return 0;
}
// Reports the most recent CUDA runtime error (if any) to stderr, prefixed
// with the caller-supplied context string, and terminates the program.
// No-op when the runtime reports cudaSuccess.
void checkCUDAError(const char *msg)
{
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// CUDA learning exercise: reversing an array using shared memory.