#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
// Threads per block; also the per-block tile size the kernel stages in
// shared memory, so the two must stay equal.
const int numThreadsPerBlock = 256;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 2 of 2: implement the kernel
// Reverses an array block-wise: each block stages its slice of d_in in
// shared memory in reversed order, then writes that slice to the mirrored
// block position in d_out. Assumes a 1-D exact-fit launch where
// blockDim.x == numThreadsPerBlock and gridDim.x * blockDim.x == array size.
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
    __shared__ int s_data[numThreadsPerBlock];

    // Load this block's slice into shared memory, reversed within the block.
    int readIdx = blockDim.x * blockIdx.x + threadIdx.x;
    s_data[blockDim.x - 1 - threadIdx.x] = d_in[readIdx];

    // Every thread must finish its shared-memory write before any read.
    __syncthreads();

    // Emit the reversed slice at the mirrored block offset.
    int writeBase = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    d_out[writeBase + threadIdx.x] = s_data[threadIdx.x];
}
// Program main
// Program main: reverses a 1 MB int array on the GPU via reverseArrayBlock,
// times the round trip with CUDA events, and verifies the result on the host.
int main(int argc, char** argv)
{
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // Compute number of blocks needed based on array size and desired block size
    // (dimA is an exact multiple of numThreadsPerBlock, so no remainder block)
    int numBlocks = dimA / numThreadsPerBlock;

    // Bytes of shared memory per block for the kernel invocation below.
    // FIX: this was 256 (bytes); the kernel's tile is 256 ints, so the
    // correct footprint is numThreadsPerBlock * sizeof(int).
    int sharedMemSize = numThreadsPerBlock * (int)sizeof(int);

    // allocate host memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed.\n", memSize);
        exit(EXIT_FAILURE);
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Record the start time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // allocate device memory
    int *d_b, *d_a;
    cudaMalloc((void **) &d_a, memSize);
    cudaMalloc((void **) &d_b, memSize);
    checkCUDAError("cudaMalloc");

    // Copy host array to device array
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy host to device");

    // launch kernel: one thread per element
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock, sharedMemSize >>>(d_b, d_a);

    // Block until the device has completed.
    // FIX: cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
    cudaDeviceSynchronize();

    // Check for any CUDA errors from the launch or execution
    checkCUDAError("kernel invocation");

    // device to host copy
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // Record the stop time and report the elapsed time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Time is: %3.1f ms\n", elapsedTime);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");
    return 0;
}
// Reports the most recent CUDA runtime error (if any) to stderr, prefixed
// with the caller-supplied context string, and terminates the program.
// No-op when the runtime reports cudaSuccess.
void checkCUDAError(const char *msg)
{
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// CUDA learning exercise: reversing an array using shared memory.