#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define N (2048*2048)
#define THREADS_PER_BLOCK 256 //16*16
#define TILE_WIDTH 16
#define width 2048
// Element-wise addition of two row-major width x (N/width) int matrices:
// dev_c[i] = dev_a[i] + dev_b[i].
//
// Expected launch layout: 2-D grid of (TILE_WIDTH x TILE_WIDTH) blocks
// covering the whole matrix; no dynamic shared memory is required (the
// staging tiles below are statically sized).
//
// Note: staging through shared memory is for demonstration only — each
// thread reads back exactly the element it wrote, so nothing is actually
// shared between threads here.
__global__ void Add(int *dev_a, int *dev_b, int *dev_c)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// Flat row-major index: row = by*blockDim.y + ty, col = bx*blockDim.x + tx.
int i = (by * blockDim.y + ty) * width + bx * blockDim.x + tx;
// Guard the grid tail so a grid that over-covers the matrix cannot read or
// write out of bounds. Kept as a flag (not an early return) so that every
// thread in the block still reaches __syncthreads() — a barrier inside
// divergent control flow is undefined behavior.
bool inBounds = (i < N);
__shared__ int A[TILE_WIDTH][TILE_WIDTH];
__shared__ int B[TILE_WIDTH][TILE_WIDTH];
if (inBounds) {
A[ty][tx] = dev_a[i];
B[ty][tx] = dev_b[i];
}
__syncthreads();
if (inBounds) {
dev_c[i] = A[ty][tx] + B[ty][tx];
}
}
// Abort with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

int main( void ) {
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
size_t size = (size_t)N * sizeof(int); // we need space for N integers
// allocate device copies of a, b, c
checkCuda(cudaMalloc((void**)&dev_a, size), "cudaMalloc dev_a");
checkCuda(cudaMalloc((void**)&dev_b, size), "cudaMalloc dev_b");
checkCuda(cudaMalloc((void**)&dev_c, size), "cudaMalloc dev_c");
a = (int*)malloc(size);
b = (int*)malloc(size);
c = (int*)malloc(size);
if (a == NULL || b == NULL || c == NULL) {
fprintf(stderr, "host malloc failed\n");
return 1;
}
// Bound the operands so a[i] + b[i] cannot overflow: on platforms where
// RAND_MAX == INT_MAX, adding two raw rand() values is undefined behavior.
for (int i = 0; i < N; ++i)
{
a[i] = rand() % 65536;
b[i] = rand() % 65536;
}
// copy inputs to device
checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "memcpy a H2D");
checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "memcpy b H2D");
// One thread per element: (width/TILE_WIDTH)^2 = 128x128 blocks of 16x16.
dim3 dimGrid(width / TILE_WIDTH, width / TILE_WIDTH, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
cudaEvent_t timeStartEvent, timeEndEvent;
checkCuda(cudaEventCreate(&timeStartEvent), "event create start");
checkCuda(cudaEventCreate(&timeEndEvent), "event create end");
checkCuda(cudaEventRecord(timeStartEvent, 0), "record start");
// launch add() kernel with blocks and threads. The kernel's tiles are
// statically declared __shared__, so no dynamic shared-memory size is
// passed (the old third launch argument was unused by the kernel).
Add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
checkCuda(cudaGetLastError(), "kernel launch");
// copy device result back to host copy of c (blocking, so the kernel has
// finished by the time the end event is recorded below)
checkCuda(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "memcpy c D2H");
// Record the end event immediately after the GPU work. Previously it was
// recorded after the verification loop and all the frees, so the reported
// time included host-side work and used events after teardown had begun.
checkCuda(cudaEventRecord(timeEndEvent, 0), "record end");
checkCuda(cudaEventSynchronize(timeEndEvent), "sync end event");
float elapsedTime = 0.0f;
checkCuda(cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent),
"elapsed time");
std::cout << "elapsedTime is " << elapsedTime << " ms. " << std::endl;
// verify the data returned to the host is correct
for (int i = 0; i < N; i++)
{
assert(c[i] == a[i] + b[i]);
}
free( a );
free( b );
free( c );
checkCuda(cudaFree(dev_a), "cudaFree dev_a");
checkCuda(cudaFree(dev_b), "cudaFree dev_b");
checkCuda(cudaFree(dev_c), "cudaFree dev_c");
checkCuda(cudaEventDestroy(timeStartEvent), "destroy start event");
checkCuda(cudaEventDestroy(timeEndEvent), "destroy end event");
return 0;
}
// CUDA study note: parallel matrix addition implemented with shared memory.
// (Blog footer from the original source: latest recommended article published 2024-02-14 01:09:01.)