#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define N (2048*2048)
#define THREADS_PER_BLOCK 256 //16*16
#define TILE_WIDTH 16
#define width 2048
// Element-wise addition of two row-major width x (N/width) int matrices:
// dev_c[i] = dev_a[i] + dev_b[i].
//
// Expected launch layout: 2-D grid of (TILE_WIDTH x TILE_WIDTH) blocks
// covering the whole matrix; no dynamic shared memory is required (the
// staging tiles below are statically sized).
//
// Note: staging through shared memory is for demonstration only — each
// thread reads back exactly the element it wrote, so nothing is actually
// shared between threads here.
__global__ void Add(int *dev_a, int *dev_b, int *dev_c)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// Flat row-major index: row = by*blockDim.y + ty, col = bx*blockDim.x + tx.
int i = (by * blockDim.y + ty) * width + bx * blockDim.x + tx;
// Guard the grid tail so a grid that over-covers the matrix cannot read or
// write out of bounds. Kept as a flag (not an early return) so that every
// thread in the block still reaches __syncthreads() — a barrier inside
// divergent control flow is undefined behavior.
bool inBounds = (i < N);
__shared__ int A[TILE_WIDTH][TILE_WIDTH];
__shared__ int B[TILE_WIDTH][TILE_WIDTH];
if (inBounds) {
A[ty][tx] = dev_a[i];
B[ty][tx] = dev_b[i];
}
__syncthreads();
if (inBounds) {
dev_c[i] = A[ty][tx] + B[ty][tx];
}
}
// Abort with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

int main( void ) {
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
size_t size = (size_t)N * sizeof(int); // we need space for N integers
// allocate device copies of a, b, c
checkCuda(cudaMalloc((void**)&dev_a, size), "cudaMalloc dev_a");
checkCuda(cudaMalloc((void**)&dev_b, size), "cudaMalloc dev_b");
checkCuda(cudaMalloc((void**)&dev_c, size), "cudaMalloc dev_c");
a = (int*)malloc(size);
b = (int*)malloc(size);
c = (int*)malloc(size);
if (a == NULL || b == NULL || c == NULL) {
fprintf(stderr, "host malloc failed\n");
return 1;
}
// Bound the operands so a[i] + b[i] cannot overflow: on platforms where
// RAND_MAX == INT_MAX, adding two raw rand() values is undefined behavior.
for (int i = 0; i < N; ++i)
{
a[i] = rand() % 65536;
b[i] = rand() % 65536;
}
// copy inputs to device
checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "memcpy a H2D");
checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "memcpy b H2D");
// One thread per element: (width/TILE_WIDTH)^2 = 128x128 blocks of 16x16.
dim3 dimGrid(width / TILE_WIDTH, width / TILE_WIDTH, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
cudaEvent_t timeStartEvent, timeEndEvent;
checkCuda(cudaEventCreate(&timeStartEvent), "event create start");
checkCuda(cudaEventCreate(&timeEndEvent), "event create end");
checkCuda(cudaEventRecord(timeStartEvent, 0), "record start");
// launch add() kernel with blocks and threads. The kernel's tiles are
// statically declared __shared__, so no dynamic shared-memory size is
// passed (the old third launch argument was unused by the kernel).
Add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
checkCuda(cudaGetLastError(), "kernel launch");
// copy device result back to host copy of c (blocking, so the kernel has
// finished by the time the end event is recorded below)
checkCuda(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "memcpy c D2H");
// Record the end event immediately after the GPU work. Previously it was
// recorded after the verification loop and all the frees, so the reported
// time included host-side work and used events after teardown had begun.
checkCuda(cudaEventRecord(timeEndEvent, 0), "record end");
checkCuda(cudaEventSynchronize(timeEndEvent), "sync end event");
float elapsedTime = 0.0f;
checkCuda(cudaEventElapsedTime(&elapsedTime, timeStartEvent, timeEndEvent),
"elapsed time");
std::cout << "elapsedTime is " << elapsedTime << " ms. " << std::endl;
// verify the data returned to the host is correct
for (int i = 0; i < N; i++)
{
assert(c[i] == a[i] + b[i]);
}
free( a );
free( b );
free( c );
checkCuda(cudaFree(dev_a), "cudaFree dev_a");
checkCuda(cudaFree(dev_b), "cudaFree dev_b");
checkCuda(cudaFree(dev_c), "cudaFree dev_c");
checkCuda(cudaEventDestroy(timeStartEvent), "destroy start event");
checkCuda(cudaEventDestroy(timeEndEvent), "destroy end event");
return 0;
}
// CUDA study note: parallel matrix addition implemented with shared memory.
// (Blog footer from the original source: latest recommended article published 2024-02-14 01:09:01.)