CUDA学习,使用shared memory实现Matrix Parallel Add

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Bring the std namespace into scope for the rest of the file.
using namespace std;

// Total number of matrix elements: 2048 rows * 2048 columns.
#define N (2048*2048)
// NOTE(review): not referenced anywhere in the code visible here; the launch
// uses a 16x16 dim3 block instead.
#define THREADS_PER_BLOCK 256  //16*16
// Edge length of the square shared-memory tiles declared in the Add kernel.
#define TILE_WIDTH 16
// Row length of the matrix; the Add kernel uses it as the row stride when
// flattening (row, column) into a linear index.
// NOTE(review): this is a lowercase macro, so any later identifier spelled
// `width` will be textually replaced by 2048.
#define width 2048

/*
 * Add: element-wise sum of two row-major width x width int matrices,
 *      dev_c[i] = dev_a[i] + dev_b[i].
 *
 * Launch layout: 2D grid of 2D blocks, one thread per matrix element.
 *   - blockDim must be (TILE_WIDTH, TILE_WIDTH) for every element of a block's
 *     tile to be processed; threads whose threadIdx falls outside the tile are
 *     masked off so they can never write past the shared arrays.
 *   - The grid may over-cover the matrix; threads that map outside the
 *     width x width range do nothing.
 *
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles. No dynamic
 * shared memory is required.
 *
 * Parameters (all device pointers to at least width*width ints):
 *   dev_a, dev_b - input matrices
 *   dev_c        - output matrix
 */
__global__ void Add(int *dev_a, int *dev_b,int *dev_c)
{
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Global 2D coordinates of the element handled by this thread.
	const int col = blockIdx.x * blockDim.x + tx;
	const int row = blockIdx.y * blockDim.y + ty;

	// A thread participates only if it fits inside the shared tile AND maps
	// to a real matrix element.
	const bool active = (tx < TILE_WIDTH) && (ty < TILE_WIDTH)
	                 && (col < width) && (row < width);

	// Row-major linear index; only computed for in-range threads so the
	// multiplication cannot overflow for out-of-range rows.
	const int i = active ? (row * width + col) : 0;

	__shared__ int A[TILE_WIDTH][TILE_WIDTH];
	__shared__ int B[TILE_WIDTH][TILE_WIDTH];

	// Stage both operands in shared memory.
	if (active)
	{
		A[ty][tx] = dev_a[i];
		B[ty][tx] = dev_b[i];
	}

	// The barrier stays outside the conditional so every thread of the block
	// reaches it, including masked-off ones.
	__syncthreads();

	// Each thread reads back exactly the tile slot it wrote itself, so the
	// result equals the direct form dev_c[i] = dev_a[i] + dev_b[i].
	if (active)
	{
		dev_c[i] = A[ty][tx] + B[ty][tx];
	}
}

// Print a diagnostic and terminate when a CUDA runtime call did not succeed.
static void checkCuda( cudaError_t status, const char *what )
{
	if (status != cudaSuccess)
	{
		fprintf( stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString( status ) );
		exit( EXIT_FAILURE );
	}
}

/*
 * Host driver: fills two N-element int matrices with random values, adds them
 * on the GPU with the Add kernel, reports the kernel's elapsed time, and
 * verifies the result against a host-side sum.
 *
 * Returns 0 when every element matches, EXIT_FAILURE otherwise.
 */
int main( void ) {

	int *a, *b, *c; // host copies of a, b, c

	int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL; // device copies of a, b, c

	const size_t size = (size_t)N * sizeof( int ); // we need space for N integers

	a = (int*)malloc( size );
	b = (int*)malloc( size );
	c = (int*)malloc( size );
	if (a == NULL || b == NULL || c == NULL)
	{
		fprintf( stderr, "host allocation of %lu bytes failed\n", (unsigned long)size );
		free( a );
		free( b );
		free( c );
		return EXIT_FAILURE;
	}

	// Halve each random value so a[i] + b[i] can never overflow a signed int,
	// even on platforms where RAND_MAX == INT_MAX.
	for (int i = 0; i < N; ++i)
	{
		a[i] = rand() / 2;
		b[i] = rand() / 2;
	}

	// allocate device copies of a, b, c
	checkCuda( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" );
	checkCuda( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" );
	checkCuda( cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)" );

	// copy inputs to device
	checkCuda( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(a -> dev_a)" );
	checkCuda( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(b -> dev_b)" );

	// One thread per element: TILE_WIDTH x TILE_WIDTH threads per block and
	// enough blocks to tile the width x width matrix (128 x 128 here).
	dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 );
	dim3 dimGrid( width / TILE_WIDTH, width / TILE_WIDTH, 1 );

	cudaEvent_t timeStartEvent, timeEndEvent;
	checkCuda( cudaEventCreate( &timeStartEvent ), "cudaEventCreate(start)" );
	checkCuda( cudaEventCreate( &timeEndEvent ), "cudaEventCreate(end)" );

	// Bracket ONLY the kernel with the two events so the reported time is the
	// kernel's execution time, not the copy-back / verification / cleanup.
	checkCuda( cudaEventRecord( timeStartEvent, 0 ), "cudaEventRecord(start)" );

	// The kernel declares its tiles as static __shared__ arrays, so no
	// dynamic shared-memory size is passed at launch.
	Add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c );
	checkCuda( cudaGetLastError(), "Add kernel launch" );

	checkCuda( cudaEventRecord( timeEndEvent, 0 ), "cudaEventRecord(end)" );
	checkCuda( cudaEventSynchronize( timeEndEvent ), "cudaEventSynchronize(end)" );

	float elapsedTime = 0;
	checkCuda( cudaEventElapsedTime( &elapsedTime, timeStartEvent, timeEndEvent ),
	           "cudaEventElapsedTime" );

	// copy device result back to host copy of c
	checkCuda( cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(dev_c -> c)" );

	// Verify the data returned to the host. An explicit comparison is used
	// instead of assert() so the check still runs when NDEBUG is defined.
	int mismatches = 0;
	for (int i = 0; i < N; i++)
	{
		if (c[i] != a[i] + b[i])
		{
			if (mismatches == 0)
			{
				fprintf( stderr, "mismatch at index %d: got %d, expected %d\n",
				         i, c[i], a[i] + b[i] );
			}
			++mismatches;
		}
	}
	if (mismatches != 0)
	{
		fprintf( stderr, "%d of %d elements are wrong\n", mismatches, (int)N );
	}

	std::cout<< "elapsedTime is  " << elapsedTime << " ms. ";

	checkCuda( cudaEventDestroy( timeStartEvent ), "cudaEventDestroy(start)" );
	checkCuda( cudaEventDestroy( timeEndEvent ), "cudaEventDestroy(end)" );

	free( a );
	free( b );
	free( c );
	checkCuda( cudaFree( dev_a ), "cudaFree(dev_a)" );
	checkCuda( cudaFree( dev_b ), "cudaFree(dev_b)" );
	checkCuda( cudaFree( dev_c ), "cudaFree(dev_c)" );

	return mismatches == 0 ? 0 : EXIT_FAILURE;
}

     
     
    
    
   
   
  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值