CUDA:常量内存使用(2)

<img src="" alt="" />
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "assert.h"
#include "conio.h"
using namespace std;
// Short integer aliases used throughout the file. Kept as macros so that every
// existing use keeps expanding exactly as before.
#define u16 unsigned short int
#define u32 unsigned int

// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the error string and its numeric code, resets the
// device and asserts. The do { } while (0) wrapper makes the macro behave as a
// single statement, so it is safe inside an un-braced if/else; call sites must
// terminate it with a semicolon.
#define CUDA_CALL(x)                                          \
	do {                                                      \
		const cudaError_t a = (x);                            \
		if (a != cudaSuccess) {                               \
			printf("\nCUDAError:%s(err_num= %d) \n",          \
				cudaGetErrorString(a), a);                    \
			cudaDeviceReset();                                \
			assert(0);                                        \
		}                                                     \
	} while (0)

// Number of entries in each lookup table; also the trip count of the kernels'
// inner loop.
#define KERNEL_LOOP 4096

// Table placed in constant memory. It must NOT be const-qualified: it has no
// initializer and is filled at run time through cudaMemcpyToSymbol.
__constant__ static u32 const_data_gpu[KERNEL_LOOP];

// Table of the same size placed in global device memory.
__device__ static u32 gmem_data_gpu[KERNEL_LOOP];

// Host-side staging buffer that is filled with random values and then uploaded
// to the device tables.
static u32 const_data_host[KERNEL_LOOP];
// Kernel that exercises the global-memory table gmem_data_gpu.
//
// Launch layout: 1D grid of 1D blocks; one thread per output element.
//   data         - device buffer with room for at least num_elements values
//   num_elements - number of valid outputs; threads whose flat index is at or
//                  beyond this value do nothing
//
// Every participating thread starts from gmem_data_gpu[0], folds each of the
// KERNEL_LOOP table entries into an accumulator with XOR / OR / AND / OR, and
// stores the result in data[<flat thread index>]. The table index does not
// depend on the thread index, so all threads read the same entry in a given
// iteration.
__global__ void const_test_gpu_gmem(u32 * const data, const u32 num_elements)
{
	const u32 flat_idx = threadIdx.x + (blockIdx.x * blockDim.x);

	// Tail guard: the grid may contain more threads than there are elements.
	if (flat_idx >= num_elements)
	{
		return;
	}

	u32 acc = gmem_data_gpu[0];
	for (u32 k = 0; k != KERNEL_LOOP; ++k)
	{
		acc = acc ^ gmem_data_gpu[k];
		acc = acc | gmem_data_gpu[k];
		acc = acc & gmem_data_gpu[k];
		acc = acc | gmem_data_gpu[k];
	}

	data[flat_idx] = acc;
}
// Kernel that exercises the constant-memory table const_data_gpu.
//
// Launch layout: 1D grid of 1D blocks; one thread per output element.
//   data         - device buffer with room for at least num_elements values
//   num_elements - number of valid outputs; threads with tid >= num_elements
//                  do nothing
//
// Performs the same XOR / OR / AND / OR fold as const_test_gpu_gmem, but reads
// the table from const_data_gpu. (Previously this kernel read gmem_data_gpu,
// which made it identical to the global-memory kernel and the comparison
// between the two meaningless.) The table index does not depend on tid, so
// all threads read the same entry in a given iteration.
__global__ void const_test_gpu_const(u32 * const data, const u32 num_elements)
{
	const u32 tid = (blockDim.x*blockIdx.x) + threadIdx.x;
	if (tid < num_elements)
	{
		u32 d = const_data_gpu[0];
		for (u32 i = 0; i < KERNEL_LOOP; i++)
		{
			d ^= const_data_gpu[i];
			d |= const_data_gpu[i];
			d &= const_data_gpu[i];
			d |= const_data_gpu[i];
		}
		data[tid] = d;
	}
}
// Prints a prompt and blocks until a key is pressed (getch from conio.h).
__host__ void wait_exit(void)
{
	printf("\nPress any key to exit");
	// The prompt has no trailing newline, so flush explicitly to make sure it
	// is visible before blocking on the key press.
	fflush(stdout);
	// The key itself is irrelevant; discard it instead of storing it in an
	// otherwise unused local.
	(void)getch();
}
// Checks whether the CUDA runtime has a pending error and, if so, reports it
// and terminates the program.
//   prefix  - text printed immediately before the CUDA error string
//   postfix - text printed immediately after the CUDA error string
// Returns normally only when no error is pending. Otherwise it prints the
// message, resets the device, waits for a key press and exits with status 1.
__host__ void cuda_error_check(
	const char * prefix,
	const char * postfix)
{
	// Peek first so that the success path leaves the error state untouched.
	if (cudaPeekAtLastError() == cudaSuccess)
	{
		return;
	}

	// Fetch the error (cudaGetLastError also clears it) and turn it into text.
	const cudaError_t pending = cudaGetLastError();
	const char * const description = cudaGetErrorString(pending);
	printf("\n%s%s%s", prefix, description, postfix);

	cudaDeviceReset();
	wait_exit();
	exit(1);
}
// Fills the first KERNEL_LOOP entries of host_data_ptr with values from rand().
//   host_data_ptr - host buffer with room for at least KERNEL_LOOP u32 values
__host__ void generate_rand_data(u32 * host_data_ptr)
{
	u32 * cursor = host_data_ptr;
	u32 * const past_end = host_data_ptr + KERNEL_LOOP;

	while (cursor != past_end)
	{
		*cursor = (u32)rand();
		++cursor;
	}
}
// Benchmarks the global-memory kernel against the constant-memory kernel on
// every CUDA device reported by the runtime.
//
// For each device and each of max_runs repetitions:
//   1. fresh random data is generated on the host;
//   2. it is uploaded to gmem_data_gpu, a warm-up launch of
//      const_test_gpu_gmem is issued, then a second launch is timed with
//      CUDA events;
//   3. the same data is uploaded to const_data_gpu, a warm-up launch of
//      const_test_gpu_const is issued, then a second launch is timed;
//   4. the two timings are printed together with their difference.
// All work is issued on stream 0. Any CUDA failure aborts via CUDA_CALL or
// cuda_error_check. Waits for a key press before returning.
__host__ void gpu_kernel(void)
{
	const u32 num_elements = (128 * 1024);
	const u32 num_threads = 256;
	// Ceil-div so that every element is covered by a thread.
	const u32 num_blocks = (num_elements + (num_threads - 1)) / num_threads;
	const u32 num_bytes = num_elements*sizeof(u32);
	const u32 table_bytes = KERNEL_LOOP*sizeof(u32);
	int max_device_num;
	const int max_runs = 6;
	CUDA_CALL(cudaGetDeviceCount(&max_device_num));
	for (int device_num = 0; device_num < max_device_num; device_num++)
	{
		CUDA_CALL(cudaSetDevice(device_num));
		u32 *data_gpu;
		cudaEvent_t kernel_start1, kernel_stop1;	// bracket the gmem kernel
		cudaEvent_t kernel_start2, kernel_stop2;	// bracket the const kernel
		float gmem_ms = 0.0F, const_ms = 0.0F;
		struct cudaDeviceProp device_prop;
		// Sized from the name field plus slack for "ID:", the device number,
		// the separators and the terminating NUL, so sprintf cannot overflow.
		char device_prefix[sizeof(device_prop.name) + 32];

		CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
		CUDA_CALL(cudaEventCreate(&kernel_start1));
		CUDA_CALL(cudaEventCreate(&kernel_start2));
		CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop1, cudaEventBlockingSync));
		CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop2, cudaEventBlockingSync));
		CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
		sprintf(device_prefix, "ID:%d %s:", device_num, device_prop.name);

		for (int num_test = 0; num_test < max_runs; num_test++)
		{
			generate_rand_data(const_data_host);

			// ---- global-memory version ----
			// Upload to the table the gmem kernel actually reads.
			CUDA_CALL(cudaMemcpyToSymbol(gmem_data_gpu, const_data_host, table_bytes));
			// Warm-up launch, not timed.
			const_test_gpu_gmem << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from gmem startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start1, 0));

			const_test_gpu_gmem << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from gmem runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop1, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop1));
			CUDA_CALL(cudaEventElapsedTime(&gmem_ms, kernel_start1, kernel_stop1));

			// ---- constant-memory version ----
			CUDA_CALL(cudaMemcpyToSymbol(const_data_gpu, const_data_host, table_bytes));
			// Warm-up launch, not timed.
			const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from constant startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start2, 0));

			const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from constant runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop2, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop2));
			CUDA_CALL(cudaEventElapsedTime(&const_ms, kernel_start2, kernel_stop2));

			// Report the difference (a subtraction, not an assignment) and
			// pair each label with the timing it really belongs to.
			if (gmem_ms > const_ms)
			{
				printf("\n%sConst version is faster by: %.2fms (Const=%.2fms vs. Gmem=%.2fms)", device_prefix, gmem_ms - const_ms, const_ms, gmem_ms);
			}
			else
			{
				printf("\n%sGmem version is faster by: %.2fms (Const=%.2fms vs. Gmem=%.2fms)", device_prefix, const_ms - gmem_ms, const_ms, gmem_ms);
			}
		}
		CUDA_CALL(cudaEventDestroy(kernel_start1));
		CUDA_CALL(cudaEventDestroy(kernel_start2));
		CUDA_CALL(cudaEventDestroy(kernel_stop1));
		CUDA_CALL(cudaEventDestroy(kernel_stop2));
		CUDA_CALL(cudaFree(data_gpu));
		CUDA_CALL(cudaDeviceReset());
		printf("\n");
	}
	wait_exit();
}

// Program entry point: runs the benchmark on every device, then blocks on one
// more read from standard input before returning success.
int main()
{
	gpu_kernel();

	// Hold the console open until the user supplies input.
	std::cin.get();

	return 0;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

RtZero

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值