CUDA:常量内存的使用

<img src="" alt="" />
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include "assert.h"
#include "conio.h"
using namespace std;

// Evaluates a CUDA runtime call exactly once. If the result is not cudaSuccess,
// prints the error string and numeric code, resets the device and trips assert(0).
// The body is wrapped in do { ... } while (0) so that `CUDA_CALL(x);` is a single
// statement and stays correct inside un-braced if/else; the local uses a
// macro-specific name so it cannot shadow a variable used in the argument.
// NOTE(review): assert(0) compiles away under NDEBUG, in which case execution
// continues after the device reset.
#define CUDA_CALL(x) \
	do { \
		const cudaError_t cuda_call_err_ = (x); \
		if (cuda_call_err_ != cudaSuccess) { \
			printf("\nCUDAError:%s(err_num= %d) \n", cudaGetErrorString(cuda_call_err_), static_cast<int>(cuda_call_err_)); \
			cudaDeviceReset(); \
			assert(0); \
		} \
	} while (0)

// Number of iterations of the per-thread bit-manipulation loop in both kernels.
#define KERNEL_LOOP 65536

// Operands read by const_test_gpu_const, declared with the __constant__ qualifier.
// Their values mirror the literals hard-coded in const_test_gpu_literal.
__constant__ static const int const_data_01 = 0x55555555;
__constant__ static const int const_data_02 = 0x77777777;
__constant__ static const int const_data_03 = 0x33333333;
__constant__ static const int const_data_04 = 0x11111111;
// Benchmark kernel whose bitwise operands are written as immediate literals.
// Every thread whose global index is below num_elements runs KERNEL_LOOP rounds
// of XOR / OR / AND / OR on a local accumulator and stores the final value in
// data[index]. Threads with an index at or beyond num_elements do nothing.
__global__ void const_test_gpu_literal(int * const data, const int num_elements)
{
	const int global_idx = (blockDim.x*blockIdx.x) + threadIdx.x;

	// Guard the tail of the grid: no work for threads past the buffer end.
	if (global_idx >= num_elements)
	{
		return;
	}

	int acc = 0x55555555;
	int rounds_left = KERNEL_LOOP;
	while (rounds_left > 0)
	{
		acc = acc ^ 0x55555555;
		acc = acc | 0x77777777;
		acc = acc & 0x33333333;
		acc = acc | 0x11111111;
		--rounds_left;
	}

	data[global_idx] = acc;
}
// Benchmark kernel whose bitwise operands come from the const_data_01..04
// symbols instead of inline literals. Every thread whose global index is below
// num_elements runs KERNEL_LOOP rounds of XOR / OR / AND / OR on a local
// accumulator and stores the final value in data[index]. Threads with an index
// at or beyond num_elements do nothing.
__global__ void const_test_gpu_const(int * const data, const int num_elements)
{
	const int global_idx = (blockDim.x*blockIdx.x) + threadIdx.x;

	// Guard the tail of the grid: no work for threads past the buffer end.
	if (global_idx >= num_elements)
	{
		return;
	}

	int acc = const_data_01;
	int rounds_left = KERNEL_LOOP;
	while (rounds_left > 0)
	{
		acc = acc ^ const_data_01;
		acc = acc | const_data_02;
		acc = acc & const_data_03;
		acc = acc | const_data_04;
		--rounds_left;
	}

	data[global_idx] = acc;
}
// Prints an exit prompt and blocks until getch() returns a key press.
// The key value itself is not used.
__host__ void wait_exit(void)
{
	printf("\nPress any key to exit");
	const char pressed = getch();
	(void)pressed;
}
// Checks the CUDA runtime's last-error state (typically right after a kernel
// launch). When an error is pending, prints prefix + error string + postfix,
// resets the device, waits for a key press and terminates with exit code 1.
// Returns normally when no error is pending.
__host__ void cuda_error_check(
	const char * prefix,
	const char * postfix)
{
	// Peek first so the error state is only consumed when there is one to report.
	if (cudaPeekAtLastError() == cudaSuccess)
	{
		return;
	}

	const char * const reason = cudaGetErrorString(cudaGetLastError());
	printf("\n%s%s%s", prefix, reason, postfix);

	cudaDeviceReset();
	wait_exit();
	exit(1);
}
// Benchmarks const_test_gpu_literal against const_test_gpu_const on every CUDA
// device reported by cudaGetDeviceCount.
//
// For each device, max_runs times:
//   1. launch the literal kernel once untimed (warm-up), then once between a
//      start/stop event pair;
//   2. do the same for the constant-symbol kernel;
//   3. print which variant was faster, by how much, and both timings.
// The device is reset after its runs, and wait_exit() is called once at the end.
// Any CUDA API failure goes through CUDA_CALL; launch errors go through
// cuda_error_check, which terminates the process.
__host__ void gpu_kernel(void)
{
	const int num_elements = (128 * 1024);
	const int num_threads = 256;
	// Ceil-divide so every element is covered; the kernels bounds-check the tail.
	const int num_blocks = (num_elements + (num_threads - 1)) / num_threads;
	const size_t num_bytes = num_elements * sizeof(int);
	int max_device_num = 0;
	const int max_runs = 6;

	CUDA_CALL(cudaGetDeviceCount(&max_device_num));
	for (int device_num = 0; device_num < max_device_num; device_num++)
	{
		CUDA_CALL(cudaSetDevice(device_num));

		// The device name does not change between runs, so query it once per device.
		struct cudaDeviceProp device_prop;
		char device_prefix[261];
		CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
		// Bounded write: the prefix is truncated rather than overflowing the buffer.
		snprintf(device_prefix, sizeof(device_prefix), "ID:%d %s:", device_num, device_prop.name);

		for (int num_test = 0; num_test < max_runs; num_test++)
		{
			int *data_gpu = NULL;
			cudaEvent_t kernel_start1, kernel_stop1;
			cudaEvent_t kernel_start2, kernel_stop2;
			float literal_ms = 0.0F, const_ms = 0.0F;

			CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
			CUDA_CALL(cudaEventCreate(&kernel_start1));
			CUDA_CALL(cudaEventCreate(&kernel_start2));
			// Stop events use blocking sync so the host thread sleeps while waiting.
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop1, cudaEventBlockingSync));
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop2, cudaEventBlockingSync));

			// --- Literal version: untimed warm-up launch, then the timed launch. ---
			const_test_gpu_literal << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from literal starup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start1, 0));

			const_test_gpu_literal << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from literal runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop1, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop1));
			CUDA_CALL(cudaEventElapsedTime(&literal_ms, kernel_start1, kernel_stop1));

			// --- Constant version: must launch const_test_gpu_const, otherwise the
			// comparison below would time the literal kernel against itself. ---
			const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from constant starup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start2, 0));

			const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
			cuda_error_check("Error ", "return from constant runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop2, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop2));
			CUDA_CALL(cudaEventElapsedTime(&const_ms, kernel_start2, kernel_stop2));

			// Report the absolute difference, with each timing under its own label.
			if (literal_ms > const_ms)
			{
				printf("\n%sConst version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, literal_ms - const_ms, const_ms, literal_ms);
			}
			else
			{
				printf("\n%sLiteral version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, const_ms - literal_ms, const_ms, literal_ms);
			}

			CUDA_CALL(cudaEventDestroy(kernel_start1));
			CUDA_CALL(cudaEventDestroy(kernel_start2));
			CUDA_CALL(cudaEventDestroy(kernel_stop1));
			CUDA_CALL(cudaEventDestroy(kernel_stop2));
			// Release the per-run buffer so repeated runs do not accumulate allocations.
			CUDA_CALL(cudaFree(data_gpu));
		}
		CUDA_CALL(cudaDeviceReset());
		printf("\n");
	}
	wait_exit();
}
// Program entry point: runs the literal-vs-constant benchmark and reports success.
int main(void)
{
	gpu_kernel();

	return 0;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

RtZero

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值