CUDA内存(三) 常量内存 __constant__

常量内存

常量内存其实只是全局内存的一种虚拟地址形式, 并没有专门保留的物理常量内存块. 常量内存有两个特性: 一是带有专用的高速缓存; 二是当线程束内所有线程读取同一地址时, 可以将该值一次广播到线程束的每个线程.

测试代码

ConstMem.h

#pragma once

#include "Global.h"
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Check a CUDA runtime call: on failure print the error, reset the device and abort.
// Wrapped in do { ... } while (0) so the macro expands to a single statement and is
// safe inside unbraced if/else bodies (the original bare-block form was not).
#define CUDA_CALL(x) do {                                                           \
	const cudaError_t a = (x);                                                      \
	if (a != cudaSuccess) {                                                         \
		printf("\nCUDA Error: %s (err_num=%d)\n", cudaGetErrorString(a), a);        \
		cudaDeviceReset();                                                          \
		assert(0);                                                                  \
	}                                                                               \
} while (0)

#define KERNEL_LOOP 65536

// Bit patterns used by const_test_gpu_const. __constant__ places them in the
// constant address space, which is cached and can broadcast a single value to
// every thread of a warp when all lanes read the same address.
__constant__ static const u32 const_data_01 = 0x55555555;
__constant__ static const u32 const_data_02 = 0x77777777;
__constant__ static const u32 const_data_03 = 0x33333333;
__constant__ static const u32 const_data_04 = 0x11111111;

// The same bit patterns in ordinary global memory (__device__), read by
// const_test_gpu_gmem so the two access paths can be timed against each other.
__device__ static  const u32 gmem_data_01 = 0x55555555;
__device__ static  const u32 gmem_data_02 = 0x77777777;
__device__ static  const u32 gmem_data_03 = 0x33333333;
__device__ static  const u32 gmem_data_04 = 0x11111111;

__global__ void const_test_gpu_literal(u32* const data, const u32 num_elements);
__global__ void const_test_gpu_const(u32 *const data, const u32 num_elements);
__global__ void const_test_gpu_gmem(u32 *const data, const u32 num_elements);
__host__ void wait_exit(void);
__host__ void cuda_error_check(const char *prefix, const char *postfix);
__host__ void gpu_const_vs_literal_kernel(void);
__host__ void gpu_const_vs_gmem_kernel(void);

ConstMem.cu


#include "ConstMem.h"

// Benchmark kernel: applies a fixed sequence of bitwise ops KERNEL_LOOP times
// using immediate (literal) operands. One element per thread; the guard keeps
// threads past num_elements from writing out of bounds.
// NOTE(review): the loop result is invariant after the first iteration, so an
// aggressive compiler may collapse it — inspect the generated SASS before
// trusting the timings.
__global__ void const_test_gpu_literal(u32* const data, const u32 num_elements)
{
	const u32 idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= num_elements)
		return;

	u32 value = 0x55555555;
	for (int iter = 0; iter < KERNEL_LOOP; iter++)
	{
		value ^= 0x55555555;
		value |= 0x77777777;
		value &= 0x33333333;
		value |= 0x11111111;
	}
	data[idx] = value;
}


// Benchmark kernel: same bitwise sequence as const_test_gpu_literal, but every
// operand is fetched from __constant__ memory. All lanes of a warp read the
// same address each iteration, which is the broadcast-friendly access pattern
// the constant cache is designed for.
__global__ void const_test_gpu_const(u32 *const data, const u32 num_elements)
{
	const u32 idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= num_elements)
		return;

	u32 value = const_data_01;
	for (int iter = 0; iter < KERNEL_LOOP; iter++)
	{
		value ^= const_data_01;
		value |= const_data_02;
		value &= const_data_03;
		value |= const_data_04;
	}
	data[idx] = value;
}

// Benchmark kernel: same bitwise sequence, operands fetched from read-only
// __device__ globals instead of __constant__ memory, so constant-cache access
// can be compared against the global-memory path.
__global__ void const_test_gpu_gmem(u32 *const data, const u32 num_elements)
{
	const u32 idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= num_elements)
		return;

	u32 value = 0x55555555;
	for (int iter = 0; iter < KERNEL_LOOP; iter++)
	{
		value ^= gmem_data_01;
		value |= gmem_data_02;
		value &= gmem_data_03;
		value |= gmem_data_04;
	}
	data[idx] = value;
}
// Block until the user presses a key so the console window stays open.
// The original stored getch()'s return value in an unused local, which
// triggers set-but-unused warnings; the result is now explicitly discarded.
__host__ void wait_exit(void)
{
	printf("\nPress any key to exit");
	(void)getch();
}

// If a CUDA error is pending (e.g. from a kernel launch), print it framed by
// the caller-supplied prefix/postfix, tear down the device, wait for a key
// press and exit with status 1. No-op when no error is pending.
// cudaPeekAtLastError() inspects without clearing; cudaGetLastError() inside
// the printf then retrieves and clears the same error.
__host__ void cuda_error_check(const char *prefix, const char *postfix)
{
	if (cudaPeekAtLastError() == cudaSuccess)
		return;

	printf("\n%s%s%s", prefix, cudaGetErrorString(cudaGetLastError()), postfix);
	cudaDeviceReset();
	wait_exit();
	exit(1);
}

// Times const_test_gpu_literal against const_test_gpu_const on every visible
// CUDA device, max_runs times per device, printing which path won each run.
// Each kernel gets an untimed warm-up launch; the timed launch is bracketed by
// CUDA events on the default stream, and the blocking-sync stop events keep
// the host CPU from busy-waiting during cudaEventSynchronize.
__host__ void gpu_const_vs_literal_kernel(void)
{
	const u32 num_elements = 128 * 1024;
	const u32 num_threads = 256;
	const u32 num_blocks = (num_elements + num_threads - 1) / num_threads; // ceil-div so every element is covered
	const u32 num_bytes = num_elements * sizeof(u32);
	int max_device_num;
	const int max_runs = 6;

	CUDA_CALL(cudaGetDeviceCount(&max_device_num));
	for (int device_num = 0; device_num < max_device_num; device_num++)
	{
		CUDA_CALL(cudaSetDevice(device_num));
		for (int num_test = 0; num_test < max_runs; num_test++)
		{
			u32 *data_gpu = NULL;
			cudaEvent_t kernel_start1, kernel_stop1;
			cudaEvent_t kernel_start2, kernel_stop2;
			float delta_time1 = 0.0f, delta_time2 = 0.0f;
			struct cudaDeviceProp device_prop;
			char device_prefix[261];
			CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
			CUDA_CALL(cudaEventCreate(&kernel_start1));
			CUDA_CALL(cudaEventCreate(&kernel_start2));
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop1, cudaEventBlockingSync));
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop2, cudaEventBlockingSync));

			CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
			// BUGFIX: device_prop.name may be up to 255 characters, so the original
			// sprintf could overflow the 261-byte buffer; snprintf truncates safely.
			snprintf(device_prefix, sizeof(device_prefix), "ID:%d %s:", device_num, device_prop.name);

			// Literal kernel: warm-up launch (not timed), then the timed launch.
			const_test_gpu_literal<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from literal startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start1, 0));
			const_test_gpu_literal<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from literal runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop1, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop1));
			CUDA_CALL(cudaEventElapsedTime(&delta_time1, kernel_start1, kernel_stop1));

			// Constant-memory kernel: warm-up launch, then the timed launch.
			const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from constant startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start2, 0));
			const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from constant runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop2, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop2));
			CUDA_CALL(cudaEventElapsedTime(&delta_time2, kernel_start2, kernel_stop2));

			// delta_time1 = literal kernel, delta_time2 = constant-memory kernel.
			// BUGFIX: the original passed (delta_time1, delta_time2) to the
			// "(Const=... vs. Literal=...)" format, labeling each measurement as
			// the other kernel; the argument order is corrected below.
			if (delta_time1 > delta_time2)
			{
				printf("\n%s常量内存快了: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, delta_time1 - delta_time2, delta_time2, delta_time1);
			}
			else
			{
				printf("\n%s字面取值快了: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, delta_time2 - delta_time1, delta_time2, delta_time1);
			}
			CUDA_CALL(cudaEventDestroy(kernel_start1));
			CUDA_CALL(cudaEventDestroy(kernel_start2));
			CUDA_CALL(cudaEventDestroy(kernel_stop1));
			CUDA_CALL(cudaEventDestroy(kernel_stop2));
			CUDA_CALL(cudaFree(data_gpu));

		}/* end of num_test */
		CUDA_CALL(cudaDeviceReset());
		printf("\n");
	}/* end of device_num */
}
// Times const_test_gpu_const against const_test_gpu_gmem on every visible CUDA
// device, max_runs times per device, printing which path won each run.
// Same structure as gpu_const_vs_literal_kernel: untimed warm-up launch, then
// a timed launch bracketed by CUDA events with blocking-sync stop events.
__host__ void gpu_const_vs_gmem_kernel(void)
{
	const u32 num_elements = 128 * 1024;
	const u32 num_threads = 256;
	const u32 num_blocks = (num_elements + num_threads - 1) / num_threads; // ceil-div so every element is covered
	const u32 num_bytes = num_elements * sizeof(u32);
	int max_device_num;
	const int max_runs = 6;

	CUDA_CALL(cudaGetDeviceCount(&max_device_num));
	for (int device_num = 0; device_num < max_device_num; device_num++)
	{
		CUDA_CALL(cudaSetDevice(device_num));
		for (int num_test = 0; num_test < max_runs; num_test++)
		{
			u32 *data_gpu = NULL;
			cudaEvent_t kernel_start1, kernel_stop1;
			cudaEvent_t kernel_start2, kernel_stop2;
			float delta_time1 = 0.0f, delta_time2 = 0.0f;
			struct cudaDeviceProp device_prop;
			char device_prefix[261];
			CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
			CUDA_CALL(cudaEventCreate(&kernel_start1));
			CUDA_CALL(cudaEventCreate(&kernel_start2));
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop1, cudaEventBlockingSync));
			CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop2, cudaEventBlockingSync));

			CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
			// BUGFIX: device_prop.name may be up to 255 characters, so the original
			// sprintf could overflow the 261-byte buffer; snprintf truncates safely.
			snprintf(device_prefix, sizeof(device_prefix), "ID:%d %s:", device_num, device_prop.name);

			// Constant-memory kernel: warm-up launch (not timed), then the timed launch.
			const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from constant startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start1, 0));
			const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from constant runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop1, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop1));
			CUDA_CALL(cudaEventElapsedTime(&delta_time1, kernel_start1, kernel_stop1));

			// Global-memory kernel: warm-up launch, then the timed launch.
			const_test_gpu_gmem<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from gmem startup kernel");
			CUDA_CALL(cudaEventRecord(kernel_start2, 0));
			const_test_gpu_gmem<<<num_blocks, num_threads>>>(data_gpu, num_elements);
			cuda_error_check("Error ", " returned from gmem runtime kernel");
			CUDA_CALL(cudaEventRecord(kernel_stop2, 0));
			CUDA_CALL(cudaEventSynchronize(kernel_stop2));
			CUDA_CALL(cudaEventElapsedTime(&delta_time2, kernel_start2, kernel_stop2));

			// delta_time1 = constant-memory kernel, delta_time2 = global-memory kernel.
			// BUGFIX: the original verdict was inverted — when delta_time1 (const)
			// is LARGER, global memory was the faster path, yet it printed
			// "常量内存快了". The branches below report the true winner.
			if (delta_time1 > delta_time2)
			{
				printf("\n%s全局内存快了: %.2fms (Const=%.2fms vs. Gmem=%.2fms)", device_prefix, delta_time1 - delta_time2, delta_time1, delta_time2);
			}
			else
			{
				printf("\n%s常量内存快了: %.2fms (Const=%.2fms vs. Gmem=%.2fms)", device_prefix, delta_time2 - delta_time1, delta_time1, delta_time2);
			}
			CUDA_CALL(cudaEventDestroy(kernel_start1));
			CUDA_CALL(cudaEventDestroy(kernel_start2));
			CUDA_CALL(cudaEventDestroy(kernel_stop1));
			CUDA_CALL(cudaEventDestroy(kernel_stop2));
			CUDA_CALL(cudaFree(data_gpu));

		}/* end of num_test */
		CUDA_CALL(cudaDeviceReset());
		printf("\n");
	}/* end of device_num */
}

ConstMem_main.cpp

#include "ConstMem.h"
// Entry point: prints host/device info (helpers from Global.h), then runs the
// literal-vs-constant and constant-vs-global benchmark suites.
int main()
{
	getCpuInfo();
	getGpuInfo();
	// BUGFIX: "常亮" (always-lit) was a typo for "常量" (constant).
	printf("\n常量内存和字面值u32位操作速度对比:\n");
	gpu_const_vs_literal_kernel();
	// Heading for the second suite was missing, making its output run into the first.
	printf("\n常量内存和全局内存u32位操作速度对比:\n");
	gpu_const_vs_gmem_kernel();
	return 0;
}

测试结果

可以看出, 在较新的 GPU 架构上, 只读的全局内存借助一级缓存也能达到与常量内存相近的访问速度.
在这里插入图片描述

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值