CUDA 常量内存,头文件不知道有什么鬼问题



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "device_atomic_functions.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include "crt\func_macro.h"
#include "assert.h"
#include "conio.h"
using namespace std;


/*
 * CUDA_CALL(x): evaluates the CUDA runtime call `x` exactly once. If the
 * result is anything other than cudaSuccess, prints the error string and
 * numeric code, resets the device and terminates the program.
 *
 * - Wrapped in do { ... } while (0) so that `CUDA_CALL(...);` is a single
 *   statement and stays correct inside an un-braced if/else.
 * - assert(0) is compiled out when NDEBUG is defined, so exit(1) follows it
 *   to make sure execution never continues past a failed call.
 * - The local uses a long, macro-specific name so it cannot shadow a
 *   variable that appears in the expression `x`.
 */
#define CUDA_CALL(x)                                                        \
    do {                                                                    \
        const cudaError_t cuda_call_err_ = (x);                             \
        if (cuda_call_err_ != cudaSuccess) {                                \
            printf("\nCUDAError:%s(err_num= %d) \n",                        \
                   cudaGetErrorString(cuda_call_err_), (int)cuda_call_err_); \
            cudaDeviceReset();                                              \
            assert(0);                                                      \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

/* Number of iterations of the bit-operation loop executed by each kernel thread. */
#define KERNEL_LOOP 65536

/*
 * Operands read by const_test_gpu_const, declared with the __constant__
 * qualifier. They carry the same values, in the same order, as the literals
 * used in const_test_gpu_literal so the two kernels perform the same work.
 */
__constant__ static const int const_data_01 = 0x55555555;
__constant__ static const int const_data_02 = 0x77777777;
__constant__ static const int const_data_03 = 0x33333333;
__constant__ static const int const_data_04 = 0x11111111;
/*
 * Benchmark kernel, literal variant.
 *
 * Each thread whose global index is below num_elements runs KERNEL_LOOP
 * rounds of XOR / OR / AND / OR against hard-coded literal operands and
 * stores the final value in data[index].
 *
 * data         - output buffer with at least num_elements ints
 * num_elements - number of valid entries in data
 *
 * Uses a 1D index built from blockIdx.x, blockDim.x and threadIdx.x.
 */
__global__ void const_test_gpu_literal(int * const data, const int num_elements)
{
    const int idx = threadIdx.x + (blockIdx.x * blockDim.x);

    /* Threads that fall past the end of the buffer have nothing to do. */
    if (idx >= num_elements)
    {
        return;
    }

    int acc = 0x55555555;
    int pass = 0;
    while (pass < KERNEL_LOOP)
    {
        acc = acc ^ 0x55555555;
        acc = acc | 0x77777777;
        acc = acc & 0x33333333;
        acc = acc | 0x11111111;
        ++pass;
    }

    data[idx] = acc;
}
/*
 * Benchmark kernel, __constant__ variant.
 *
 * Each thread whose global index is below num_elements runs KERNEL_LOOP
 * rounds of XOR / OR / AND / OR, taking its operands from const_data_01
 * through const_data_04, and stores the final value in data[index].
 *
 * data         - output buffer with at least num_elements ints
 * num_elements - number of valid entries in data
 *
 * Uses a 1D index built from blockIdx.x, blockDim.x and threadIdx.x.
 */
__global__ void const_test_gpu_const(int * const data, const int num_elements)
{
    const int idx = threadIdx.x + (blockIdx.x * blockDim.x);

    /* Threads that fall past the end of the buffer have nothing to do. */
    if (idx >= num_elements)
    {
        return;
    }

    int acc = const_data_01;
    int pass = 0;
    while (pass < KERNEL_LOOP)
    {
        acc = acc ^ const_data_01;
        acc = acc | const_data_02;
        acc = acc & const_data_03;
        acc = acc | const_data_04;
        ++pass;
    }

    data[idx] = acc;
}
/*
 * Prints an exit prompt and blocks until a key is pressed.
 *
 * The prompt has no trailing newline, so stdout is flushed explicitly to make
 * sure it is visible before the call blocks in getch(). The key that was
 * pressed is not needed and is discarded.
 */
__host__ void wait_exit(void)
{
    printf("\nPress any key to exit");
    fflush(stdout);
    (void)getch();
}
/*
 * Checks the CUDA runtime's last-error state.
 *
 * If no error is recorded the function simply returns. Otherwise it prints
 * prefix, the error string and postfix (in that order), resets the device,
 * waits for a key press via wait_exit() and terminates with exit code 1.
 *
 * prefix  - text printed before the error string
 * postfix - text printed after the error string
 */
__host__ void cuda_error_check(
    const char * prefix,
    const char * postfix)
{
    /* cudaGetLastError() returns the recorded error and clears it. */
    const cudaError_t last_error = cudaGetLastError();
    if (last_error == cudaSuccess)
    {
        return;
    }

    printf("\n%s%s%s", prefix, cudaGetErrorString(last_error), postfix);
    cudaDeviceReset();
    wait_exit();
    exit(1);
}
/*
 * Times const_test_gpu_literal against const_test_gpu_const on every device
 * reported by cudaGetDeviceCount and prints which of the two was faster.
 *
 * For each device, max_runs measurements are taken. A measurement launches
 * each kernel once untimed and then once more between a pair of events
 * recorded on stream 0; cudaEventElapsedTime between that pair is the figure
 * that gets reported. Any CUDA failure terminates the process through
 * CUDA_CALL or cuda_error_check. Blocks in wait_exit() before returning.
 */
__host__ void gpu_kernel(void)
{
    const int num_elements = (128 * 1024);
    const int num_threads = 256;
    const int num_blocks = (num_elements + (num_threads - 1)) / num_threads;
    const size_t num_bytes = num_elements * sizeof(int);
    const int max_runs = 6;
    int max_device_num = 0;

    CUDA_CALL(cudaGetDeviceCount(&max_device_num));
    for (int device_num = 0; device_num < max_device_num; device_num++)
    {
        CUDA_CALL(cudaSetDevice(device_num));

        /* The device name does not change between runs, so build the output
         * prefix once per device. The buffer is sized from the name field
         * itself plus room for "ID:", the device number, ' ', ':' and NUL. */
        struct cudaDeviceProp device_prop;
        char device_prefix[sizeof(device_prop.name) + 32];
        CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
        sprintf(device_prefix, "ID:%d %s:", device_num, device_prop.name);

        for (int num_test = 0; num_test < max_runs; num_test++)
        {
            int *data_gpu = NULL;
            cudaEvent_t literal_start, literal_stop;
            cudaEvent_t const_start, const_stop;
            float literal_ms = 0.0F, const_ms = 0.0F;

            CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
            CUDA_CALL(cudaEventCreate(&literal_start));
            CUDA_CALL(cudaEventCreate(&const_start));
            CUDA_CALL(cudaEventCreateWithFlags(&literal_stop, cudaEventBlockingSync));
            CUDA_CALL(cudaEventCreateWithFlags(&const_stop, cudaEventBlockingSync));

            /* Literal kernel: one untimed launch, then the timed launch. */
            const_test_gpu_literal<<<num_blocks, num_threads>>>(data_gpu, num_elements);
            cuda_error_check("Error ", "return from literal starup kernel");
            CUDA_CALL(cudaEventRecord(literal_start, 0));

            const_test_gpu_literal<<<num_blocks, num_threads>>>(data_gpu, num_elements);
            cuda_error_check("Error ", "return from literal runtime kernel");
            CUDA_CALL(cudaEventRecord(literal_stop, 0));
            CUDA_CALL(cudaEventSynchronize(literal_stop));
            CUDA_CALL(cudaEventElapsedTime(&literal_ms, literal_start, literal_stop));

            /* Constant kernel: same sequence, but launching the kernel that
             * reads its operands from the __constant__ variables. */
            const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
            cuda_error_check("Error ", "return from constant starup kernel");
            CUDA_CALL(cudaEventRecord(const_start, 0));

            const_test_gpu_const<<<num_blocks, num_threads>>>(data_gpu, num_elements);
            cuda_error_check("Error ", "return from constant runtime kernel");
            CUDA_CALL(cudaEventRecord(const_stop, 0));
            CUDA_CALL(cudaEventSynchronize(const_stop));
            CUDA_CALL(cudaEventElapsedTime(&const_ms, const_start, const_stop));

            /* Report the difference (a subtraction, not an assignment) and
             * pass the timings in the order the "Const=" / "Literal=" labels
             * expect. */
            if (literal_ms > const_ms)
            {
                printf("\n%sConst version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, literal_ms - const_ms, const_ms, literal_ms);
            }
            else
            {
                printf("\n%sLiteral version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)", device_prefix, const_ms - literal_ms, const_ms, literal_ms);
            }

            CUDA_CALL(cudaEventDestroy(literal_start));
            CUDA_CALL(cudaEventDestroy(const_start));
            CUDA_CALL(cudaEventDestroy(literal_stop));
            CUDA_CALL(cudaEventDestroy(const_stop));
            /* Release the buffer allocated for this run. */
            CUDA_CALL(cudaFree(data_gpu));
        }
        CUDA_CALL(cudaDeviceReset());
        printf("\n");
    }
    wait_exit();
}


/*
 * Program entry point: runs the benchmark driver and reports success to the
 * caller.
 */
int main()
{
    gpu_kernel();

    return 0;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

RtZero

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值