CUDA Programming: Using Global Memory, Shared Memory, and Dynamic Shared Memory

// error.cuh
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)
// reduce2gpu.cu
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include "error.cuh"
#include <stdio.h>
#include <cmath>

const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(float) * N;
const int BLOCK_SIZE = 128;


void timing(float* h_x, float* d_x, const int method);


int main()
{
	float* h_x = (float*)malloc(M);
	for (int n = 0; n < N; ++n) {
		h_x[n] = 1.23;
	}

	float* d_x;
	CHECK(cudaMalloc(&d_x, M));

	printf("\nUsing global memory only:\n");
	timing(h_x, d_x, 0);
	printf("\nUsing static shared memory:\n");
	timing(h_x, d_x, 1);
	printf("\nUsing static shared memory:\n");
	timing(h_x, d_x, 2);

	free(h_x);
	CHECK(cudaFree(d_x));
	return 0;
}

/*
* Each block reduces its BLOCK_SIZE-element chunk to a single value in place,
* writing one partial sum per block; the partial sums are then added up on the host.
*/
void __global__ reduce_global(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	float* x = d_x + blockDim.x * blockIdx.x;   // x points at this block's chunk of d_x (blockIdx.x selects the block within the grid)

	// Binary tree reduction: each step folds the upper half of the active range into the lower half.
	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			x[tid] += x[tid + offset];
		}

		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[blockIdx.x] = x[0];
	}

}


void __global__ reduce_shared(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int n = bid * blockDim.x + tid;

	__shared__ float s_y[128];                  // static shared memory, one element per thread in the block
	s_y[tid] = (n < N) ? d_x[n] : 0.0f;         // stage this block's chunk of d_x into shared memory
	__syncthreads();

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			s_y[tid] += s_y[tid + offset];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[bid] = s_y[0];
	}
}


void __global__ reduce_dynamic(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int n = bid * blockDim.x + tid;
	extern __shared__ float s_y[];              // dynamic shared memory; its size is given by the third launch parameter
	s_y[tid] = (n < N) ? d_x[n] : 0.0f;
	__syncthreads();

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{

		if (tid < offset)
		{
			s_y[tid] += s_y[tid + offset];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[bid] = s_y[0];
	}
}


float reduce(float* d_x, const int method)
{
	int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
	const int ymem = sizeof(float) * grid_size;
	const int smem = sizeof(float) * BLOCK_SIZE;

	float* d_y;

	CHECK(cudaMalloc(&d_y, ymem));

	float* h_y = (float*)malloc(ymem);

	switch (method)
	{
	case 0:
		reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
		break;
	case 1:
		reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
		break;
	case 2:
		reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
		break;
	default:
		printf("Error: wrong method\n");
		exit(1);
		break;
	}

	CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

	float result = 0.;
	for (int n = 0; n < grid_size; ++n)
	{
		result += h_y[n];
	}

	free(h_y);
	CHECK(cudaFree(d_y));
	return result;

}

void timing(float* h_x, float* d_x, const int method)
{
	float sum = 0;

	for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
	{
		CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

		cudaEvent_t start, stop;
		CHECK(cudaEventCreate(&start));
		CHECK(cudaEventCreate(&stop));
		CHECK(cudaEventRecord(start));
		cudaEventQuery(start);   // not wrapped in CHECK: cudaErrorNotReady is an expected return value here

		sum = reduce(d_x, method);

		CHECK(cudaEventRecord(stop));
		CHECK(cudaEventSynchronize(stop));
		float elapsed_time;
		CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
		printf("Time = %g ms.\n", elapsed_time);

		CHECK(cudaEventDestroy(start));
		CHECK(cudaEventDestroy(stop));
	}

	printf("sum = %f.\n", sum);
}

A brief walkthrough of the code:
An array of 100000000 floats is allocated and filled with 1.23.
The data is split into ceil(100000000 / 128) chunks (exactly 781250 here, since 781250 * 128 == 100000000); each block reduces its chunk to a single partial sum, and the per-block partial sums are then added up on the host.
All three kernels follow this same two-stage scheme; they differ only in whether the per-block reduction is done in global memory, static shared memory, or dynamic shared memory.
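
As a point of reference, the same two-stage scheme can be written entirely on the host. The following is only an illustrative sketch (it is not part of the CUDA program above); it mirrors what one thread block does in stage 1 and what the host loop in reduce() does in stage 2:

// two-stage host-only sketch of the reduction scheme (illustrative)
#include <cstdio>
#include <vector>

int main()
{
	const int N = 100000000;
	const int BLOCK_SIZE = 128;
	const int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;   // 781250, and 781250 * 128 == N exactly

	std::vector<float> x(N, 1.23f);
	std::vector<float> partial(grid_size, 0.0f);

	// Stage 1: each BLOCK_SIZE-element chunk collapses to one partial sum
	// (on the GPU, one thread block produces one of these values).
	for (int b = 0; b < grid_size; ++b)
		for (int i = 0; i < BLOCK_SIZE; ++i)
			partial[b] += x[b * BLOCK_SIZE + i];

	// Stage 2: add up the per-chunk partial sums (done on the host in the CUDA version as well).
	float sum = 0.0f;
	for (int b = 0; b < grid_size; ++b)
		sum += partial[b];

	printf("sum = %f\n", sum);
	return 0;
}

The listing below extends the program with a fourth variant that uses an atomicAdd at the end of each block: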

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include "error.cuh"
#include <stdio.h>
#include <cmath>

const int NUM_REPEATS = 1;
const int N = 100000000;
const int M = sizeof(float) * N;
const int BLOCK_SIZE = 128;


void timing(float* h_x, float* d_x, const int method);


int main()
{
	float* h_x = (float*)malloc(M);
	for (int n = 0; n < N; ++n) {
		h_x[n] = 1.23;
	}

	float* d_x;
	CHECK(cudaMalloc(&d_x, M));

	/*printf("\nUsing global memory only:\n");
	timing(h_x, d_x, 0);
	printf("\nUsing static shared memory:\n");
	timing(h_x, d_x, 1);
	printf("\nUsing static shared memory:\n");
	timing(h_x, d_x, 2);*/
	printf("\nUsing atomicAdd operation:\n");
	timing(h_x, d_x, 3);

	free(h_x);
	CHECK(cudaFree(d_x));
	return 0;
}

/*
* Each block reduces its BLOCK_SIZE-element chunk to a single value in place,
* writing one partial sum per block; the partial sums are then added up on the host.
*/
void __global__ reduce_global(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	float* x = d_x + blockDim.x * blockIdx.x;   // x points at this block's chunk of d_x (blockIdx.x selects the block within the grid)

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			x[tid] += x[tid + offset];
		}

		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[blockIdx.x] = x[0];
	}

}


void __global__ reduce_shared(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int n = bid * blockDim.x + tid;

	__shared__ float s_y[128];
	s_y[tid] = (n < N) ? d_x[n] : 0.;
	__syncthreads();

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			s_y[tid] += s_y[tid + offset];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[bid] = s_y[0];
	}
}


void __global__ reduce_dynamic(float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int n = bid * blockDim.x + tid;
	extern __shared__ float s_y[];
	s_y[tid] = (n < N) ? d_x[n] : 0.0;
	__syncthreads();

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{

		if (tid < offset)
		{
			s_y[tid] += s_y[tid + offset];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		d_y[bid] = s_y[0];
	}
}

void __global__ reduce_atomic(const float* d_x, float* d_y)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int n = bid * blockDim.x + threadIdx.x;
	__shared__ float s_y[128];
	s_y[tid] = (n < N) ? d_x[n] : 0.0;
	__syncthreads();

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			s_y[tid] += s_y[tid + offset];
		}

		__syncthreads();
	}

	if (tid == 0)
	{
		atomicAdd(d_y, s_y[0]);   // every block atomically adds its partial sum into d_y[0]
	}
}

float reduce(float* d_x, const int method)
{
	int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
	const int ymem = sizeof(float) * grid_size;
	const int smem = sizeof(float) * BLOCK_SIZE;

	float* d_y;

	CHECK(cudaMalloc(&d_y, ymem));
	CHECK(cudaMemset(d_y, 0, ymem));   // zero d_y so the atomicAdd variant accumulates from 0

	float* h_y = (float*)malloc(ymem);

	switch (method)
	{
	case 0:
		reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
		break;
	case 1:
		reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
		break;
	case 2:
		reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
		break;
	case 3:
		reduce_atomic<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
		break;
	default:
		printf("Error: wrong method\n");
		exit(1);
		break;
	}

	CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

	float result = 0.;
	for (int n = 0; n < grid_size; ++n)
	{
		result += h_y[n];
		if (n < 10)
			printf("block %d, partial value: %f\n", n, h_y[n]);
	}

	free(h_y);
	CHECK(cudaFree(d_y));
	return result;

}

void timing(float* h_x, float* d_x, const int method)
{
	float sum = 0;

	for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
	{
		CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

		cudaEvent_t start, stop;
		CHECK(cudaEventCreate(&start));
		CHECK(cudaEventCreate(&stop));
		CHECK(cudaEventRecord(start));
		cudaEventQuery(start);   // not wrapped in CHECK: cudaErrorNotReady is an expected return value here

		sum = reduce(d_x, method);

		CHECK(cudaEventRecord(stop));
		CHECK(cudaEventSynchronize(stop));
		float elapsed_time;
		CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
		printf("Time = %g ms.\n", elapsed_time);

		CHECK(cudaEventDestroy(start));
		CHECK(cudaEventDestroy(stop));
	}

	printf("sum = %f.\n", sum);
}

Here an atomic operation is added.
By printing the first few entries of d_y we can see where the result ends up: with atomicAdd, every block adds its partial sum into d_y[0] only, so the complete sum is accumulated in that single element.
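
Since only d_y[0] carries the result, the atomic variant can also be driven with a one-element output, so no host-side pass over per-block partial sums is needed at all. The following is only a sketch that reuses the reduce_atomic kernel, the CHECK macro, and the N and BLOCK_SIZE constants defined above (it is not the reduce() function from the listing):

// Sketch: driving reduce_atomic with a one-element accumulator (illustrative)
float reduce_atomic_only(const float* d_x)
{
	const int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

	float h_sum = 0.0f;
	float* d_sum;
	CHECK(cudaMalloc(&d_sum, sizeof(float)));
	CHECK(cudaMemcpy(d_sum, &h_sum, sizeof(float), cudaMemcpyHostToDevice));   // start the accumulator at 0

	// Every block reduces its chunk in shared memory and then atomicAdds its partial sum into d_sum[0].
	reduce_atomic<<<grid_size, BLOCK_SIZE>>>(d_x, d_sum);

	CHECK(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost));
	CHECK(cudaFree(d_sum));
	return h_sum;
}

With this layout only one device float is allocated, which matches the printout above showing that the whole sum lands in d_y[0].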
