// cuda编程入门示例22

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
// Abort the program with a diagnostic when a CUDA runtime call fails.
// Invoked through the HANDLE_ERROR macro so the message carries the
// call site's file and line number.
static void HandleError(cudaError_t err, const char *file, int line)
{
	if (err != cudaSuccess)
	{
		// Diagnostics belong on stderr (the rest of the file already
		// reports errors with fprintf(stderr, ...)).
		fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
		exit(EXIT_FAILURE);
	}
}
// Wrap every CUDA runtime call: HANDLE_ERROR(cudaMalloc(...));
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Abort when a host-side allocation evaluates to NULL, reporting the
// file and line of the failed allocation.
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
	printf("Host memory failed in %s at line %d\n", \
	__FILE__, __LINE__); \
	exit(EXIT_FAILURE); }}


// Select the first CUDA device reporting compute capability >= 1.0 and
// make it the current device. Prints the chosen device's name on success.
// Returns false (with a message on stderr) when no usable device exists.
static bool InitCUDA()
{
	int count = 0;

	// Check the query itself: a failure here (e.g. no driver) must not be
	// silently treated as "count devices found".
	if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
	{
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	int i;
	cudaDeviceProp prop = {0};

	// Scan for the first device with compute capability 1.x or newer.
	for (i = 0; i < count; i++)
	{
		if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
		{
			if (prop.major >= 1)
			{
				printf("%s\n", prop.name);
				break;
			}
		}
	}

	if (i >= count)
	{
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return false;
	}

	// Bind the calling host thread to the selected device; abort on failure.
	HANDLE_ERROR(cudaSetDevice(i));

	return true;
}

// Fill the n x n matrix `a` (row pitch `lda` elements) with pseudo-random
// values in roughly [0, 1]: a coarse term from one rand() call plus a tiny
// second term that supplies extra low-order bits.
static void matgen(float* a, int lda, int n)
{
	int i, j;

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			// BUGFIX: the original divided by RAND_MAX * RAND_MAX, which
			// overflows int (undefined behavior) when RAND_MAX == INT_MAX.
			// Perform the product in double instead.
			a[i * lda + j] = (float)rand() / RAND_MAX
			               + (float)rand() / ((double)RAND_MAX * RAND_MAX);
		}
	}
}

// Reference (CPU) dense matrix multiply: C = A * B for n x n matrices.
// lda/ldb/ldc are the row pitches of a/b/c in elements. Accumulation is
// done in double so this serves as the accuracy baseline for the GPU path.
static  void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	for (int row = 0; row < n; row++)
	{
		for (int col = 0; col < n; col++)
		{
			double acc = 0;
			for (int k = 0; k < n; k++)
			{
				acc += a[row * lda + k] * b[k * ldb + col];
			}

			c[row * ldc + col] = (float)acc;
		}
	}
}

// Compare matrix `a` against reference matrix `b` (both n x n, row pitches
// lda/ldb in elements) and print the maximum absolute error, maximum
// relative error, and average relative error. Entries where the reference
// is exactly zero are skipped (relative error undefined there).
static void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
	float max_err = 0;          // largest relative error seen
	float average_err = 0;      // running sum of relative errors
	float max_absolute_err = 0; // largest absolute difference seen

	for (int row = 0; row < n; row++)
	{
		for (int col = 0; col < n; col++)
		{
			float ref = b[row * ldb + col];
			if (ref != 0)
			{
				float diff = fabs(a[row * lda + col] - ref);
				if (diff > max_absolute_err)
				{
					max_absolute_err = diff;
				}

				float rel = fabs(diff / ref);
				if (rel > max_err)
				{
					max_err = rel;
				}

				average_err += rel;
			}
		}
	}

	printf("Max absolute error: %g\nMax error: %g\nAverage error: %g\n", \
			max_absolute_err, max_err, average_err / (n * n));
}

// Kernel: C = A * B, one output row per block. Row blockIdx.x of A is
// staged into dynamic shared memory (n floats, supplied as the 3rd launch
// parameter), and each thread produces entries of that row of C with a
// blockDim.x stride. Uses Kahan's summation formula to improve accuracy.
// lda/ldb/ldc are row pitches in ELEMENTS (device pitch bytes / sizeof(float)).
__global__ static void matMultCUDA(const float* dev_a, size_t lda, const float* dev_b, size_t ldb, float* dev_c, size_t ldc, int n)
{
	extern __shared__ float data[];  // one row of dev_a (n floats)
	const int tid = threadIdx.x;
	const int row = blockIdx.x;      // one block per output row
	int i, j;

	// Cooperatively load row `row` of A into shared memory.
	for (i = tid; i < n; i += blockDim.x)
	{
		data[i] = dev_a[row * lda + i];
	}
	__syncthreads();  // row must be fully staged before any thread reads it

	// Each thread computes C[row][j] for j = tid, tid + blockDim.x, ...
	for (j = tid; j < n; j += blockDim.x)
	{
		float t = 0;  // running (compensated) sum
		float y = 0;  // Kahan compensation term (sign-flipped variant)

		for (i = 0; i < n; i++)
		{
			// Kahan compensated summation: the exact order of these four
			// statements is what recovers the rounding error — do not
			// reorder or algebraically simplify them.
			float r;
			y -= data[i] * dev_b[i * ldb + j];
			r = t - y;
			y = (r - t) + y;
			t = r;
		}

		dev_c[row * ldc + j] = t;
	}
}

// Host wrapper: multiply n x n matrices on the GPU, C = A * B.
// Host pitches lda/ldb/ldc are in elements. Device buffers come from
// cudaMallocPitch so each row is aligned for efficient global-memory
// access; the element pitch (pitch bytes / sizeof(float)) is forwarded
// to the kernel. Prints the kernel-only time measured with CUDA events
// and returns the clock() ticks spent in this whole function (including
// allocation and transfers).
static clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	const int thread_num = 256;
	float *dev_a, *dev_b, *dev_c;
	clock_t time;
	size_t pitch_a, pitch_b, pitch_c;
	cudaEvent_t     start, stop;
	float           elapsedTime;

	time = clock();
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));

	// Pitched allocations keep every row aligned in device memory.
	HANDLE_ERROR(cudaMallocPitch((void**)&dev_a, &pitch_a, sizeof(float)* n, n));
	HANDLE_ERROR(cudaMallocPitch((void**)&dev_b, &pitch_b, sizeof(float)* n, n));
	HANDLE_ERROR(cudaMallocPitch((void**)&dev_c, &pitch_c, sizeof(float)* n, n));

	HANDLE_ERROR(cudaMemcpy2D(dev_a, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice));
	HANDLE_ERROR(cudaMemcpy2D(dev_b, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice));

	// One block per output row; the kernel needs n floats of dynamic
	// shared memory to stage one row of A.
	int blocks = n;

	HANDLE_ERROR(cudaEventRecord(start, 0));
	matMultCUDA << <blocks, thread_num, n * sizeof(float) >> >(dev_a, pitch_a / sizeof(float), dev_b, pitch_b / sizeof(float), dev_c, pitch_c / sizeof(float), n);
	// Launches return no status directly; surface bad-config errors here.
	HANDLE_ERROR(cudaGetLastError());
	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf("核函数执行时间:%lfms\n", elapsedTime);

	HANDLE_ERROR(cudaMemcpy2D(c, sizeof(float)* ldc, dev_c, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost));

	// Release device resources (the original leaked the two events and
	// ignored the cudaFree return codes).
	HANDLE_ERROR(cudaEventDestroy(start));
	HANDLE_ERROR(cudaEventDestroy(stop));
	HANDLE_ERROR(cudaFree(dev_a));
	HANDLE_ERROR(cudaFree(dev_b));
	HANDLE_ERROR(cudaFree(dev_c));

	return clock() - time;
}

// Driver: generate two random 512 x 512 matrices, multiply them on the GPU
// and on the CPU, compare the two results, and report timing and GFLOPS.
int main(int argc, char *argv[])
{
	const int n = 64 * 4 * 2;  // matrix dimension (512)
	float *a, *b, *c, *d;

	// BUGFIX: InitCUDA was defined but never called, so the program ran
	// device work without verifying a CUDA device exists.
	if (!InitCUDA())
	{
		return EXIT_FAILURE;
	}

	HANDLE_NULL(a = (float*)malloc(sizeof(float)* n * n));  // input A
	HANDLE_NULL(b = (float*)malloc(sizeof(float)* n * n));  // input B
	HANDLE_NULL(c = (float*)malloc(sizeof(float)* n * n));  // GPU result
	HANDLE_NULL(d = (float*)malloc(sizeof(float)* n * n));  // CPU reference

	srand(0);  // fixed seed: reproducible inputs across runs
	matgen(a, n, n);
	matgen(b, n, n);

	printf("利用 Kahan's Summation Formula 来提高精确度\n");
	printf("显存地址自动对齐,可以提高访问显存的效率\n");

	clock_t time = matmultCUDA(a, n, b, n, c, n, n);  // GPU multiply (timed)
	matmult(a, n, b, n, d, n, n);                     // CPU reference
	compare_mat(c, n, d, n, n);                       // accuracy report

	// An n x n matrix multiply performs 2*n^3 floating-point operations.
	double sec = (double)time / CLOCKS_PER_SEC;
	printf("Time used: %.6fs (%.6lf GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

	free(a);
	free(b);
	free(c);
	free(d);

	//remember to release the device
	cudaDeviceReset();

	return 0;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
CUDA(Compute Unified Device Architecture)是NVIDIA推出的并行计算平台和API模型,用于利用GPU的计算能力。以下是一个CUDA快速入门代码的示例: 代码示例: #include <cuda_runtime.h> #include <stdio.h> // CUDA核函数,执行在GPU上 __global__ void cudaHelloWorld() { printf("Hello World from GPU!\n"); } int main() { // 调用CUDA异步执行的配置 cudaStream_t stream; cudaStreamCreate(&stream); // 定义核函数的执行配置 dim3 block(1, 1); dim3 grid(1, 1); // 在GPU上调用核函数 cudaHelloWorld<<<grid, block, 0, stream>>>(); // 同步GPU,等待核函数执行完成 cudaStreamSynchronize(stream); // 销毁CUDA流 cudaStreamDestroy(stream); // 输出CPU上的信息 printf("Hello World from CPU!\n"); return 0; } 该示例代码中,我们在主函数中调用CUDA核函数cudaHelloWorld,并在GPU上并行执行。核函数使用__global__修饰符标记,表明它将在GPU上执行。在主函数中,我们首先使用cudaStreamCreate函数创建一个CUDA流,用于异步执行核函数。然后,我们定义了核函数的执行配置,即指定了需要启动的线程块数量和线程块中的线程数量。在调用核函数时,我们传递了执行配置、流对象和其他参数。接着,我们使用cudaStreamSynchronize函数来等待GPU上的核函数执行完成,以确保输出的正确顺序。最后,我们使用printf函数输出来自CPU和GPU的信息。 这个示例代码展示了如何使用CUDA快速入门,并在GPU上进行并行计算。通过学习和掌握CUDA编程,开发者可以充分利用GPU的并行计算能力,加速各种科学计算和计算密集型任务的执行。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值