一、单独的.cu文件
方法一:新建CUDA Runtime项目
方法二:
新建空项目,调试选64位
配置属性表(安装目录下include、lib、lib中所有的库,以及C盘隐藏目录ProgramData中Nvidia Corporation 中 CUDA Samples 的 inc 和 lib)
右键项目生成依赖项–CUDA11.0
右键项目属性 – CUDA C/C++ – Target Machine 改为64位
新建CUDA C/C++ FILE(这时右键看.cu文件属性,项类型应为CUDA C/C++, Target Machine Platform应为64位,无需更改),在开头include必要头文件:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
例子:main.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Element-wise square kernel: thread i reads d_in[i] and writes its square
// to d_out[i]. Expects a 1-D launch with one thread per element; only
// threadIdx.x is used, so it assumes a single block covers the whole array.
__global__ void square(float* d_out, float* d_in)
{
    const int i = threadIdx.x;
    d_out[i] = d_in[i] * d_in[i];
}
// Checks the result of a CUDA runtime call; on failure prints the error with
// its source location and aborts main with a non-zero exit code. Kernel
// launches themselves return nothing, so launch errors are picked up via
// cudaGetLastError() right after the <<< >>> statement.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Host entry point: fills an array with 0..63 on the host, squares it on the
// GPU with one thread per element, copies the result back and prints it.
int main()
{
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; ++i)
    {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Declare GPU memory pointers (initialized so a failed cudaMalloc
    // cannot leave them dangling) and allocate device memory.
    float* d_in = NULL;
    float* d_out = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&d_out, ARRAY_BYTES));

    // Transfer the input array to the GPU.
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // Launch one block of ARRAY_SIZE threads; each thread squares one element.
    square << <1, ARRAY_SIZE >> > (d_out, d_in);
    CUDA_CHECK(cudaGetLastError()); // catches invalid launch configurations

    // Copy the result back to the CPU. cudaMemcpy is blocking, so this also
    // synchronizes with the kernel and surfaces any in-kernel fault.
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // Print the resulting array, four values per line.
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Free the GPU allocations.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    return 0;
}
二、.cu和.cpp混合
为现有的cpp项目配置CUDA属性表
右键项目生成依赖项–CUDA11.0
右键项目属性 – CUDA C/C++ – Target Machine 改为64位
添加CUDA C/C++文件(这时右键看.cu文件属性,项类型应为CUDA C/C++, Target Machine Platform应为64位,无需更改)
在.cpp和.cu中分别include必要头文件:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
对于 .cpp 直接调用的 .cu 中的函数,在 .cu 中该函数定义前一行写 extern "C",并在 .cpp 中给出该函数的完整声明(同样带 extern "C")。此外还要注意,.cpp 中 function<<<A,B>>>(arg1,arg2) 是无效写法,应通过封装的方式,将 kernel 调用写在 .cu 中
例子:
kernal.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// One-thread-per-element squaring kernel: lane `tid` squares d_in[tid] into
// d_out[tid]. Indexing uses threadIdx.x only, so the caller is expected to
// launch a single block whose thread count equals the array length.
__global__ void square(float* d_out, float* d_in)
{
    const int tid = threadIdx.x;
    const float v = d_in[tid];
    d_out[tid] = v * v;
}
// Checks the result of a CUDA runtime call; on failure prints the error with
// its source location and returns early from the (void) wrapper. Launch
// errors are picked up via cudaGetLastError() after the <<< >>> statement.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return;                                                         \
        }                                                                   \
    } while (0)
#endif

// Host-side wrapper exported with C linkage so a plain .cpp translation unit
// can declare and call it without C++ name mangling; the <<< >>> kernel
// launch stays in this .cu file, where nvcc can compile it.
extern "C"
void kernal()
{
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; ++i)
    {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Declare GPU memory pointers (initialized so a failed cudaMalloc
    // cannot leave them dangling) and allocate device memory.
    float* d_in = NULL;
    float* d_out = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&d_out, ARRAY_BYTES));

    // Transfer the input array to the GPU.
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // Launch one block of ARRAY_SIZE threads; each thread squares one element.
    square << <1, ARRAY_SIZE >> > (d_out, d_in);
    CUDA_CHECK(cudaGetLastError()); // catches invalid launch configurations

    // Copy the result back to the CPU. cudaMemcpy is blocking, so this also
    // synchronizes with the kernel and surfaces any in-kernel fault.
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // Print the resulting array, four values per line.
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Free the GPU allocations.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
}
main.cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Forward declaration with C linkage: kernal() is defined in kernal.cu and
// exported there via extern "C", so this declaration must match exactly.
extern "C" void kernal();

// Host entry point. All CUDA work is delegated to the wrapper in kernal.cu,
// because <<< >>> launch syntax is only valid in .cu files compiled by nvcc.
int main()
{
    kernal();
    return 0;
}
3137

被折叠的 条评论
为什么被折叠?



