一、单独的.cu文件
方法一:新建CUDA Runtime项目
方法二:
新建空项目,调试选64位
配置属性表(安装目录下include、lib、lib中所有的库,以及C盘隐藏目录ProgramData中Nvidia Corporation 中 CUDA Samples 的 inc 和 lib)
右键项目生成依赖项–CUDA11.0
右键项目属性 – CUDA C/C++ – Target Machine 改为64位
新建CUDA C/C++ FILE(这时右键看.cu文件属性,项类型应为CUDA C/C++, Target Machine Platform应为64位,无需更改),在开头include必要头文件:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
例子:main.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Element-wise square kernel: thread i reads d_in[i] and writes its square
// to d_out[i]. Expects a 1-D launch with one thread per element; only
// threadIdx.x is used, so it assumes a single block covers the whole array.
__global__ void square(float* d_out, float* d_in)
{
    const int i = threadIdx.x;
    d_out[i] = d_in[i] * d_in[i];
}
// Checks the result of a CUDA runtime call; on failure prints the error with
// its source location and aborts main with a non-zero exit code. Kernel
// launches themselves return nothing, so launch errors are picked up via
// cudaGetLastError() right after the <<< >>> statement.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Host entry point: fills an array with 0..63 on the host, squares it on the
// GPU with one thread per element, copies the result back and prints it.
int main()
{
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; ++i)
    {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Declare GPU memory pointers (initialized so a failed cudaMalloc
    // cannot leave them dangling) and allocate device memory.
    float* d_in = NULL;
    float* d_out = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&d_out, ARRAY_BYTES));

    // Transfer the input array to the GPU.
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // Launch one block of ARRAY_SIZE threads; each thread squares one element.
    square << <1, ARRAY_SIZE >> > (d_out, d_in);
    CUDA_CHECK(cudaGetLastError()); // catches invalid launch configurations

    // Copy the result back to the CPU. cudaMemcpy is blocking, so this also
    // synchronizes with the kernel and surfaces any in-kernel fault.
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // Print the resulting array, four values per line.
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Free the GPU allocations.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    return 0;
}
二、.cu和.cpp混合
为现有的cpp项目配置CUDA属性表
右键项目生成依赖项–CUDA11.0
右键项目属性 – CUDA C/C++ – Target Machine 改为64位
添加CUDA C/C++文件(这时右键看.cu文件属性,项类型应为CUDA C/C++, Target Machine Platform应为64位,无需更改)
在.cpp和.cu中分别include必要头文件:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
对于 .cpp 直接调用的 .cu 中的函数,在 .cu 中该函数定义前一行写 extern "C",并在 .cpp 中给出该函数的完整声明(同样带 extern "C")。此外还要注意,.cpp 中 function<<<A,B>>>(arg1,arg2) 是无效写法,应通过封装的方式,将 kernel 调用写在 .cu 中
例子:
kernal.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// One-thread-per-element squaring kernel: lane `tid` squares d_in[tid] into
// d_out[tid]. Indexing uses threadIdx.x only, so the caller is expected to
// launch a single block whose thread count equals the array length.
__global__ void square(float* d_out, float* d_in)
{
    const int tid = threadIdx.x;
    const float v = d_in[tid];
    d_out[tid] = v * v;
}
// Checks the result of a CUDA runtime call; on failure prints the error with
// its source location and returns early from the (void) wrapper. Launch
// errors are picked up via cudaGetLastError() after the <<< >>> statement.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return;                                                         \
        }                                                                   \
    } while (0)
#endif

// Host-side wrapper exported with C linkage so a plain .cpp translation unit
// can declare and call it without C++ name mangling; the <<< >>> kernel
// launch stays in this .cu file, where nvcc can compile it.
extern "C"
void kernal()
{
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; ++i)
    {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Declare GPU memory pointers (initialized so a failed cudaMalloc
    // cannot leave them dangling) and allocate device memory.
    float* d_in = NULL;
    float* d_out = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&d_out, ARRAY_BYTES));

    // Transfer the input array to the GPU.
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // Launch one block of ARRAY_SIZE threads; each thread squares one element.
    square << <1, ARRAY_SIZE >> > (d_out, d_in);
    CUDA_CHECK(cudaGetLastError()); // catches invalid launch configurations

    // Copy the result back to the CPU. cudaMemcpy is blocking, so this also
    // synchronizes with the kernel and surfaces any in-kernel fault.
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // Print the resulting array, four values per line.
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Free the GPU allocations.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
}
main.cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Forward declaration with C linkage: kernal() is defined in kernal.cu and
// exported there via extern "C", so this declaration must match exactly.
extern "C" void kernal();

// Host entry point. All CUDA work is delegated to the wrapper in kernal.cu,
// because <<< >>> launch syntax is only valid in .cu files compiled by nvcc.
int main()
{
    kernal();
    return 0;
}
3137

被折叠的 条评论
为什么被折叠?



