windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/fengbingchun/article/details/71195074

一般有两种方法可以在vs2013上添加运行cuda8.0程序:

一、直接新建一个基于CUDA8.0的项目:如下图所示,


点击确定后即可生成test_cuda项目。项目默认会自动生成一个kernel.cu文件,并且已经配置好Debug/Release、Win32/x64环境,可直接编译运行,结果如下图所示(该示例函数执行的是两个数组的相加操作)。移除kernel.cu文件,再加入自己需要的cuda文件,即可进行实际操作,非常方便。


二、实际情况下,多是在已有的项目中添加一些cuda文件,用于加速,下面说下具体的操作步骤:

1、新建一个CUDA_Test x64控制台空工程;

2、新建CUDA_Test.cpp文件;

3、选中CUDA_Test项目,右键单击-->生成依赖项-->生成自定义,勾选CUDA8.0,点击确定,如下图所示:


4、完成第3步后,再次打开工程的属性配置,会多出两项,CUDA C/C++和CUDA Linker,如下图所示:


5、新建或添加几个已有的文件,包括common.hpp、simple.hpp、simple.cpp、simple.cu,各个文件内容如下:

common.hpp:

#ifndef FBC_CUDA_TEST_COMMON_HPP_
#define FBC_CUDA_TEST_COMMON_HPP_

// Reports a failure on stderr (the stringified `info` token plus the source
// file, enclosing function and line of the expansion site) and then makes the
// *calling* function return -1. Only usable inside functions whose return type
// accepts -1. The translation unit must declare fprintf/stderr (<cstdio>).
// The do { ... } while (0) wrapper turns the expansion into a single statement,
// so `if (failed) PRINT_ERROR_INFO(x); else ...` parses as intended.
#define PRINT_ERROR_INFO(info) do { \
	fprintf(stderr, "Error: %s, file: %s, func: %s, line: %d\n", #info, __FILE__, __FUNCTION__, __LINE__); \
	return -1; } while (0)

#endif // FBC_CUDA_TEST_COMMON_HPP_
simple.hpp:

#ifndef FBC_CUDA_TEST_SIMPLE_HPP_
#define FBC_CUDA_TEST_SIMPLE_HPP_

// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple
// Runs the vector-add example: fills two host arrays with random values, adds
// them with vectorAdd_cpu and with vectorAdd_gpu, and compares the two results
// element by element. Returns 0 when both paths succeed and agree, -1 otherwise.
int test_vectorAdd();


// CPU reference: writes C[i] = A[i] + B[i] for every i in [0, numElements).
// A, B and C are host arrays holding at least numElements floats. Always returns 0.
int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements);

// GPU version of the same element-wise addition. A, B and C are host arrays;
// the implementation copies the inputs to device memory, launches the CUDA
// kernel and copies the result back into C.
// Returns 0 on success and -1 when a CUDA runtime call reports an error.
int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements);

#endif // FBC_CUDA_TEST_SIMPLE_HPP_
simple.cpp:

#include "simple.hpp"

#include <stdlib.h>

#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>

#include "common.hpp"

// =========================== vector add =============================
/**
 * Exercises vector addition (C = A + B, element by element) on the CPU and on
 * the GPU with the same random inputs and checks that both results agree.
 *
 * @return 0 when both implementations report success and every pair of result
 *         elements differs by at most 1e-5; -1 otherwise.
 */
int test_vectorAdd()
{
	const int numElements{ 50000 };

	// std::vector owns the host buffers, so they are released on every return
	// path, including the early returns hidden inside PRINT_ERROR_INFO.
	std::vector<float> A(numElements);
	std::vector<float> B(numElements);
	std::vector<float> C1(numElements); // CPU result
	std::vector<float> C2(numElements); // GPU result

	// Initialize the inputs with pseudo-random values in [0, 1]
	for (int i = 0; i < numElements; ++i) {
		A[i] = rand() / (float)RAND_MAX;
		B[i] = rand() / (float)RAND_MAX;
	}

	int ret = vectorAdd_cpu(A.data(), B.data(), C1.data(), numElements);
	if (ret != 0) PRINT_ERROR_INFO(vectorAdd_cpu);

	ret = vectorAdd_gpu(A.data(), B.data(), C2.data(), numElements);
	if (ret != 0) PRINT_ERROR_INFO(vectorAdd_gpu);

	// Host and device results are compared with a tolerance rather than
	// bit-exactly.
	for (int i = 0; i < numElements; ++i) {
		if (std::fabs(C1[i] - C2[i]) > 1e-5) {
			fprintf(stderr, "Result verification failed at element %d!\n", i);
			return -1;
		}
	}

	return 0;
}

// Host-side reference implementation of element-wise addition.
// Walks the three arrays front to back and stores A[i] + B[i] into C[i] for
// the first numElements entries; a non-positive count leaves C untouched.
// Always returns 0.
int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements)
{
	const float *lhs = A;
	const float *rhs = B;
	float *out = C;

	int remaining = numElements;
	while (remaining > 0) {
		*out = *lhs + *rhs;
		++out;
		++lhs;
		++rhs;
		--remaining;
	}

	return 0;
}
simple.cu:

#include "simple.hpp"
#include <iostream>
#include <cuda_runtime.h> // For the CUDA runtime routines (prefixed with "cuda_")
#include <device_launch_parameters.h>

// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple

// =========================== vector add =============================
/**
 * Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < numElements.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are read).
 * Each thread starts at its global index and advances by the total number of
 * launched threads, so any grid/block size covers the whole range; with a
 * grid that already spans numElements each thread handles exactly one element.
 * No shared memory is used.
 *
 * @param A            first input array (device-accessible, >= numElements floats)
 * @param B            second input array (device-accessible, >= numElements floats)
 * @param C            output array (device-accessible, >= numElements floats)
 * @param numElements  number of elements to add; values <= 0 make the kernel a no-op
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
	// 64-bit signed arithmetic: the block/grid products cannot wrap, and the
	// comparison against a (possibly negative) int count stays well defined.
	const long long stride = (long long)gridDim.x * blockDim.x;

	for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < numElements; i += stride) {
		C[i] = A[i] + B[i];
	}
}

// Returns true when `err` is cudaSuccess; otherwise prints
// "Failed to <action> (error code <message>)!" to stderr and returns false.
static bool cudaCallSucceeded(cudaError_t err, const char* action)
{
	if (err == cudaSuccess) return true;

	fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
	return false;
}

/**
 * Adds two host vectors on the GPU: C[i] = A[i] + B[i].
 *
 * Allocates three device buffers, copies A and B to the device, launches the
 * vectorAdd kernel with 256 threads per block, copies the result back into C
 * and frees the device buffers. The buffers are freed on every path, including
 * after a failed step.
 *
 * @param A            host input array with at least numElements floats
 * @param B            host input array with at least numElements floats
 * @param C            host output array with room for numElements floats
 * @param numElements  number of elements to add
 * @return 0 on success (including numElements == 0, where nothing is done);
 *         -1 when an argument is invalid or any CUDA call fails.
 */
int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements)
{
	if (numElements < 0) {
		fprintf(stderr, "Invalid element count %d passed to vectorAdd_gpu!\n", numElements);
		return -1;
	}
	// Nothing to add; also avoids launching a grid with zero blocks, which is
	// an invalid launch configuration.
	if (numElements == 0) return 0;
	if (A == nullptr || B == nullptr || C == nullptr) {
		fprintf(stderr, "Null vector passed to vectorAdd_gpu!\n");
		return -1;
	}

	const size_t length = static_cast<size_t>(numElements) * sizeof(float);
	// size_t is printed through unsigned long long: "%d" mismatches a 64-bit
	// size_t, and "%llu" is understood by older MSVC runtimes as well.
	fprintf(stderr, "Length: %llu\n", static_cast<unsigned long long>(length));

	float* d_A{ nullptr };
	float* d_B{ nullptr };
	float* d_C{ nullptr };

	// Each step runs only if all previous steps succeeded (short-circuit &&).
	bool ok = cudaCallSucceeded(cudaMalloc(&d_A, length), "allocate device vector A")
		&& cudaCallSucceeded(cudaMalloc(&d_B, length), "allocate device vector B")
		&& cudaCallSucceeded(cudaMalloc(&d_C, length), "allocate device vector C")
		&& cudaCallSucceeded(cudaMemcpy(d_A, A, length, cudaMemcpyHostToDevice), "copy vector A from host to device")
		&& cudaCallSucceeded(cudaMemcpy(d_B, B, length, cudaMemcpyHostToDevice), "copy vector B from host to device");

	if (ok) {
		// Launch the Vector Add CUDA kernel. The ceil-division is written
		// without "numElements + threadsPerBlock - 1" so it cannot overflow int.
		const int threadsPerBlock = 256;
		const int blocksPerGrid = numElements / threadsPerBlock + (numElements % threadsPerBlock != 0 ? 1 : 0);
		fprintf(stderr, "CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
		vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

		// cudaGetLastError reports launch-configuration errors; the blocking
		// device-to-host copy that follows waits for the kernel to finish.
		ok = cudaCallSucceeded(cudaGetLastError(), "launch vectorAdd kernel")
			&& cudaCallSucceeded(cudaMemcpy(C, d_C, length, cudaMemcpyDeviceToHost), "copy vector C from device to host");
	}

	// Always release the device buffers. Pointers that were never allocated
	// are still nullptr, for which cudaFree performs no operation.
	const bool freedA = cudaCallSucceeded(cudaFree(d_A), "free device vector A");
	const bool freedB = cudaCallSucceeded(cudaFree(d_B), "free device vector B");
	const bool freedC = cudaCallSucceeded(cudaFree(d_C), "free device vector C");

	return (ok && freedA && freedB && freedC) ? 0 : -1;
}
CUDA_Test.cpp:
#include <iostream>
#include "simple.hpp"

/**
 * Entry point: runs the vector-add self-test and reports the outcome on stderr.
 *
 * @return 0 when test_vectorAdd() succeeds and 1 when it fails, so that the
 *         process exit code reflects the test result.
 */
int main()
{
	const int ret = test_vectorAdd();

	if (ret != 0) {
		fprintf(stderr, "===== test fail =====\n");
		return 1;
	}

	fprintf(stderr, "***** test success *****\n");
	return 0;
}
6、调整属性配置项:

(1)、CUDA C/C++-->Common中的Target Machine Platform默认是32-bit(--machine 32),因为本工程是x64,所以需要将其调整为64-bit(--machine 64);

(2)、添加附加库:链接器-->输入-->附加依赖项:cudart.lib;

(3)、消除nvcc warning: The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release:在CUDA C/C++-->Device-->Code Generation中,将compute_20,sm_20修改为compute_30,sm_30; compute_35,sm_35; compute_37,sm_37; compute_50,sm_50; compute_52,sm_52; compute_60,sm_60。

以上code是参考NVIDIA Corporation\CUDA Samples\v8.0\0_Simple中vectorAdd例子进行的改写,输出结果如下:



GitHub:https://github.com/fengbingchun/CUDA_Test

展开阅读全文

没有更多推荐了,返回首页