背景说明:本文目的在于厘清CUDA C/C++与GNU C/C++的调用与编译链接关系。
代码参考:https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/
编译参考:https://stackoverflow.com/questions/9421108/how-can-i-compile-cuda-code-then-link-it-to-a-c-project
代码如下:
add.h ,注意这里的头文件没有__host__修饰符(__host__ 本来就是默认修饰符,省略它之后这份头文件就是纯 C++ 声明,g++ 也能直接解析)
#include <stdio.h>
/* Host-side entry point defined in add.cu: runs the SAXPY demo
 * (allocate, copy, launch kernel, verify, free). Declared without
 * __host__ so the header is plain C++ that g++ can parse. */
void call(void);
/* Thin forwarder defined in add.cu that simply delegates to call(). */
void wrapper(void);
add.cu
#include "add.h"
// SAXPY kernel: y[i] = a * x[i] + y[i] for each of the n elements.
// Launch layout: 1-D grid, one thread per element; surplus threads in
// the final block are masked off by the bounds check.
__global__ void saxpy(int n, float a, float *x, float *y)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        y[idx] = a * x[idx] + y[idx];
    }
}
// Reports a failed CUDA runtime call on stderr. Execution continues
// (this is best-effort demo code, so we report rather than abort).
#define CHECK_CUDA(stmt)                                                  \
    do {                                                                  \
        cudaError_t err_ = (stmt);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
        }                                                                 \
    } while (0)

// Host-side SAXPY demo: fills x with 1.0 and y with 2.0, computes
// y = 2*x + y on the GPU, then prints the maximum deviation from the
// expected result 4.0.
__host__ void call(void)
{
    const int N = 20 * (1 << 20);            // ~20.97M elements, ~80 MB per array
    const size_t bytes = (size_t)N * sizeof(float);

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {            // fixed: original never checked malloc
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        return;
    }

    float *d_x = NULL, *d_y = NULL;
    CHECK_CUDA(cudaMalloc(&d_x, bytes));
    CHECK_CUDA(cudaMalloc(&d_y, bytes));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    CHECK_CUDA(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    // Perform SAXPY on all N (20M) elements: 512 threads per block,
    // ceil-division for the grid size so the tail elements are covered.
    saxpy<<<(N + 511) / 512, 512>>>(N, 2.0f, d_x, d_y);
    CHECK_CUDA(cudaGetLastError());          // fixed: catch launch-configuration errors

    // A blocking cudaMemcpy on the default stream also synchronizes
    // with the kernel, so y holds the finished result afterwards.
    CHECK_CUDA(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        // fixed: original used abs(), which is the *integer* overload here
        // and truncated the float error to 0, masking real failures.
        float diff = y[i] - 4.0f;
        if (diff < 0.0f) diff = -diff;
        if (diff > maxError) maxError = diff;
    }
    printf("Max error: %f\n", maxError);

    CHECK_CUDA(cudaFree(d_x));
    CHECK_CUDA(cudaFree(d_y));
    free(x);
    free(y);
}

#undef CHECK_CUDA
// Forwarding shim: demonstrates that one host function in a .cu file
// can call another directly. Simply delegates to call().
__host__ void wrapper(void)
{
    call();
}
main.cpp
#include <stdio.h>
#include "add.h"
// Entry point: exercises both functions exported by add.cu.
int main(void)
{
    call();     // direct call into the CUDA translation unit
    wrapper();  // indirect call through the host-side forwarder
    return 0;
}
编译
1、编译add.cu文件
nvcc -c -o add.o add.cu
2、混编cpp与add.o文件,这里有两种方法,一种用nvcc编译,一种是用g++编译。
A、使用nvcc混编:
nvcc -o main main.cpp add.o
B、使用g++混编:
g++ -o main main.cpp add.o -L<cuda-home-dir>/lib64 -lcudart
(说明:这里的代码只用到 CUDA 运行时 API,因此只需链接 -lcudart;-lcuda 是驱动 API 库,这里并不需要,而且在没有安装驱动库的机器上会导致链接失败。)
Done!