本文给出一个使用 g++ 与 nvcc 混合编译的例子：主程序（.cpp 文件）由 g++ 编译，CUDA 核函数及其封装函数（.cu 文件）由 nvcc 编译，最后由 g++ 完成链接。该例子用于计算两个向量的相加。
CPP文件
/**
* @file vectorAdd.cpp
* @date Thu 22 Aug 2019 09:37:03 AM CST
******************************************************************************/
#include <iostream>
using namespace std;
// Number of elements in each vector; used as the array length in main and
// passed to vectorAdd as the element count.
const int N = 1024;
// Prototype of the vector-add wrapper. It is declared with C linkage
// (extern "C") so the symbol is not C++-name-mangled.
// NOTE(review): the definition is expected to live in vectorAdd.cu and be
// compiled separately by nvcc - confirm it is also declared extern "C" there.
extern "C"
void vectorAdd(int *a, int *b, int *c, const int n);
/**
 * Program entry point.
 *
 * Builds two input vectors of N ints on the stack (a[k] = 2k, b[k] = 1),
 * clears the output vector, calls vectorAdd to compute c = a + b, and
 * prints every element of c followed by a tab, then a newline.
 *
 * @param argc unused
 * @param argv unused
 * @return always 0
 */
int main(int argc, char const* argv[])
{
    int a[N], b[N], c[N];

    // Initialise inputs and zero the output buffer.
    for (int k = 0; k != N; ++k) {
        a[k] = k * 2;
        b[k] = 1;
        c[k] = 0;
    }

    // c[k] = a[k] + b[k] for all k in [0, N).
    vectorAdd(a, b, c, N);

    // Dump the result, tab-separated, on a single line.
    for (const int *p = c; p != c + N; ++p) {
        std::cout << *p << "\t";
    }
    std::cout << std::endl;

    return 0;
}
CU文件
/**
* @file vectorAdd.cu
* @date Thu 22 Aug 2019 09:36:58 AM CST
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/**
 * Element-wise integer vector addition: dc[i] = da[i] + db[i] for 0 <= i < n.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components of
 * gridDim/blockDim/blockIdx/threadIdx are used). No shared memory is used.
 * The loop below strides by the total number of launched threads, so the
 * kernel covers all n elements for any grid/block size, including launches
 * with fewer than n threads.
 *
 * @param da device pointer to the first input vector (at least n ints)
 * @param db device pointer to the second input vector (at least n ints)
 * @param dc device pointer to the output vector (at least n ints)
 * @param n  number of elements; n <= 0 makes the kernel a no-op
 */
__global__ void vectorAddKernel(int *da, int *db, int *dc, const int n)
{
    // Signed 64-bit indices: comparing an unsigned index against a signed n
    // would turn a negative n into a huge bound and write out of range, and
    // 32-bit arithmetic could wrap when adding the stride near INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        dc[i] = da[i] + db[i];
    }
}
/**
 * Host-side wrapper: computes c[i] = a[i] + b[i] for 0 <= i < n on the GPU.
 *
 * Allocates three device buffers, copies a and b to the device, launches
 * vectorAddKernel, copies the result back into c, and frees the device
 * buffers. The blocking cudaMemcpy of the result also waits for the kernel
 * to finish.
 *
 * On any CUDA failure a message is written to stderr, the remaining steps
 * are skipped, and c is left unmodified by the failed step; device buffers
 * are always released.
 *
 * @param a host pointer to the first input vector (n ints)
 * @param b host pointer to the second input vector (n ints)
 * @param c host pointer to the output vector (n ints)
 * @param n number of elements; n <= 0 returns immediately without touching c
 */
extern "C"
void vectorAdd(int *a, int *b, int *c, const int n)
{
    // Nothing to do; also avoids a zero-block launch and a negative byte count.
    if (n <= 0) {
        return;
    }

    const size_t bytes = (size_t)n * sizeof(int);
    int *da = NULL, *db = NULL, *dc = NULL;

    cudaError_t err = cudaMalloc((void**)(&da), bytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)(&db), bytes);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)(&dc), bytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(da, a, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(db, b, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        const int nt = 256;                              // threads per block
        const int nb = n / nt + ((n % nt != 0) ? 1 : 0); // ceil(n / nt) without overflow
        // Execution configuration is <<<grid size, block size>>>: number of
        // blocks first, threads per block second.
        vectorAddKernel<<<nb, nt>>>(da, db, dc, n);
        // Kernel launches return no status; launch-configuration errors are
        // reported through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(c, dc, bytes, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "vectorAdd: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial allocation.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
}
编译
g++ vectorAdd.cpp -c -o vectorAdd.cpp.o
nvcc vectorAdd.cu -c -o vectorAdd.cu.o
g++ -o vectorAdd vectorAdd.cpp.o vectorAdd.cu.o -lcudart -L/path/to/cuda/lib64