主函数在main.cpp中,用clang++编译[注:g++(gcc)不行,必须用clang++(clang)]
CUDA函数放在KernelWrapper.cu中,用nvcc编译。另外,main.cpp中需要包含头文件KernelWrapper.h。
KernelWrapper.h
// Host-side interface to the CUDA code in KernelWrapper.cu. This header
// contains no CUDA-specific syntax, so it can be included from translation
// units built by an ordinary C++ compiler.
//
// The guard macro deliberately avoids a leading underscore followed by an
// uppercase letter: such identifiers are reserved for the implementation.
#ifndef KERNEL_WRAPPER_H
#define KERNEL_WRAPPER_H

// Fills a small integer array, squares every element on the GPU, and prints
// the values before and after the kernel runs.
void RunTest();

#endif  // KERNEL_WRAPPER_H
KernelWrapper.cu
#include <stdio.h>
#include <stdlib.h>
#include "KernelWrapper.h"
// Squares the elements of deviceArray in place, one element per thread.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles the element at
// its flat global index (blockDim.x * blockIdx.x + threadIdx.x).
//
// Precondition: there is no bounds check, so the total number of launched
// threads must not exceed the number of elements in deviceArray.
__global__ void TestDevice(int *deviceArray)
{
    const int element = blockDim.x * blockIdx.x + threadIdx.x;
    const int value = deviceArray[element];
    deviceArray[element] = value * value;
}
// Demonstrates a host -> device -> host round trip:
//   1. fills a 16-element host array with 1..16 and prints it,
//   2. copies it to the device and squares every element with TestDevice,
//   3. copies the result back and prints it.
//
// Every CUDA call is checked. On the first failure the remaining GPU steps are
// skipped, the CUDA error string is written to stderr, and both buffers are
// still released. "done" is printed in either case.
void RunTest()
{
    const int arrayLength = 16;
    // TestDevice has no bounds check, so the launch has to cover the array
    // exactly: arrayLength must stay a multiple of threadsPerBlock.
    const int threadsPerBlock = 4;
    const int blockCount = arrayLength / threadsPerBlock;
    const unsigned int memSize = sizeof(int) * arrayLength;

    int* hostArray = (int*)malloc(memSize);
    if (hostArray == NULL)
    {
        fprintf(stderr, "RunTest: failed to allocate %u bytes of host memory\n", memSize);
        return;
    }

    int* deviceArray = NULL;
    cudaError_t status = cudaMalloc((void**)&deviceArray, memSize);

    if (status == cudaSuccess)
    {
        printf("Init Data\n");
        for (int i = 0; i < arrayLength; ++i)
        {
            hostArray[i] = i + 1;
            printf("%d\n", hostArray[i]);
        }
        status = cudaMemcpy(deviceArray, hostArray, memSize, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        TestDevice<<<blockCount, threadsPerBlock>>>(deviceArray);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy on the default stream: it waits for the kernel, so
        // errors raised while the kernel executed are reported here.
        status = cudaMemcpy(hostArray, deviceArray, memSize, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess)
    {
        printf("After Kernel Function\n");
        for (int i = 0; i < arrayLength; ++i)
        {
            printf("%d\n", hostArray[i]);
        }
    }
    else
    {
        fprintf(stderr, "RunTest: CUDA error: %s\n", cudaGetErrorString(status));
    }

    // cudaFree performs no operation on a null pointer, so this is safe even
    // when cudaMalloc failed.
    cudaFree(deviceArray);
    free(hostArray);
    printf("done");
}
main.cpp
#include "KernelWrapper.h"
// Program entry point. The command-line arguments are not used; the program
// simply runs the CUDA demo exposed by KernelWrapper.h and exits with 0.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    RunTest();

    return 0;
}
Makefile
# Builds the demo in two steps: nvcc compiles the CUDA translation unit,
# clang++ compiles main.cpp and links both objects against the CUDA libraries.
# Recipe lines must begin with a TAB character.

.PHONY: all clean

all: program

# Libraries are listed after the object files that reference them: linkers
# resolve symbols left to right, so a library named before its users may be
# ignored and leave undefined references.
program: KernelWrapper.o main.o
	clang++ -o program KernelWrapper.o main.o -L/usr/local/cuda/lib64 -lcuda -lcudart

# Both objects include KernelWrapper.h, so rebuild them when it changes.
KernelWrapper.o: KernelWrapper.cu KernelWrapper.h
	/usr/local/cuda/bin/nvcc -c KernelWrapper.cu

main.o: main.cpp KernelWrapper.h
	clang++ -c main.cpp

clean:
	rm -f *.o program