以向量加法为例,该示例包含三个文件:kernel.h、kernel.cu、test.cpp。
kernel.h:
/*
 * Declares runtest() with C linkage (unmangled symbol name).
 *
 * The include guard is spelled KERNEL_H_ rather than __KERNEL_H_ because
 * identifiers containing a double underscore are reserved for the
 * implementation. The extern "C" wrapper is conditional on __cplusplus so
 * the header can also be included from a plain C translation unit.
 */
#ifndef KERNEL_H_
#define KERNEL_H_

#ifdef __cplusplus
extern "C" {
#endif

/* Takes no arguments and returns nothing; the definition is not part of
 * this header. */
void runtest(void);

#ifdef __cplusplus
}
#endif

#endif /* KERNEL_H_ */
kernel.cu:
#include "kernel.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Class template, parameterised on the element type T, that exposes the
// addWithCuda member function. Only the declaration lives here; the member
// is defined out of line.
template <typename T>
class operate {
public:
    // c is a writable pointer, a and b are read-only pointers, and the
    // call reports its outcome as a cudaError_t.
    // NOTE(review): `size` is presumably the number of elements in each
    // array - confirm against the definition.
    cudaError_t addWithCuda(T *c, const T *a, const T *b, unsigned int size);
};
// Element-wise addition kernel: each thread stores a[tid] + b[tid] into
// c[tid], where tid is the thread's threadIdx.x.
//
// Launch preconditions implied by the code:
//   - The index comes from threadIdx.x alone (blockIdx is never read), so
//     every block of a multi-block launch would touch the same indices.
//   - There is no bounds check, so the number of threads per block must not
//     exceed the number of elements reachable through a, b and c.
template <typename T>
__global__ void addKernel1(T *c, const T *a, const T *b)
{
    const int tid = threadIdx.x;
    c[tid] = a[tid] + b[tid];
}
template <class T>
cudaError_t operate<T>::addWithCuda(T *c, const T *a, const T *b, unsigned int size)
{
T *dev_a = 0;
T *dev_b = 0;
T *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU