花了两个晚上,写了一个使用CUDA驱动API的简单例子。可以从这个例子出发,修改出你想要的程序。例子的功能很简单,就是Hello CUDA!(被我改成了New CUDA!呵呵)。好了,废话少说,下面就是说明和代码。
1)源程序由两部分组成:
a)由kernel.cu编译成的kernel.cubin。我用的是CUDA SDK 3.0,因此cubin被编译成了ELF文件格式。不过没关系,程序执行时照样可以装入使用。
kernel.cu的代码如下:
/*
 * Writes the greeting "New CUDA!" into result.
 *
 * result: destination buffer; must have room for at least num bytes.
 * num:    number of bytes to write. The first bytes come from the greeting
 *         (including its terminating NUL); if num is larger than the
 *         greeting, the remaining bytes are set to NUL. Nothing is written
 *         when num <= 0.
 */
__device__ void HelloCUDA(char *result, int num)
{
    const char greeting[] = "New CUDA!";
    const int greetingSize = (int)sizeof(greeting); /* text plus NUL */
    int i;

    for (i = 0; i < num; i++) {
        /* Past the end of the greeting, pad with NUL rather than
           reading beyond the local array. */
        result[i] = (i < greetingSize) ? greeting[i] : '\0';
    }
}
/*
 * Kernel entry point: passes its arguments straight through to HelloCUDA.
 * No thread or block index is used, so every launched thread performs the
 * same call with the same arguments.
 *
 * result: destination buffer handed to HelloCUDA.
 * num:    byte count handed to HelloCUDA.
 */
static __global__ void GPUMain(char *result, int num)
{
    HelloCUDA(result, num);
}
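这里要注意两点:
i:编译器可能优化你的程序。HelloCUDA很可能被内联进了GPUMain,因此在kernel.cubin中,你将找不到HelloCUDA函数。
ii:由于C++的名字修饰(name mangling),在kernel.cubin中,GPUMain的函数名变成了_Z7GPUMainPci。主机端按名字查找内核(例如调用cuModuleGetFunction)时,要使用这个修饰后的名字。呵呵。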
b)主机程序sample.cpp.
嗯?cpp?不是cu?
是的,是cpp!而且可以在VC6.0下编译通过!总算又看到了VC6的“亲切面孔”……
(和普通的VC6工程一样,不需要特别的设置。当然,别忘了设置include路径和lib路径,还要在链接库中加上cuda.lib。)
下面是sample.cpp的代码:
/********************************************************************
* sample.cpp
* This is an example of a CUDA driver API program.
*********************************************************************/
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <cuda.h>
#include <cutil.h>
/************************************************************************/
/* Define for CUDA */
/************************************************************************/
/*
 * Rounds offset up, in place, to the next multiple of alignment.
 *
 * offset:    a modifiable integer lvalue; it is assigned the rounded value
 *            and is evaluated more than once, so avoid side effects.
 * alignment: must be a power of two for the bit mask to be correct.
 */
#define ALIGN_UP(offset, alignment) \
    (offset) = (((offset) + (alignment) - 1) & ~((alignment) - 1))
/************************************************************************/
/* Init CUDA */
/************************************************************************/
bool InitCUDA(void)
{
static int nGpuArchCoresPerSM[] = { -1, 8, 32 };
int driverVersion;
char deviceName[256];
int major;
int minor;
unsigned int totalGlobalMem;
int multiProcessorCount;
int totalConstantMemory;
int sharedMemPerBlock;
int regsPerBlock;
int warpSize;
int maxThreadsPerBlock;
int blockDim[3];
int gridDim[3];
int memPitch;
int clockRate;
int gpuOverlap;
int textureAlign;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int computeMode;
CUresult rc;
int count = 0;
int i = 0;
/* Init */
rc=cuInit(0);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "CUDA init error./n");
return false;
}
/* Get device num */
rc=cuDeviceGetCount(&count);
if((rc!=CUDA_SUCCESS)|| (count==0))
{
fprintf(stderr, "There is no device supporting CUDA./n");
return false;
}
/* Get API version */
rc=cuDriverGetVersion(&driverVersion);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is error on cuDriverGetVersion./n");
return false;
}
printf("CUDA Driver Version: %d.%d/n/n", driverVersion/1000, driverVersion%100);
/* Get device info */
for(i=0; i<count; i++)
{
rc=cuDeviceComputeCapability(&major, &minor,i);
if(rc==CUDA_SUCCESS)
{
if ((major == 9999) && (minor == 9999))
{ /* emu device */
fprintf(stderr, "Find the emu device./n");
continue;
}
/* display the info */
rc=cuDeviceGetName(deviceName, 256, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is error on cuDeviceGetName(%ld)./n", i);
continue;
}
printf("Device %ld: /"%s/"/n", i, deviceName);
printf(" CUDA Capability Major revision number: %d/n", major);
printf(" CUDA Capability Minor revision number: %d/n", minor);
rc=cuDeviceTotalMem(&totalGlobalMem, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is error on cuDeviceTotalMem(%ld)./n", i);
continue;
}
printf(" Total amount of global memory: %u bytes/n", totalGlobalMem);
rc=cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, i);