下面是一个简单的CUDA式的Hello World。
/************************************************************************
* [!output PROJECT_NAME].cu
* This is an example of a CUDA program.
************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/************************************************************************/
/* Example */
/************************************************************************/
/*
 * HelloCUDA - writes the greeting "Hello CUDA!" into a device buffer and
 * reports how many device clock() ticks the copy took.
 *
 * Parameters:
 *   result - device buffer that receives the characters; must hold at least
 *            `num` bytes.
 *   num    - number of bytes to write. Bytes past the end of the greeting
 *            (11 characters plus its terminating '\0') are written as '\0'
 *            instead of reading beyond the local array.
 *   time   - device pointer that receives clock() after the copy minus
 *            clock() before it.
 *
 * Every launched thread executes the whole loop and stores to *time, so the
 * kernel is meant to be launched with a single thread.
 */
__global__ static void HelloCUDA(char* result, int num, clock_t* time)
{
    const char p_HelloCUDA[] = "Hello CUDA!";
    /* Size of the greeting including the terminating '\0'. */
    const int msgLen = (int) sizeof(p_HelloCUDA);

    clock_t start = clock();
    for (int i = 0; i < num; i++) {
        /* Guard the source index: a larger `num` must not read past the array. */
        result[i] = (i < msgLen) ? p_HelloCUDA[i] : '\0';
    }
    *time = clock() - start;
}
/*
 * Evaluates a CUDA runtime call and terminates the program with a readable
 * message (file, line, CUDA error string) when it does not return cudaSuccess.
 * Device allocations still alive at that point are not freed explicitly; the
 * process exits right away.
 */
#define HELLO_CUDA_CHECK(call)                                               \
    do {                                                                     \
        cudaError_t hello_err_ = (call);                                     \
        if (hello_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(hello_err_));                         \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Program entry point.
 *
 * 1. Lists every CUDA device together with its compute capability.
 * 2. Allocates a device buffer for the greeting and one clock_t for timing.
 * 3. Launches HelloCUDA with one block of one thread.
 * 4. Copies the greeting and the tick count back to the host and prints them
 *    as "<greeting>,<ticks>".
 *
 * Returns 0 on success; exits with EXIT_FAILURE if no device is present or
 * any CUDA runtime call fails. argc/argv are not used.
 */
int main(int argc, char** argv)
{
    /* Number of characters in "Hello CUDA!" without the terminating '\0'. */
    const int kMsgLen = 11;

    char *device_result = 0;
    clock_t *device_time = 0;
    /* One extra, zero-initialised byte keeps the copied text NUL-terminated. */
    char host_result[12] = {0};
    clock_t time_used = 0;
    int deviceCount = 0;
    int device;

    (void) argc;
    (void) argv;

    HELLO_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "No CUDA-capable device found.\n");
        return EXIT_FAILURE;
    }

    for (device = 0; device < deviceCount; ++device)
    {
        cudaDeviceProp deviceProp;
        HELLO_CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
        printf("Device %d has compute capability %d.%d .\n",
               device, deviceProp.major, deviceProp.minor);
    }

    HELLO_CUDA_CHECK(cudaMalloc((void**) &device_result, sizeof(char) * kMsgLen));
    HELLO_CUDA_CHECK(cudaMalloc((void**) &device_time, sizeof(clock_t)));

    HelloCUDA<<<1, 1, 0>>>(device_result, kMsgLen, device_time);
    /* A kernel launch returns nothing; launch errors must be queried explicitly. */
    HELLO_CUDA_CHECK(cudaGetLastError());

    /* The blocking cudaMemcpy calls also wait for the kernel to finish and
       surface any error raised while it was running. */
    HELLO_CUDA_CHECK(cudaMemcpy(host_result, device_result,
                                sizeof(char) * kMsgLen, cudaMemcpyDeviceToHost));
    HELLO_CUDA_CHECK(cudaMemcpy(&time_used, device_time,
                                sizeof(clock_t), cudaMemcpyDeviceToHost));

    HELLO_CUDA_CHECK(cudaFree(device_result));
    HELLO_CUDA_CHECK(cudaFree(device_time));

    /* clock_t is not guaranteed to be int: cast and use a matching format. */
    printf("%s,%ld\n", host_result, (long) time_used);
    return 0;
}
用命令行编译则简单得多:先执行 nvcc.exe --help >nvcc.txt 把帮助文档导出到文本文件,方便随时查阅。把上面的程序保存为sample.cu,再在同一目录下建立如下批处理文件(保存为make.bat,双击运行):
@echo off
rem make.bat - compile the .cu file named by myFun with nvcc, link it with the
rem Visual Studio 2008 linker and run the resulting program.
rem Expects VS90COMNTOOLS, CUDA_INC_PATH, CUDA_LIB_PATH and CUDA_BIN_PATH to be
rem set in the environment.
rem setlocal keeps the include/lib/path changes local to this script.
setlocal
set myFun=sample
rem Load the VS2008 x86 toolchain environment; stop if the script is missing.
if not exist "%VS90COMNTOOLS%vsvars32.bat" goto :notools
call "%VS90COMNTOOLS%vsvars32.bat"
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%
set path=%CUDA_BIN_PATH%;%path%
echo ------------------===By GoldenSpider 2011-10-8===------------------
rem -c: compile to an object file only; /MD is passed through to the host compiler.
nvcc %myFun%.cu -c -Xcompiler "/MD " -o "%myFun%.obj"
if errorlevel 1 goto :fail
link /OUT:"%myFun%.exe" /SUBSYSTEM:console /nologo %myFun%.obj cudart.lib kernel32.lib msvcrt.lib
if errorlevel 1 goto :fail
echo -------------------------------------------------------------------
rem Reached only when both the compile and the link step succeeded.
echo Good Job, Compiler Success!! Run EXE(Y/?)
pause
%myFun%.exe
pause
goto :eof

:notools
echo vsvars32.bat not found - check the VS90COMNTOOLS environment variable.
pause
exit /b 1

:fail
echo -------------------------------------------------------------------
echo Build FAILED - see the messages above.
pause
exit /b 1
效果:(好像不支持上传图片,就复制一下cmd下的结果吧^_^)
Setting environment for using Microsoft Visual Studio 2008 x86 tools.
------------------===By GoldenSpider 2011-10-8===------------------
sample.cu
tmpxft_00000cf0_00000000-3_sample.cudafe1.gpu
tmpxft_00000cf0_00000000-8_sample.cudafe2.gpu
sample.cu
tmpxft_00000cf0_00000000-3_sample.cudafe1.cpp
tmpxft_00000cf0_00000000-14_sample.ii
-------------------------------------------------------------------
Good Job, Compiler Success!! Run EXE(Y/?)
请按任意键继续. . .
Device 0 has compute capability 1.2 .
Hello CUDA!,8876
请按任意键继续. . .
上面是基本入门。如果想用VC6.0编译,或者想用汇编来写主机端程序,该怎么办呢?思路也很简单:使用CUDA Driver API。设备代码交给nvcc编译,得到ptx或cubin;主机代码交给VC或汇编器来编译。对主机程序而言,ptx、cubin仅仅是运行时加载的数据(例如通过cuModuleLoad加载),实质上也正是这么做的。具体可以参考vectorAddDrv这个实例。可以先用下面的批处理生成ptx:
:: Device-code step: load the Visual Studio 2008 toolchain environment, put the
:: CUDA headers, libraries and tools on the search paths, then compile the
:: kernel source to PTX.
:: ("::" comments are used because echo is on in this script and rem lines
:: would be printed.)
call "%VS90COMNTOOLS%vsvars32.bat"
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%
set path=%CUDA_BIN_PATH%;%path%
:: -ptx: stop after generating PTX for VecAdd.cu; no object file or executable
:: is produced.
nvcc -ptx VecAdd.cu
再用下面的批处理在VC6.0环境下编译、链接并运行主机程序:
@echo off
rem Build the Driver API host program (main.cpp) with Visual C++ 6.0, link it
rem against cuda.lib and run it.
rem Expects CUDA_INC_PATH and CUDA_LIB_PATH to be set in the environment.
rem setlocal keeps the include/lib changes local to this script.
setlocal
rem Location of the VC6 environment script. Set VC6_VARS before calling this
rem script to override the default install path.
if not defined VC6_VARS set VC6_VARS=E:\Microsoft Visual Studio\VC98\Bin\vcvars32.bat
if not exist "%VC6_VARS%" goto :notools
call "%VC6_VARS%"
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%
set myHost=main
cl /c /MD %myHost%.cpp
if errorlevel 1 goto :fail
link /SUBSYSTEM:console /nologo %myHost%.obj cuda.lib kernel32.lib msvcrt.lib
if errorlevel 1 goto :fail
rem Run the program only after a successful compile and link.
%myHost%.exe
pause
goto :eof

:notools
echo vcvars32.bat not found: "%VC6_VARS%"
echo Set VC6_VARS to the full path of the VC6 vcvars32.bat and run again.
pause
exit /b 1

:fail
echo Build FAILED - see the messages above.
pause
exit /b 1
执行效果:
cuDeviceGet returns: 0
cuCtxCreate returns: 0
cuModuleLoad returns: 0
allocating d_a returns: 0
copy data for a returns: 0
getting the function handle returns: 0
kernel launch returns: 0
copy from device to host returns: 0
2.1000 ....
查看其导入库:
导入表所处的节: .rdata
----------------------------------------------------------
导入库: nvcuda.dll
----------------------------------------------------------
OriginalFirstThunk 000020FC
TimeDateStamp 00000000
ForwarderChain 00000000
FirstThunk 00002044
----------------------------------------------------------
导入序号 导入函数名称
----------------------------------------------------------
00000084 cuInit
00000059 cuDeviceGetCount
00000057 cuDeviceGet
0000000D cuCtxCreate_v2
000000E0 cuModuleLoad
0000008E cuMemAlloc_v2
000000C6 cuMemcpyHtoD_v2
000000DB cuModuleGetFunction
00000088 cuLaunchKernel
000000BE cuMemcpyDtoH_v2
----------------------------------------------------------
导入库: MSVCRT.dll
----------------------------------------------------------
OriginalFirstThunk 000020B8
TimeDateStamp 00000000
ForwarderChain 00000000
FirstThunk 00002000
运行库已经不再是MSVCR90.dll ,呵呵。上面的VecAdd.cu代码如下:
/*
 * VecAdd - element-wise sum of two float arrays: C[k] = A[k] + B[k].
 *
 * Each thread handles at most one element, selected by its flat 1D index
 * (block offset plus thread offset). Threads whose index is N or larger do
 * nothing, so the launch may cover more threads than there are elements.
 *
 *   A, B - input arrays, read only
 *   C    - output array
 *   N    - number of elements to process
 */
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    /* Surplus threads past the end of the data leave early. */
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}