在网上查了如何在 NVIDIA GPU上来运行openCL , 结果大部分都是介绍在AMD GPU上开发的。所以在此写篇简单例子,介绍在NVIDIA GPU 上开发。
NVIDIA的显卡上,cuda装好opencl就自然有了,所以装cuda=装opencl. 【CUDA安装见 CUDA安装文章】
以下代码实现向量加法:C=A+B。代码参照了一本介绍在 AMD GPU 上开发的 OpenCL 教程(具体书名记不清了),我在此基础上略作了添加和改动。
系统:redhat Linux
GPU:NVIDIA K40
软件环境:CUDA-6.0
一:此程序有两个文件:1. vecadd.cpp  2. add.cl(主机端代码用到了 C++ 的 std::string 和 fstream,所以源文件使用 .cpp 后缀)
vecadd.cpp
#include<stdio.h>
#include<CL/cl.h>
#include<stdlib.h>
#include<time.h>
#include<string.h>
#include<iostream>
#include<fstream>
using namespace std;
// Number of float elements in each vector; main() also uses it as the global work size
// (one work-item per element) and as the element count of every host/device buffer.
#define NWITEMS 262144 //2^18=262144
/**
 * Read the entire contents of a file into a string.
 *
 * The file is opened in binary mode and read in fixed-size chunks, so no
 * seek/tell arithmetic is needed and a failed size query can never turn into
 * an oversized allocation or an out-of-bounds read.
 *
 * @param filename  Path of the file to read.
 * @param s         Receives the file contents on success; left untouched on failure.
 * @return 0 on success, 1 if the file could not be opened (a message is
 *         printed to stdout), -1 if an I/O error occurred while reading.
 */
int convertToString(const char *filename, std::string& s)
{
    std::ifstream f(filename, std::ifstream::in | std::ifstream::binary);
    if (!f.is_open()) {
        printf("Error: Failed to open file %s\n", filename);
        return 1;
    }

    std::string contents;
    char chunk[4096];
    for (;;) {
        f.read(chunk, sizeof(chunk));
        // gcount() reports how many bytes the last read() actually delivered,
        // which may be a short (or empty) final chunk at end of file.
        const std::streamsize got = f.gcount();
        if (got > 0) {
            contents.append(chunk, static_cast<std::string::size_type>(got));
        }
        if (!f) {
            break; // end of file (eofbit/failbit) or a real error (badbit)
        }
    }

    // Reaching end of file is the normal exit; badbit means the stream broke.
    if (f.bad()) {
        return -1;
    }

    // Publish the result only after the whole file was read successfully.
    s.swap(contents);
    return 0;
}
int main()
{
float* buff1=NULL;
float* buff2=NULL;
float* buff=NULL;
buff1=(float*)malloc(NWITEMS*sizeof(float));
buff2=(float*)malloc(NWITEMS*sizeof(float));
buff=(float*)malloc(NWITEMS*sizeof(float));
int i;
srand((unsigned)time(NULL));
for(i=0;i<NWITEMS;i++){
buff1[1]=rand()%RAND_MAX;
}
srand((unsigned)time(NULL)+1000);
for(i=0;i<NWITEMS;i++){
buff2[1]=rand()%RAND_MAX;
}
for(i=0;i<NWITEMS;i++){
buff[i]=buff1[i]+buff2[i];
}
/************openCL begain*******/
cl_int status=0;
size_t deviceListSize;
cl_uint numPlatforms;
cl_platform_id platform=NULL;
/**********get platform*************/
status=clGetPlatformIDs(0,NULL,&numPlatforms);//get number of platform
//printf("status=%d\nnumPlatforms=%d\n",status,numPlatforms);
status=clGetPlatformIDs(1,&platform,NULL);//choose NV platform
/**********************get device form NV platform**************/
cl_uint numdevices;
cl_device_id device=NULL;
clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,0,NULL,&numdevices);//get device number of NV platform
printf("numDevice=%d\n",numdevices);
clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&device,NULL);//choose a NV GPU device
cl_context context = clCreateContext(NULL,1,&device,NULL,NULL,NULL); //create Context;
cl_command_queue queue = clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,NULL);//create commmand queue and enable profiling
/********************create openCL buffer object, and then copy buff1 to clbuff1, buf2 copy to clbuff2*****************/
cl_mem clbuf1 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,NWITEMS*sizeof(cl_float),buff1,NULL);
cl_mem clbuf2 = clCreateBuffer(context,CL_MEM_READ_ONLY,NWITEMS*sizeof(cl_float),NULL,NULL);
status = clEnqueueWriteBuffer(queue,clbuf2,1,0,NWITEMS*sizeof(cl_float),buff2,0,0,0);
cl_mem clbuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,NWITEMS*sizeof(cl_float),NULL,NULL);
const char* filename="add.cl";
std::string sourceStr;
status = convertToString(filename,sourceStr);
const char* source = sourceStr.c_str();
size_t sourceSize[]={ strlen(source) };
/************create program object***********/
cl_program program = clCreateProgramWithSource(context,1,&source,sourceSize,NULL);
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if(status!=0){
fprintf(stderr,"clBuild failed:%d\n",status);
char tbuf[100];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0x10000, tbuf, NULL);
fprintf(stderr,"\n%s\n",tbuf);
return -1;
}
/***********create kernel object************/
cl_kernel kernel = clCreateKernel(program, "vecadd", NULL);
//set kernel argument
cl_int clnum=NWITEMS;
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf);
cl_event ev;
size_t global_work_size=NWITEMS;
clEnqueueNDRangeKernel(queue,kernel,1,NULL, &global_work_size,NULL,0,NULL,&ev);
clFinish(queue);
/**********copy data from device to host memory *****/
cl_float *ptr;
ptr=(cl_float *) clEnqueueMapBuffer(queue,clbuf, CL_TRUE, CL_MAP_READ, 0, NWITEMS * sizeof(cl_float),0,NULL, NULL, NULL);
/***************vertify the result******/
if(!memcmp(buff,ptr,NWITEMS)){
printf("vertify passed\n");
}else{
printf("vertify failed\n");
}
if(buff)
free(buff);
if(buff1)
free(buff1);
if(buff2)
free(buff2);
/*************free OpenCL resource object************/
clReleaseMemObject(clbuf1);
clReleaseMemObject(clbuf2);
clReleaseMemObject(clbuf);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
add.cl
/*
 * Element-wise vector addition: each work-item adds one pair of elements.
 * A and B are the input vectors, C receives the sums; the element handled
 * is selected by the work-item's global id in dimension 0.
 */
__kernel void vecadd(__global const float* A,
                     __global const float* B,
                     __global float* C)
{
    const int gid = get_global_id(0);
    const float sum = A[gid] + B[gid];
    C[gid] = sum;
}
二:编译
nvcc vecadd.cpp -o vecadd -lOpenCL //注意 -lOpenCL 的大小写;主机端源文件含 C++ 代码,需保存为 vecadd.cpp;输出文件名要与下一步执行的 ./vecadd 一致
三:执行
./vecadd
四:结果
numPlatforms=1 //环境中只有NVIDIA的平台,所以为 1
numDevice=3 //共有三块NVIDIA卡
verify passed