CUDA3特性中最吸引人的一条就是其对C++的支持,今天兄弟就切实的尝试了一下,所有的代码都以向量相加为例子,体会分享如下:
函数重载与默认参数
允许出现多个同名但参数列表不同的函数,编译时根据实参类型确定调用哪个函数的入口地址。记得在2.3中就已经开始支持了,但是一直都没有尝试,今天就试了一下,感觉还是不错的,呵呵!代码如下:
其中定义的DEFAULT用于测试默认参数;同时还试验了一下__restrict__关键字,但是有个限制,如果大家试验一下相信很容易知道,呵呵!大家试试,看看是不是能够出正确结果。
运算符重载
代码如下:
yyfn是一个结构体,其有两个域,一个是money,一个是girl,运算符重载的使用使得程序清晰简洁了不少。但是重载的运算符不能作为内核(__global__函数)使用,因为内核必须返回void,而运算符重载需要返回值。
模板
CUDA很早以前就已经支持这个,所以就不详细说了,给了个例子,大家体会一下,呵呵!
函数对象
在C++中,函数对象广泛用于算法中,CUDA应该是为以后支持更多的算法准备的。代码如下 :
本来还想试验一下复杂的类机制,但是仔细一想,还是不打击自己了,呵呵!
函数重载与默认参数
允许出现多个同名但参数列表不同的函数,编译时根据实参类型确定调用哪个函数的入口地址。记得在2.3中就已经开始支持了,但是一直都没有尝试,今天就试了一下,感觉还是不错的,呵呵!代码如下:
CODE:
#include <cstdio>

// DEFAULT toggles the default-argument test path inside the int kernel.
#define DEFAULT
#define BLOCKSIZE 256

// Overloaded __device__ helpers: same name, different parameter lists.
// The int version also demonstrates default arguments.
__device__ int add(int a = 0, int b = 0) {
    return a + b;
}

__device__ float add(float a, float b) {
    return a + b;
}

// Kernel overload for int input. One thread per element; the id < num guard
// handles the grid tail when num is not a multiple of BLOCKSIZE.
__global__ void add(const int* __restrict__ a, const int* __restrict__ b,
                    int* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
#ifdef DEFAULT
        c[id] = add();               // exercises the default arguments (writes 0)
#else
        c[id] = add(a[id], b[id]);
#endif
    }
}

// Kernel overload for float input.
__global__ void add(const float* __restrict__ a, const float* __restrict__ b,
                    float* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < num) {
        c[id] = add(a[id], b[id]);
    }
}

// Allocates pinned host buffers and device buffers, runs both kernel
// overloads on `num` elements, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    int *a;
    int *b;
    float *af;
    float *bf;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    cudaMallocHost((void**)&af, num * sizeof(float));
    cudaMallocHost((void**)&bf, num * sizeof(float));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
        af[i] = 1.1f;
        bf[i] = 2.2f;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    float *d_af;
    cudaMalloc((void**)&d_af, num * sizeof(float));
    cudaMemcpyAsync(d_af, af, num * sizeof(float), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    float *d_bf;
    cudaMalloc((void**)&d_bf, num * sizeof(float));
    // BUG FIX: original copied num*sizeof(int) bytes here; the buffer holds floats.
    cudaMemcpyAsync(d_bf, bf, num * sizeof(float), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    float *cf;
    cudaMallocHost((void**)&cf, num * sizeof(float));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    float *d_cf;
    cudaMalloc((void**)&d_cf, num * sizeof(float));
    const unsigned int grid = (num + BLOCKSIZE - 1) / BLOCKSIZE;  // ceil-div
    add<<<grid, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    add<<<grid, BLOCKSIZE>>>(d_af, d_bf, d_cf, num);
    cudaMemcpyAsync(cf, d_cf, num * sizeof(float), cudaMemcpyDeviceToHost, 0);
    // Async copies into pinned memory must complete before the host reads them.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    printf(".......................................................\n");
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%f ", cf[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFreeHost(af);
    cudaFreeHost(bf);
    cudaFreeHost(cf);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_af);
    cudaFree(d_bf);
    cudaFree(d_cf);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
#include <cstdio>

// DEFAULT toggles the default-argument test path inside the int kernel.
#define DEFAULT
#define BLOCKSIZE 256

// Overloaded __device__ helpers: same name, different parameter lists.
// The int version also demonstrates default arguments.
__device__ int add(int a = 0, int b = 0) {
    return a + b;
}

__device__ float add(float a, float b) {
    return a + b;
}

// Kernel overload for int input. One thread per element; the id < num guard
// handles the grid tail when num is not a multiple of BLOCKSIZE.
__global__ void add(const int* __restrict__ a, const int* __restrict__ b,
                    int* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
#ifdef DEFAULT
        c[id] = add();               // exercises the default arguments (writes 0)
#else
        c[id] = add(a[id], b[id]);
#endif
    }
}

// Kernel overload for float input.
__global__ void add(const float* __restrict__ a, const float* __restrict__ b,
                    float* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < num) {
        c[id] = add(a[id], b[id]);
    }
}

// Allocates pinned host buffers and device buffers, runs both kernel
// overloads on `num` elements, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    int *a;
    int *b;
    float *af;
    float *bf;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    cudaMallocHost((void**)&af, num * sizeof(float));
    cudaMallocHost((void**)&bf, num * sizeof(float));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
        af[i] = 1.1f;
        bf[i] = 2.2f;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    float *d_af;
    cudaMalloc((void**)&d_af, num * sizeof(float));
    cudaMemcpyAsync(d_af, af, num * sizeof(float), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    float *d_bf;
    cudaMalloc((void**)&d_bf, num * sizeof(float));
    // BUG FIX: original copied num*sizeof(int) bytes here; the buffer holds floats.
    cudaMemcpyAsync(d_bf, bf, num * sizeof(float), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    float *cf;
    cudaMallocHost((void**)&cf, num * sizeof(float));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    float *d_cf;
    cudaMalloc((void**)&d_cf, num * sizeof(float));
    const unsigned int grid = (num + BLOCKSIZE - 1) / BLOCKSIZE;  // ceil-div
    add<<<grid, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    add<<<grid, BLOCKSIZE>>>(d_af, d_bf, d_cf, num);
    cudaMemcpyAsync(cf, d_cf, num * sizeof(float), cudaMemcpyDeviceToHost, 0);
    // Async copies into pinned memory must complete before the host reads them.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    printf(".......................................................\n");
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%f ", cf[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFreeHost(af);
    cudaFreeHost(bf);
    cudaFreeHost(cf);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_af);
    cudaFree(d_bf);
    cudaFree(d_cf);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
其中定义的DEFAULT用于测试默认参数;同时还试验了一下__restrict__关键字,但是有个限制,如果大家试验一下相信很容易知道,呵呵!大家试试,看看是不是能够出正确结果。
运算符重载
代码如下:
CODE:
#include <cstdio>

#define BLOCKSIZE 256

// 8-byte-aligned POD pair so each yyfn moves as a single 64-bit transaction.
typedef struct __align__(8) {
    int money;
    int girl;
} yyfn;

// Member-wise addition. Takes const references (the original used non-const
// references, which forbids rvalue operands and promises mutation it never does).
__device__ yyfn operator+(const yyfn& one, const yyfn& two) {
    yyfn temp;
    temp.money = one.money + two.money;
    temp.girl = one.girl + two.girl;
    return temp;
}

// Element-wise c = a + b using the overloaded operator+ inside the kernel.
// The id < num guard handles the grid tail.
__global__ void add(yyfn *a, yyfn *b, yyfn *c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = a[id] + b[id];
    }
}

// Allocates pinned host and device buffers, runs the kernel on `num`
// elements, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    yyfn *a;
    yyfn *b;
    cudaMallocHost((void**)&a, num * sizeof(yyfn));
    cudaMallocHost((void**)&b, num * sizeof(yyfn));
    for (unsigned int i = 0; i < num; ++i) {
        a[i].money = 1;
        a[i].girl = 1;
        b[i].money = 2;
        b[i].girl = 1;
    }
    yyfn *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(yyfn));
    cudaMemcpyAsync(d_a, a, num * sizeof(yyfn), cudaMemcpyHostToDevice, 0);
    yyfn *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(yyfn));
    cudaMemcpyAsync(d_b, b, num * sizeof(yyfn), cudaMemcpyHostToDevice, 0);
    yyfn *c;
    cudaMallocHost((void**)&c, num * sizeof(yyfn));
    yyfn *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(yyfn));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(yyfn), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("money=%d,girl=%d ", c[i].money, c[i].girl);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
#include <cstdio>

#define BLOCKSIZE 256

// 8-byte-aligned POD pair so each yyfn moves as a single 64-bit transaction.
typedef struct __align__(8) {
    int money;
    int girl;
} yyfn;

// Member-wise addition. Takes const references (the original used non-const
// references, which forbids rvalue operands and promises mutation it never does).
__device__ yyfn operator+(const yyfn& one, const yyfn& two) {
    yyfn temp;
    temp.money = one.money + two.money;
    temp.girl = one.girl + two.girl;
    return temp;
}

// Element-wise c = a + b using the overloaded operator+ inside the kernel.
// The id < num guard handles the grid tail.
__global__ void add(yyfn *a, yyfn *b, yyfn *c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = a[id] + b[id];
    }
}

// Allocates pinned host and device buffers, runs the kernel on `num`
// elements, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    yyfn *a;
    yyfn *b;
    cudaMallocHost((void**)&a, num * sizeof(yyfn));
    cudaMallocHost((void**)&b, num * sizeof(yyfn));
    for (unsigned int i = 0; i < num; ++i) {
        a[i].money = 1;
        a[i].girl = 1;
        b[i].money = 2;
        b[i].girl = 1;
    }
    yyfn *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(yyfn));
    cudaMemcpyAsync(d_a, a, num * sizeof(yyfn), cudaMemcpyHostToDevice, 0);
    yyfn *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(yyfn));
    cudaMemcpyAsync(d_b, b, num * sizeof(yyfn), cudaMemcpyHostToDevice, 0);
    yyfn *c;
    cudaMallocHost((void**)&c, num * sizeof(yyfn));
    yyfn *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(yyfn));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(yyfn), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("money=%d,girl=%d ", c[i].money, c[i].girl);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
yyfn是一个结构体,其有两个域,一个是money,一个是girl,运算符重载的使用使得程序清晰简洁了不少。但是重载的运算符不能作为内核(__global__函数)使用,因为内核必须返回void,而运算符重载需要返回值。
模板
CUDA很早以前就已经支持这个,所以就不详细说了,给了个例子,大家体会一下,呵呵!
CODE:
#include <cstdio>

#define BLOCKSIZE 256

// Templated element-wise addition kernel: c[i] = a[i] + b[i] for any T with
// operator+. T is deduced from the pointer arguments at the launch site.
// The id < num guard handles the grid tail.
template <typename T>
__global__ void add(const T* __restrict__ a, const T* __restrict__ b,
                    T* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = a[id] + b[id];
    }
}

// Allocates pinned host and device buffers, runs the int instantiation of
// the template kernel on `num` elements, and prints every 20th result.
int test(const unsigned int num) {
    int *a;
    int *b;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
#include <cstdio>

#define BLOCKSIZE 256

// Templated element-wise addition kernel: c[i] = a[i] + b[i] for any T with
// operator+. T is deduced from the pointer arguments at the launch site.
// The id < num guard handles the grid tail.
template <typename T>
__global__ void add(const T* __restrict__ a, const T* __restrict__ b,
                    T* __restrict__ c, const unsigned int num) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = a[id] + b[id];
    }
}

// Allocates pinned host and device buffers, runs the int instantiation of
// the template kernel on `num` elements, and prints every 20th result.
int test(const unsigned int num) {
    int *a;
    int *b;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num);
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
函数对象
在C++中,函数对象广泛用于算法中,CUDA应该是为以后支持更多的算法准备的。代码如下 :
CODE:
#include <cstdio>

#define BLOCKSIZE 256

// Stateless function object with a templated, device-callable call operator.
// Passed to the kernel by value (it carries no data, so the copy is free).
class Add {
public:
    template <typename T>
    __device__ T operator()(const T& a, const T& b) const {
        return a + b;
    }
};

// Element-wise c = a + b, with the operation supplied as a functor `op`.
// The id < num guard handles the grid tail.
__global__ void add(const int* __restrict__ a, const int* __restrict__ b,
                    int* __restrict__ c, const unsigned int num, Add op) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = op(a[id], b[id]);
    }
}

// Allocates pinned host and device buffers, runs the kernel on `num`
// elements with an Add functor, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    int *a;
    int *b;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num, Add());
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
#include <cstdio>

#define BLOCKSIZE 256

// Stateless function object with a templated, device-callable call operator.
// Passed to the kernel by value (it carries no data, so the copy is free).
class Add {
public:
    template <typename T>
    __device__ T operator()(const T& a, const T& b) const {
        return a + b;
    }
};

// Element-wise c = a + b, with the operation supplied as a functor `op`.
// The id < num guard handles the grid tail.
__global__ void add(const int* __restrict__ a, const int* __restrict__ b,
                    int* __restrict__ c, const unsigned int num, Add op) {
    const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < num) {
        c[id] = op(a[id], b[id]);
    }
}

// Allocates pinned host and device buffers, runs the kernel on `num`
// elements with an Add functor, and prints every 20th result. Returns 0.
int test(const unsigned int num) {
    int *a;
    int *b;
    cudaMallocHost((void**)&a, num * sizeof(int));
    cudaMallocHost((void**)&b, num * sizeof(int));
    for (unsigned int i = 0; i < num; ++i) {
        a[i] = 1;
        b[i] = 2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a, num * sizeof(int));
    cudaMemcpyAsync(d_a, a, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *d_b;
    cudaMalloc((void**)&d_b, num * sizeof(int));
    cudaMemcpyAsync(d_b, b, num * sizeof(int), cudaMemcpyHostToDevice, 0);
    int *c;
    cudaMallocHost((void**)&c, num * sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c, num * sizeof(int));
    add<<<(num + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_a, d_b, d_c, num, Add());
    cudaMemcpyAsync(c, d_c, num * sizeof(int), cudaMemcpyDeviceToHost, 0);
    // Async copy into pinned memory must complete before the host reads it.
    cudaDeviceSynchronize();
    for (unsigned int i = 0; i < num; ++i) {
        if ((i + 1) % 20 == 0)
            printf("%d ", c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

int main() {
    test(10000);
    return 0;
}
本来还想试验一下复杂的类机制,但是仔细一想,还是不打击自己了,呵呵!
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/23057064/viewspace-630474/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/23057064/viewspace-630474/