1.简介
本文主要介绍如何使用 block 进行并行计算:每个 block 负责计算两个输入数组中对应位置元素的乘积,并写入结果数组;计算完成后,再将结果数组从 GPU 显存拷贝回 CPU 主内存即可。
2.cuda代码演示
#include<iostream>
#include<cstdlib>
using namespace std;
#define N 100
/*
 * Element-wise product kernel: c[i] = a[i] * b[i].
 *
 * Note: despite the name `add`, the operation is a multiplication; the name
 * is kept so existing launches keep compiling.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles the element at
 * global index blockIdx.x * blockDim.x + threadIdx.x. With one thread per
 * block this reduces to blockIdx.x, so a <<<N,1>>> launch behaves as before.
 * Threads whose index is N or larger do nothing, so a grid that covers more
 * than N elements no longer touches memory past the end of the buffers.
 *
 * a, b: input buffers, read at indices [0, N).
 * c:    output buffer, written at indices [0, N).
 * All three are dereferenced on the device and must hold at least N ints.
 */
__global__ void add(int *a,int *b,int *c) {
    int ind = blockIdx.x * blockDim.x + threadIdx.x;
    if (ind < N) {
        c[ind] = a[ind] * b[ind];
    }
}
/*
 * Fills the first `length` slots of `arr` with consecutive integers
 * starting at `limit`: arr[0] = limit, arr[1] = limit + 1, and so on.
 *
 * length: number of elements to write; nothing is written when it is <= 0.
 * arr:    destination buffer, written at indices [0, length).
 * limit:  value stored in the first element.
 */
void initialVector(int length,int *arr,int limit) {
    int offset = 0;
    while (offset < length) {
        arr[offset] = limit + offset;
        ++offset;
    }
}
/*
 * Stops the program with a diagnostic on stderr when a CUDA runtime call
 * did not return cudaSuccess. `what` names the failed step in the message.
 */
static void checkCuda(cudaError_t status,const char *what) {
    if (status != cudaSuccess) {
        cerr<<"CUDA error during "<<what<<": "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: builds two host arrays holding 1..N, multiplies them
 * element-wise on the GPU with the `add` kernel, copies the result back and
 * prints the last element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main() {
    const size_t bytes = N*sizeof(int);

    /* Host buffers: a and b are inputs, c receives the result. */
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        cerr<<"host allocation failed"<<endl;
        free(a);   /* free(NULL) is a no-op, so partial failure is fine */
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    initialVector(N,a,1);
    initialVector(N,b,1);

    /* Device buffers mirroring a, b and c. */
    int *d_a = NULL,*d_b = NULL,*d_c = NULL;
    checkCuda(cudaMalloc((void **)&d_a,bytes),"cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void **)&d_b,bytes),"cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void **)&d_c,bytes),"cudaMalloc(d_c)");

    checkCuda(cudaMemcpy(d_a,a,bytes,cudaMemcpyHostToDevice),"copy a to device");
    checkCuda(cudaMemcpy(d_b,b,bytes,cudaMemcpyHostToDevice),"copy b to device");

    /* N blocks of one thread each: one block per array element. */
    add<<<N,1>>>(d_a,d_b,d_c);
    /* A launch returns no status itself; launch errors are read back here. */
    checkCuda(cudaGetLastError(),"add kernel launch");

    /* Blocking copy: it waits for the kernel, so errors raised while the
       kernel ran are reported by this call. */
    checkCuda(cudaMemcpy(c,d_c,bytes,cudaMemcpyDeviceToHost),"copy c to host");

    cout<<"c["<<N-1<<"]:"<<c[N-1]<<endl;

    free(a);
    free(b);
    free(c);
    checkCuda(cudaFree(d_a),"cudaFree(d_a)");
    checkCuda(cudaFree(d_b),"cudaFree(d_b)");
    checkCuda(cudaFree(d_c),"cudaFree(d_c)");
    return 0;
}