简单矩阵CUDA运算
在 CUDA 中进行运算之前,都需要先在 GPU 上分配空间;在 OpenCV 中可以使用 GpuMat。
// Element-wise integer addition: C[i] = A[i] + B[i].
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The kernel
// takes no length argument and therefore has no bounds guard, so the total
// number of launched threads (gridDim.x * blockDim.x) must not exceed the
// number of elements in A, B and C.
//
// A, B and C are dereferenced on the device, so they must be device-accessible
// pointers.
__global__ void Add(int *A, int *B, int *C) {
    // Flat global index. With a single-block launch blockIdx.x is 0 and this
    // reduces to threadIdx.x; with several blocks each block now handles its
    // own slice instead of every block rewriting the first blockDim.x elements.
    // For 2D data the equivalent linear index would be y * width + x.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}
// Host driver for the Add kernel: copies two 10-element integer arrays to the
// device, adds them with one thread per element, copies the result back and
// checks it on the host.
//
// Returns 0 when every CUDA call succeeded and the result matches A[i] + B[i],
// 1 otherwise.
int main(){
    // const so that N is a constant expression and the host arrays below are
    // ordinary fixed-size arrays rather than variable-length arrays.
    const int N = 10;
    const size_t bytes = sizeof(int) * N;

    int A[N] = {1,2,3,4,5,6,7,8,9,10};
    int B[N] = {3,3,3,3,3,3,3,3,3,3};
    int C[N] = {0};

    // Start as null so the cudaFree calls below are safe no-ops if an
    // allocation never happened.
    int *cuA = 0;
    int *cuB = 0;
    int *cuC = 0;

    // Each step runs only if all previous steps succeeded; the first failure
    // is kept in err and skips the rest.
    cudaError_t err = cudaMalloc((void**)&cuA, bytes);   // allocate device memory
    if (err == cudaSuccess) err = cudaMalloc((void**)&cuB, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&cuC, bytes);

    if (err == cudaSuccess) err = cudaMemcpy(cuA, A, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(cuB, B, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // One block of N threads: exactly one thread per element.
        Add<<<1, N>>>(cuA, cuB, cuC);
        // A kernel launch returns nothing; launch failures are reported here.
        err = cudaGetLastError();
    }

    // Blocking copy on the default stream: waits for the kernel and also
    // reports any error raised while it was running.
    if (err == cudaSuccess) err = cudaMemcpy(C, cuC, bytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path.
    cudaFree(cuA);
    cudaFree(cuB);
    cudaFree(cuC);

    // Verify the device result against the host computation.
    bool ok = (err == cudaSuccess);
    for (int i = 0; ok && i < N; ++i) {
        ok = (C[i] == A[i] + B[i]);
    }

    // 2D variant (sketch):
    // A[][], B[][], C[][]
    // const dim3 blockDim(8,8); // power-of-two block dimensions
    // const dim3 gridDim((width+blockDim.x-1)/blockDim.x,(height+blockDim.y-1)/blockDim.y);
    // Add<<<gridDim,blockDim>>>(A,B,C);

    return ok ? 0 : 1;
}
这样,一个简易的矩阵运算(这里以一维数组加法为例)就完成了,非常简单。