直接上代码。最奇怪的几点貌似是:
1. 对设备内存的操作——让二维指针(行指针数组)的每一项指向对应的一维数据——需要在设备函数(核函数)中完成;在主机代码中直接做会崩溃,因为主机不能解引用 cudaMalloc 返回的设备指针。
2. 貌似设备函数的形参就是按引用传递的,不需要加 &,非常奇怪,加了 & 反而会出错;但是在主机代码中是需要加 & 的。我只能瞎猜,CUDA 的核函数里不允许使用引用。(实际上核函数的参数是按值拷贝到设备端的;传进去的指针值本身指向设备内存,所以看起来像是传了引用。)补充一个PPT
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Report a failed CUDA runtime call on stdout.
 *
 * ret   - status code returned by the CUDA call being checked.
 * count - caller-chosen tag printed in the banner so the failing call site
 *         can be identified.
 *
 * Prints nothing when ret is cudaSuccess. Never aborts; the caller decides
 * how to proceed.
 */
void read_error(cudaError_t ret, int count) {
    if (ret == cudaSuccess) {
        return;
    }
    printf("------------%d--------------\n", count);
    printf("%s\n", cudaGetErrorString(ret));
}
/*
 * Element-wise sum C = A + B for matrices addressed through row-pointer
 * tables (element (row, col) is X[row][col]).
 *
 * A, B - input row-pointer tables.
 * C    - output row-pointer table.
 * n    - number of rows (entries in each pointer table).
 * m    - number of columns (ints per row).
 *
 * Expects a 2D launch in which the x dimension spans columns and the y
 * dimension spans rows; surplus threads are masked by the bounds check.
 */
__global__ void matrix_add(int **A, int **B, int **C, int n, int m) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    // x indexes within a row (length m) and y selects the row (n of them),
    // so the column is checked against m and the row against n.
    if (col < m && row < n) {
        C[row][col] = A[row][col] + B[row][col];
    }
}
/*
 * Integer division of a by b, bumped by one whenever the division leaves a
 * remainder (ceiling division for non-negative a and positive b).
 *
 * b must be non-zero.
 */
int divup(int a, int b) {
    const int quotient = a / b;
    return (a % b != 0) ? quotient + 1 : quotient;
}
/*
 * Fill the first elem entries of data with pseudo-random values in
 * [0, 65534], drawn from rand() in ascending index order.
 *
 * Does nothing when elem is zero or negative.
 */
void init_data(int *data, int elem) {
    int *cursor = data;
    for (int remaining = elem; remaining > 0; --remaining) {
        *cursor++ = rand() % 65535;
    }
}
/*
 * Populate a row-pointer table so that A[r] points at the start of row r
 * inside the flat buffer A_data (row r begins at offset r * m).
 *
 * A      - table with room for n row pointers.
 * A_data - flat buffer holding the matrix elements.
 * n      - number of rows; threads with index >= n do nothing.
 * m      - number of ints per row.
 *
 * One thread handles one row, indexed along the x dimension of the launch.
 */
__global__ void init_matrix(int **A, int *A_data, int n, int m) {
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= n) {
        return;
    }
    A[row] = A_data + row * m;
}
/*
 * Allocate an n x m int matrix on the host as one contiguous buffer plus a
 * table of row pointers into it.
 *
 * A      - [out] receives the table of n row pointers; A[i][j] addresses
 *          A_data[i * m + j].
 * A_data - [out] receives the flat buffer of n * m ints (uninitialized).
 * n      - number of rows.
 * m      - number of columns (ints per row).
 *
 * On allocation failure both outputs are set to NULL and nothing is leaked.
 * The caller owns both allocations and releases them with free().
 */
void Memory_host(int **&A, int *&A_data, int n, int m) {
    A = (int **)malloc(sizeof(int *) * n);
    A_data = (int *)malloc(sizeof(int) * n * m);
    if (A == NULL || A_data == NULL) {
        // free(NULL) is a no-op, so this releases whichever half succeeded.
        free(A);
        free(A_data);
        A = NULL;
        A_data = NULL;
        return;
    }
    for (int i = 0; i < n; i++) {
        // Each row holds m elements, so consecutive rows are m ints apart.
        A[i] = A_data + (size_t)i * m;
    }
}
/*
 * Allocate an n x m int matrix on the device as one flat buffer plus a
 * device-resident table of row pointers, and fill the table on the device.
 *
 * A      - [out] receives the device pointer to the table of n row pointers.
 * A_data - [out] receives the device pointer to the flat n * m int buffer.
 * n      - number of rows.
 * m      - number of columns (ints per row).
 *
 * The row table is filled by the init_matrix kernel because the host cannot
 * write through device pointers directly. Failures are reported through
 * read_error with tags 999 (data allocation), 998 (row-table allocation)
 * and 997 (kernel launch). If an allocation fails, both outputs are set to
 * NULL and anything already allocated is released.
 */
void Memory_device(int **&A, int *&A_data, int n, int m) {
    A = NULL;
    A_data = NULL;

    cudaError_t ret = cudaMalloc((void **)&A_data, sizeof(int) * n * m);
    read_error(ret, 999);
    if (ret != cudaSuccess) {
        A_data = NULL;
        return;
    }

    ret = cudaMalloc((void **)&A, sizeof(int *) * n);
    read_error(ret, 998);
    if (ret != cudaSuccess) {
        cudaFree(A_data);
        A = NULL;
        A_data = NULL;
        return;
    }

    if (n <= 0) {
        // Nothing to initialize; an empty launch configuration is invalid.
        return;
    }

    // Spread rows over several blocks so n may exceed the per-block thread
    // limit; init_matrix masks the surplus threads in the last block.
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;
    init_matrix<<<blocks, threads>>>(A, A_data, n, m);
    // Launch-configuration errors are only visible through cudaGetLastError.
    read_error(cudaGetLastError(), 997);
}
/* Print a rows x cols matrix addressed through a row-pointer table, one row per line. */
static void print_matrix(int **M, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%d ", M[i][j]);
        }
        printf("\n");
    }
}

/*
 * Demo driver: builds two random n x m matrices on the host, mirrors them on
 * the device, adds them with matrix_add, copies the result back and prints
 * A, B and C.
 *
 * CUDA failures are reported via read_error with these tags:
 *   2 - copy of A to the device      3 - copy of B to the device
 *   4 - matrix_add launch            5 - copy of C back to the host
 */
int main() {
    int m = 6, n = 6, w = 2, h = 2;
    dim3 block(w, h, 1);
    // x spans the m columns, y spans the n rows.
    dim3 grid(divup(m, w), divup(n, h), 1);
    cudaError_t ret;

    int **A, *A_data;
    Memory_host(A, A_data, n, m);
    init_data(A_data, n * m);
    int **d_A, *d_A_data;
    Memory_device(d_A, d_A_data, n, m);
    ret = cudaMemcpy(d_A_data, A_data, sizeof(int) * n * m, cudaMemcpyHostToDevice);
    read_error(ret, 2);

    int **B, *B_data;
    Memory_host(B, B_data, n, m);
    init_data(B_data, n * m);
    int **d_B, *d_B_data;
    Memory_device(d_B, d_B_data, n, m);
    ret = cudaMemcpy(d_B_data, B_data, sizeof(int) * n * m, cudaMemcpyHostToDevice);
    read_error(ret, 3);

    int **C, *C_data;
    Memory_host(C, C_data, n, m);
    int **d_C, *d_C_data;
    Memory_device(d_C, d_C_data, n, m);

    matrix_add<<<grid, block>>>(d_A, d_B, d_C, n, m);
    // A bad launch configuration is only reported through cudaGetLastError.
    read_error(cudaGetLastError(), 4);

    // Blocking copy on the default stream: also waits for the kernel to finish.
    ret = cudaMemcpy(C_data, d_C_data, sizeof(int) * n * m, cudaMemcpyDeviceToHost);
    read_error(ret, 5);

    print_matrix(A, n, m);
    printf("\n\n");
    print_matrix(B, n, m);
    printf("\n\n");
    print_matrix(C, n, m);

    // Release every host and device allocation made above.
    free(A);
    free(A_data);
    free(B);
    free(B_data);
    free(C);
    free(C_data);
    cudaFree(d_A);
    cudaFree(d_A_data);
    cudaFree(d_B);
    cudaFree(d_B_data);
    cudaFree(d_C);
    cudaFree(d_C_data);
    cudaDeviceReset();
    return 0;
}