这是我之前写的关于在CUDA中传输结构体的代码:CUDA结构体传输
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<stdlib.h>
// Side length of the square integer grid stored in every Node.
#define maxnumber 500

// A maxnumber x maxnumber grid of ints stored inline in the struct, so
// sizeof(Node) grows quadratically with maxnumber.
// The field name "piont" (sic) is kept as-is because other code refers to it.
struct node {
    int piont[maxnumber][maxnumber];
};
typedef struct node Node;
// Marks one row of c->piont per thread: column 1 is set to 0, column 2 to 1.
//
// Launch layout: 1-D grid of 1-D blocks. The row handled by a thread is its
// global index blockIdx.x * blockDim.x + threadIdx.x; threads whose index is
// not a valid row (>= maxnumber) do nothing, so the launch may be larger than
// the grid of data without writing out of bounds.
//
// a: unused; kept so existing launches keep compiling.
// c: device pointer to the Node that is modified in place.
// Precondition: maxnumber >= 3 (columns 1 and 2 must exist).
__global__ void gpuAddwith(Node* a, Node* c) {
    (void)a;
    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < (unsigned int)maxnumber) {
        c->piont[row][1] = 0;
        c->piont[row][2] = 1;
    }
}
// Prints columns 1 and 2 of row 5 of the given node's grid to stdout.
void Print(Node* a) {
    const int row = 5;
    const int* cells = a->piont[row];
    printf(" test[1] is:%d test[2]is:%d\n", cells[1], cells[2]);
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills one Node on the host, copies it to the device, lets gpuAddwith mark
// columns 1 and 2 of its rows, copies the result back and prints row 5 before
// and after.
int main() {
    // Exactly ONE Node is used, so exactly one is allocated. Requesting
    // sizeof(Node) * maxnumber bytes asks for maxnumber whole grids; for large
    // maxnumber that request fails, malloc returns NULL and the fill loop
    // below would write through a null pointer.
    const size_t nodeBytes = sizeof(Node);
    Node* h_a = (Node*)malloc(nodeBytes);
    if (h_a == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)nodeBytes);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < maxnumber; i++) {
        for (int j = 0; j < maxnumber; j++) {
            h_a->piont[i][j] = i * maxnumber + j;
        }
    }
    Print(h_a);

    Node* d_a = NULL;
    Node* d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, nodeBytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_c, nodeBytes), "cudaMalloc(d_c)");
    checkCuda(cudaMemcpy(d_a, h_a, nodeBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    checkCuda(cudaMemcpy(d_c, h_a, nodeBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_c)");

    // Never launch more threads per block than there are rows, so that even a
    // kernel indexing purely by threadIdx.x cannot step outside the grid.
    const int threads = maxnumber < 256 ? maxnumber : 256;
    const int blocks = (maxnumber + threads - 1) / threads;
    gpuAddwith<<<blocks, threads>>>(d_a, d_c);
    checkCuda(cudaGetLastError(), "gpuAddwith launch");
    checkCuda(cudaDeviceSynchronize(), "gpuAddwith execution");

    checkCuda(cudaMemcpy(h_a, d_c, nodeBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a)");
    Print(h_a);

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    free(h_a);
    return 0;
}
最大数maxnumber设置为500时,运行第一遍,正确的输出了答案和结果,
但是将最大数maxnumber改成5000时,乍一看并没有出现什么严重的错误,用VS生成解决方案也并没有报错,但运行时就弹出了问题:
XXX 处引发的异常: XXX: 写入位置 XXX 时发生访问冲突。
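这个问题我起初猜想是数组越界:我们需要的数据量超过了可用的内存上限。更准确地说,代码里按 sizeof(Node)*maxnumber 申请了 maxnumber 个 Node,而实际只用到了一个;当 maxnumber 为 5000 时,单个 Node 约 100 MB,总申请量约 500 GB,malloc 会失败并返回 NULL,而代码没有检查返回值,随后向空指针写入数据就触发了访问冲突。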
这是我在CSDN上找到的一篇使用结构体传输的帖子:cuda中结构体的赋值
按照这个帖子上的写法,结构体里的数组就可以动态创建,程序运行时也不会再报错。不过我想再分享一种类似的写法:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<time.h>
#include <stdio.h>
#include<stdlib.h>
#include<string.h>
// Bundle of the six buffers used by the vector-add example: three host-side
// arrays and their three device-side counterparts. The struct only holds the
// pointers; the buffers themselves are allocated separately.
struct ProcPara
{
    int *h_a, *h_b, *h_c;   // host buffers: inputs a and b, result c
    int *d_a, *d_b, *d_c;   // device buffers matching h_a, h_b, h_c
};
// Prints a diagnostic for a failed CUDA runtime call; returns true on failure.
static bool procParaWarn(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Like procParaWarn, but terminates the program when the call failed.
static void procParaCheck(cudaError_t status, const char* what)
{
    if (procParaWarn(status, what)) {
        exit(EXIT_FAILURE);
    }
}

// Element-wise vector add: d_c[i] = d_a[i] + d_b[i] for every i < n.
//
// Launch layout: 1-D grid of 1-D blocks, one element per thread; the element
// index is blockIdx.x * blockDim.x + threadIdx.x. Threads with index >= n do
// nothing, so the grid may be rounded up past n.
//
// d_para: pointer to a ProcPara in device memory whose d_a, d_b and d_c are
//         device pointers.
// n:      number of valid elements. It defaults to "no limit" so that the
//         legacy call addKernel<<<1, N>>>(d_para) behaves exactly as before;
//         in that case the caller must not launch more threads than there
//         are elements.
__global__ void addKernel(ProcPara* d_para, unsigned int n = 0xFFFFFFFFu)
{
    // size_t avoids wrap-around of blockIdx.x * blockDim.x on large grids.
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        d_para->d_c[i] = d_para->d_a[i] + d_para->d_b[i];
    }
}

// Allocates a pinned host-side ProcPara, a device-side copy of it, and the
// six data buffers (three pinned host arrays, three device arrays) of
// arrySize ints each. The filled-in host struct is then copied to the device
// so kernels can reach the device buffers through *da_para.
//
// ha_para:  receives the host-side struct.
// da_para:  receives the device-side struct.
// arrySize: number of ints per buffer; must not be negative.
// Terminates the program with a message if any allocation or copy fails.
void InitProcPara(ProcPara** ha_para, ProcPara** da_para, int arrySize)
{
    if (ha_para == NULL || da_para == NULL || arrySize < 0) {
        fprintf(stderr, "InitProcPara: invalid argument\n");
        exit(EXIT_FAILURE);
    }
    const size_t bytes = (size_t)arrySize * sizeof(int);

    procParaCheck(cudaMallocHost((void**)ha_para, sizeof(ProcPara)), "cudaMallocHost(ProcPara)");
    procParaCheck(cudaMalloc((void**)da_para, sizeof(ProcPara)), "cudaMalloc(ProcPara)");

    ProcPara* h_para = *ha_para;
    procParaCheck(cudaMallocHost((void**)&h_para->h_a, bytes), "cudaMallocHost(h_a)");
    procParaCheck(cudaMallocHost((void**)&h_para->h_b, bytes), "cudaMallocHost(h_b)");
    procParaCheck(cudaMallocHost((void**)&h_para->h_c, bytes), "cudaMallocHost(h_c)");
    procParaCheck(cudaMalloc((void**)&h_para->d_a, bytes), "cudaMalloc(d_a)");
    procParaCheck(cudaMalloc((void**)&h_para->d_b, bytes), "cudaMalloc(d_b)");
    procParaCheck(cudaMalloc((void**)&h_para->d_c, bytes), "cudaMalloc(d_c)");

    // Only after all six pointers are known can the struct be mirrored to the device.
    procParaCheck(cudaMemcpy(*da_para, h_para, sizeof(ProcPara), cudaMemcpyHostToDevice),
                  "cudaMemcpy(ProcPara)");
}

// Releases everything InitProcPara allocated. Cleanup is best-effort: a
// failing free is reported on stderr but the remaining resources are still
// released.
//
// h_para: host-side struct (may be NULL, then only d_para is freed).
// d_para: device-side struct.
void DeinitProcPara(ProcPara* h_para, ProcPara* d_para)
{
    if (h_para != NULL) {
        // The data buffers must go first: their pointers live inside h_para.
        procParaWarn(cudaFreeHost(h_para->h_a), "cudaFreeHost(h_a)");
        procParaWarn(cudaFreeHost(h_para->h_b), "cudaFreeHost(h_b)");
        procParaWarn(cudaFreeHost(h_para->h_c), "cudaFreeHost(h_c)");
        procParaWarn(cudaFree(h_para->d_a), "cudaFree(d_a)");
        procParaWarn(cudaFree(h_para->d_b), "cudaFree(d_b)");
        procParaWarn(cudaFree(h_para->d_c), "cudaFree(d_c)");
        procParaWarn(cudaFreeHost(h_para), "cudaFreeHost(ProcPara)");
    }
    procParaWarn(cudaFree(d_para), "cudaFree(ProcPara)");
}

// Computes h_c[i] = h_a[i] + h_b[i] for i in [0, arraySize) on the GPU:
// uploads h_a and h_b, runs addKernel over all arraySize elements, and
// downloads the result into h_c.
//
// h_para:    host-side struct holding the host and device buffer pointers.
// d_para:    device-side copy of the same struct, passed to the kernel.
// arraySize: number of elements to process; 0 is a no-op.
// Terminates the program with a message if any CUDA call fails.
void addWithCuda(ProcPara* h_para, ProcPara* d_para, unsigned int arraySize)
{
    procParaCheck(cudaSetDevice(0), "cudaSetDevice");  // use GPU 0
    if (arraySize == 0) {
        return;  // a launch with zero blocks is an invalid configuration
    }
    const size_t bytes = (size_t)arraySize * sizeof(int);

    procParaCheck(cudaMemcpy(h_para->d_a, h_para->h_a, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(d_a)");
    procParaCheck(cudaMemcpy(h_para->d_b, h_para->h_b, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(d_b)");

    // Enough blocks to cover every element; the kernel guards the tail.
    const unsigned int threads = 256;
    const unsigned int blocks = arraySize / threads + (arraySize % threads != 0 ? 1u : 0u);
    addKernel<<<blocks, threads>>>(d_para, arraySize);
    procParaCheck(cudaGetLastError(), "addKernel launch");
    procParaCheck(cudaDeviceSynchronize(), "addKernel execution");

    procParaCheck(cudaMemcpy(h_para->h_c, h_para->d_c, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(h_c)");
}
// Vector-add demo: fills a[i] = i and b[i] = arraySize - i, adds them on the
// GPU and prints element 5 of a, b and c before and after the computation.
int main()
{
    const int arraySize = 50000;

    ProcPara* h_para = NULL;
    ProcPara* d_para = NULL;
    InitProcPara(&h_para, &d_para, arraySize);

    // Work directly in the host buffers owned by h_para instead of three
    // arraySize-element stack arrays: large stack arrays overflow the thread
    // stack once arraySize grows, and they required an extra copy each way.
    int* a = h_para->h_a;
    int* b = h_para->h_b;
    int* c = h_para->h_c;
    for (int i = 0; i < arraySize; i++) {
        a[i] = i;
        b[i] = arraySize - i;
        c[i] = 0;
    }

    printf("a[5]=%d,b[5]=%d,c[5]=%d \n", a[5], b[5], c[5]);
    addWithCuda(h_para, d_para, arraySize);
    printf("a[5]=%d,b[5]=%d,c[5]=%d \n", a[5], b[5], c[5]);

    DeinitProcPara(h_para, d_para);
    return 0;
}
这下把数组大小arraySize设为50000也没有问题了