文章目录
前言
本教程给出一段 CUDA 代码,计算向量 A[5] 与矩阵 B[3][5] 的每一行之间的欧氏距离(代码中没有开平方,得到的是距离的平方)。
#include <stdio.h>
#include <stdlib.h>
// Length of vector A and of every row of matrix B.
#define N 5
// Number of rows in matrix B; one result value is produced per row.
#define D 3
// Total number of elements in B. Parenthesized so the macro expands as a
// single value inside larger expressions (e.g. `x / SIZE`, `SIZE % k`).
#define SIZE (N * D)
// Kernel: squared Euclidean distance between the vector `da` and rows of `db`.
//
// Each thread uses threadIdx.x as its row index `tid`, reads the N ints
// db[tid*N .. tid*N + N-1], and writes
//     dres[tid] = sum over i in [0, N) of (da[i] - db[tid*N + i])^2
// No square root is taken, so the stored value is the squared distance.
//
// Parameters (all pointers are dereferenced inside the kernel):
//   da   - at least N ints
//   db   - at least blockDim.x * N ints, stored row after row
//   dres - at least blockDim.x ints; element tid is overwritten
//
// There is no bounds guard: launch with no more threads per block than db
// has rows. Only threadIdx.x is consulted, so in a multi-block launch every
// block would process the same rows.
void __global__ cpt(int *da, int *db, int *dres);

void __global__ cpt(int *da, int *db, int *dres)
{
    const int tid = threadIdx.x;
    const int *row = db + tid * N; // first element of this thread's row

    int sum = 0;
    for (int i = 0; i < N; ++i)
    {
        // Compute the difference once and square it. The previous code
        // multiplied by (da[i] - db[tid*N] + i), i.e. the index offset was
        // applied to the value instead of to the array subscript.
        const int diff = da[i] - row[i];
        sum += diff * diff;
    }
    dres[tid] = sum;
}
// Exits with a message on stderr when a CUDA runtime call reports failure.
// `what` names the operation for the diagnostic.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `count` ints on the host and sets each of them to `value`.
// Exits with a message on stderr if the allocation fails.
static int *allocFilledHost(size_t count, int value)
{
    int *buf = (int *)malloc(sizeof(int) * count);
    if (buf == NULL)
    {
        fprintf(stderr, "host allocation of %lu ints failed\n", (unsigned long)count);
        exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < count; ++i)
    {
        buf[i] = value;
    }
    return buf;
}

// Host driver: fills A with ones and B with zeros, runs the `cpt` kernel with
// one thread per row of B, copies the D results back and prints the first one.
// Returns 0 on success; any allocation or CUDA failure terminates the process
// with EXIT_FAILURE. Command-line arguments are not used.
int main(int arc, char *argv[])
{
    (void)arc;
    (void)argv;

    const size_t bytesA = sizeof(int) * (N);
    const size_t bytesB = sizeof(int) * (SIZE);
    const size_t bytesRes = sizeof(int) * (D);

    // host memory and assignment
    int *ha = allocFilledHost(N, 1);
    int *hb = allocFilledHost(SIZE, 0);
    int *hres = allocFilledHost(D, 0);

    // device memory and copy
    int *da = NULL, *db = NULL, *dres = NULL;
    checkCuda(cudaMalloc((void **)&da, bytesA), "cudaMalloc da");
    checkCuda(cudaMalloc((void **)&db, bytesB), "cudaMalloc db");
    checkCuda(cudaMalloc((void **)&dres, bytesRes), "cudaMalloc dres");
    checkCuda(cudaMemcpy(da, ha, bytesA, cudaMemcpyHostToDevice), "copy ha -> da");
    checkCuda(cudaMemcpy(db, hb, bytesB, cudaMemcpyHostToDevice), "copy hb -> db");
    checkCuda(cudaMemcpy(dres, hres, bytesRes, cudaMemcpyHostToDevice), "copy hres -> dres");

    // set threads and launch the kernel: a single block of D threads
    const dim3 grid_size(1);
    const dim3 block_size(D);
    cpt<<<grid_size, block_size>>>(da, db, dres);
    // A launch does not return a status itself: query the launch error
    // explicitly, then synchronize so execution errors are reported here.
    checkCuda(cudaGetLastError(), "cpt launch");
    checkCuda(cudaDeviceSynchronize(), "cpt execution");

    // copy device to host
    checkCuda(cudaMemcpy(hres, dres, bytesRes, cudaMemcpyDeviceToHost), "copy dres -> hres");
    printf("%d\n",hres[0]);

    // free memory
    free(ha);
    free(hb);
    free(hres);
    checkCuda(cudaFree(da), "cudaFree da");
    checkCuda(cudaFree(db), "cudaFree db");
    checkCuda(cudaFree(dres), "cudaFree dres");
    return 0;
}