#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// Edge length, in threads, of the square thread block (32 x 32 threads per block).
constexpr int TILE_DIM = 32;
// Matrix copy kernel: element-wise copy of a row-major matrix,
// B[y * width + x] = A[y * width + x].
//
// Launch layout: 2D grid of 2D blocks. Each thread copies at most one element,
// so the launch must satisfy gridDim.x * blockDim.x >= width and
// gridDim.y * blockDim.y >= height; surplus threads do nothing.
//
// Parameters:
//   A      - source matrix, width * height floats, readable from device code
//   B      - destination matrix, width * height floats, writable from device code
//   width  - number of columns (elements per row)
//   height - number of rows
__global__ void matrixCopyKernel(const float* A, float* B, int width, int height) {
    // Use the actual launch dimensions instead of a compile-time tile size so
    // the kernel stays correct for any block shape, not only 32x32.
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    // The grid rarely divides the matrix evenly; skip threads past the edges.
    if (nx < width && ny < height) {
        // Widen before multiplying so ny * width cannot overflow 32-bit int.
        const size_t index = (size_t)ny * width + nx;
        B[index] = A[index];
    }
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a 1024 x 1024 float matrix host -> device, duplicates it on the device
// with matrixCopyKernel, copies the result back and checks it against the input.
// Returns 0 when every step succeeds and the copy matches, 1 otherwise.
int main() {
    const int width = 1024;
    const int height = 1024;
    // size_t keeps the element and byte counts from overflowing int.
    const size_t count = (size_t)width * height;
    const size_t size = count * sizeof(float);

    float* d_A = NULL;
    float* d_B = NULL;

    // Allocate memory on the host.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);

    // `ok` gates every later step, so the first failure skips the remaining
    // work while still reaching the cleanup code at the end.
    bool ok = (h_A != NULL && h_B != NULL);
    if (!ok) {
        fprintf(stderr, "host allocation of %llu bytes failed\n", (unsigned long long)size);
    }

    // Initialize the input data.
    if (ok) {
        for (size_t i = 0; i < count; i++) {
            h_A[i] = (float)i;
        }
    }

    // Allocate memory on the device.
    ok = ok && cudaSucceeded(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");

    // Copy the data from host to device.
    ok = ok && cudaSucceeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                             "host-to-device copy");

    if (ok) {
        // Define the block and grid dimensions (ceil-div so the grid covers the matrix).
        dim3 block(TILE_DIM, TILE_DIM);
        dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
        // Launch the kernel.
        matrixCopyKernel<<<grid, block>>>(d_A, d_B, width, height);
        // A launch returns no status itself; configuration errors surface here.
        ok = cudaSucceeded(cudaGetLastError(), "matrixCopyKernel launch");
    }

    // Copy the data from device to host. This blocking copy also reports
    // errors raised while the kernel was executing.
    ok = ok && cudaSucceeded(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost),
                             "device-to-host copy");

    // Verify the result: a plain copy must reproduce the input exactly.
    if (ok) {
        for (size_t i = 0; i < count; i++) {
            if (h_B[i] != h_A[i]) {
                fprintf(stderr, "mismatch at element %llu: expected %f, got %f\n",
                        (unsigned long long)i, h_A[i], h_B[i]);
                ok = false;
                break;
            }
        }
    }

    // Clean up. free(NULL) and cudaFree(NULL) are no-ops, so this is safe
    // even when an earlier allocation failed.
    free(h_A);
    free(h_B);
    cudaFree(d_A);
    cudaFree(d_B);
    return ok ? 0 : 1;
}
// CUDA: 2D grid and block
// (Source page note: latest recommended article published 2023-10-20 11:35:19)