CUDA：二维grid和block

最新推荐文章于 2023-10-20 11:35:19 发布

心瘾こころ

最新推荐文章于 2023-10-20 11:35:19 发布

阅读量137

点赞数

文章标签：算法、前端

本文链接：https://blog.csdn.net/weixin_51954774/article/details/132911171

版权

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

constexpr int TILE_DIM = 32;  // Edge length of the square thread block (32 x 32 threads).

// Matrix copy kernel: every thread copies at most one element of A into B.
//
// Launch layout: a 2D grid of 2D blocks. The x dimension selects the column
// (nx) and the y dimension selects the row (ny). Elements are addressed as
// ny * width + nx. Threads whose (nx, ny) falls outside width x height do
// nothing, so the grid may overshoot the matrix.
//
// Parameters:
//   A      - source elements (read only), at least width * height floats
//   B      - destination elements, at least width * height floats
//   width  - number of columns
//   height - number of rows
__global__ void matrixCopyKernel(const float* A, float* B, int width, int height) {
    // Use the actual launch dimensions instead of a compile-time tile size so
    // the coordinates stay correct for whatever block shape the caller picks.
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;

    if (nx < width && ny < height) {
        // Widen before multiplying: ny * width does not fit in a 32-bit int
        // once the matrix holds more than 2^31 elements.
        const size_t index = static_cast<size_t>(ny) * static_cast<size_t>(width) +
                             static_cast<size_t>(nx);
        B[index] = A[index];
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a 1024 x 1024 float matrix through the GPU with matrixCopyKernel and
// checks that the result matches the input.
//
// Returns 0 when every step succeeded and the copy is exact, 1 otherwise.
// Nothing is printed on success; failures are reported on stderr.
int main() {
    const int width = 1024;
    const int height = 1024;

    // Keep element and byte counts in size_t so they cannot overflow an int.
    const size_t count = static_cast<size_t>(width) * static_cast<size_t>(height);
    const size_t size = count * sizeof(float);

    // Allocate host memory.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    // Device pointers start out null so the cleanup below is valid on every path.
    float* d_A = nullptr;
    float* d_B = nullptr;

    bool ok = (h_A != nullptr) && (h_B != nullptr);
    if (!ok) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    }

    if (ok) {
        // Initialize the input data.
        for (size_t i = 0; i < count; i++) {
            h_A[i] = (float)i;
        }

        // Allocate device memory and copy the input from host to device.
        // The && chain stops at the first failing call.
        ok = cudaSucceeded(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)") &&
             cudaSucceeded(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)") &&
             cudaSucceeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                           "host-to-device copy");
    }

    if (ok) {
        // Define the block and grid dimensions; the ceiling division makes the
        // grid cover the whole matrix even when the sizes are not multiples
        // of the block size.
        dim3 block(TILE_DIM, TILE_DIM);
        dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

        // Launch the kernel.
        matrixCopyKernel<<<grid, block>>>(d_A, d_B, width, height);

        // A launch returns no status itself: query it explicitly, then copy
        // the result back. The blocking copy also reports errors raised while
        // the kernel was executing.
        ok = cudaSucceeded(cudaGetLastError(), "kernel launch") &&
             cudaSucceeded(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost),
                           "device-to-host copy");
    }

    if (ok) {
        // A plain copy must reproduce the input bit for bit.
        for (size_t i = 0; i < count; i++) {
            if (h_B[i] != h_A[i]) {
                fprintf(stderr, "Mismatch at element %zu: expected %f, got %f\n",
                        i, h_A[i], h_B[i]);
                ok = false;
                break;
            }
        }
    }

    // Clean up; free() and cudaFree() both accept null pointers.
    free(h_A);
    free(h_B);
    cudaFree(d_A);
    cudaFree(d_B);

    return ok ? 0 : 1;
}