CUDA矩阵转置

xiyanjoy

已于 2023-12-21 00:53:19 修改

阅读量324

点赞数

文章标签：矩阵 线性代数 c++

于 2023-08-01 23:55:36 首次发布

本文链接：https://blog.csdn.net/xxt228/article/details/132053098

版权

1 CUDA矩阵转置

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#define BLOCK_SIZE 32

// Tiled matrix transpose staged through shared memory.
//
// `in` is read as an nx-row by ny-column row-major matrix (row stride ny) and
// `out` is written as its ny-row by nx-column transpose (row stride nx):
//     out[y * nx + x] = in[x * ny + y]   for 0 <= x < nx, 0 <= y < ny.
//
// Launch requirements:
//   * 2D thread blocks with blockDim.x == blockDim.y <= BLOCK_SIZE (the tile
//     is read back with threadIdx.x / threadIdx.y swapped, so it must be square);
//   * a grid of at least ceil(nx / blockDim.x) by ceil(ny / blockDim.y) blocks.
// Uses (BLOCK_SIZE + 1) * BLOCK_SIZE ints of static shared memory.
__global__ void transpose(int* out, int* const in, int nx, int ny) {
    // Each tile row is padded by one element so that the column-wise read
    // below does not map a whole warp onto the same shared-memory bank.
    __shared__ int temp[(BLOCK_SIZE + 1) * BLOCK_SIZE];

    // Output coordinates handled by this thread: column x, row y of `out`.
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Input coordinates loaded by this thread: column rx, row ry of `in`.
    // The block indices are swapped relative to (x, y) so that consecutive
    // threadIdx.x values read consecutive input addresses.
    int rx = blockDim.y * blockIdx.y + threadIdx.x;
    int ry = blockDim.x * blockIdx.x + threadIdx.y;

    // Guard the load with the *input* bounds. Threads outside the matrix skip
    // the load but must not return yet: every thread in the block has to
    // reach the barrier below.
    if (rx < ny && ry < nx) {
        temp[threadIdx.y * (BLOCK_SIZE + 1) + threadIdx.x] = in[(size_t)ry * ny + rx];
    }
    __syncthreads();

    // The element needed here was stored by the thread whose (rx, ry) equals
    // this thread's (y, x), so this guard matches exactly the loads performed.
    if (x < nx && y < ny) {
        out[(size_t)y * nx + x] = temp[threadIdx.x * (BLOCK_SIZE + 1) + threadIdx.y];
    }
}

// Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills an nx-by-ny matrix on the host, transposes it on the device with the
// `transpose` kernel, copies the result back and checks every element against
// the host input. Returns 0 when all elements match, -1 otherwise.
int main() {
    const int nx = 1 << 12, ny = 1 << 12;
    // Compute sizes in size_t so the element count cannot overflow int.
    const size_t count = (size_t)nx * ny;
    const size_t bytes = count * sizeof(int);

    int* in_host = (int*)malloc(bytes);
    int* out_host = (int*)malloc(bytes);
    if (in_host == nullptr || out_host == nullptr) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(in_host);
        free(out_host);
        return -1;
    }

    for (size_t i = 0; i < count; i++) {
        in_host[i] = (int)i;
    }

    int* in_device = nullptr;
    int* out_device = nullptr;
    checkCuda(cudaMalloc(&in_device, bytes), "cudaMalloc(in_device)");
    checkCuda(cudaMalloc(&out_device, bytes), "cudaMalloc(out_device)");
    checkCuda(cudaMemcpy(in_device, in_host, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy host-to-device");

    // Round the grid up so that dimensions which are not multiples of
    // BLOCK_SIZE are still fully covered by thread blocks.
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
    const dim3 grid((nx + BLOCK_SIZE - 1) / BLOCK_SIZE,
                    (ny + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
    transpose<<<grid, block>>>(out_device, in_device, nx, ny);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "transpose launch");
    // The blocking copy also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy(out_host, out_device, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy device-to-host");

    // Compare against the host input: out(y, x) must equal in(x, y).
    bool ok = true;
    for (int y = 0; y < ny && ok; y++) {
        for (int x = 0; x < nx; x++) {
            const size_t out_idx = (size_t)y * nx + x;
            const size_t in_idx = (size_t)x * ny + y;
            if (out_host[out_idx] != in_host[in_idx]) {
                printf("Wrong At (%d, %d)!\n", x, y);
                printf("out: %d\n", out_host[out_idx]);
                printf("in: %d\n", in_host[in_idx]);
                ok = false;
                break;
            }
        }
    }
    if (ok) {
        printf("All Correct!\n");
    }

    // Release device and host buffers on both the success and failure paths.
    checkCuda(cudaFree(in_device), "cudaFree(in_device)");
    checkCuda(cudaFree(out_device), "cudaFree(out_device)");
    free(in_host);
    free(out_host);
    return ok ? 0 : -1;
}