CUDA矩阵转置

1 CUDA矩阵转置

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Tile edge length; the kernel below expects to be launched with
// BLOCK_SIZE x BLOCK_SIZE threads per block.
#define BLOCK_SIZE 32

/**
 * Tiled matrix transpose staged through shared memory.
 *
 * The input is read as nx rows of ny ints (row-major) and the output is
 * written as ny rows of nx ints, so that out[y * nx + x] == in[x * ny + y].
 *
 * Launch requirements:
 *   - blockDim == (BLOCK_SIZE, BLOCK_SIZE, 1)
 *   - gridDim covers the output: at least ceil(nx / BLOCK_SIZE) blocks in x
 *     and ceil(ny / BLOCK_SIZE) blocks in y. nx and ny do not have to be
 *     multiples of BLOCK_SIZE, and they do not have to be equal.
 *   - uses (BLOCK_SIZE + 1) * BLOCK_SIZE ints of static shared memory.
 *   - in and out must not overlap: blocks write out while other blocks are
 *     still reading in.
 *
 * @param out device buffer receiving the ny x nx result
 * @param in  device buffer holding the nx x ny input (read only)
 * @param nx  number of input rows / output columns
 * @param ny  number of input columns / output rows
 */
__global__ void transpose(int* out, const int* in, int nx, int ny) {
    // Rows are BLOCK_SIZE + 1 wide so that the column-wise read at the end
    // walks addresses 33 ints apart and therefore does not pile onto a single
    // shared-memory bank.
    __shared__ int temp[(BLOCK_SIZE + 1) * BLOCK_SIZE];

    // Output element written by this thread.
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Input element staged by this thread. threadIdx.x runs along the input's
    // contiguous dimension so neighbouring threads read neighbouring ints.
    int rx = blockDim.y * blockIdx.y + threadIdx.x;  // input column, valid in [0, ny)
    int ry = blockDim.x * blockIdx.x + threadIdx.y;  // input row,    valid in [0, nx)

    // Guard the load on the *input* coordinates. No thread may return before
    // the barrier: every thread of the block has to reach __syncthreads().
    if (ry < nx && rx < ny) {
        temp[threadIdx.y * (BLOCK_SIZE + 1) + threadIdx.x] = in[(size_t)ry * ny + rx];
    }
    __syncthreads();

    // The slot read here was filled by the thread with swapped threadIdx,
    // whose (ry, rx) equals this thread's (x, y); so it is populated exactly
    // when this store is in range.
    if (x < nx && y < ny) {
        out[(size_t)y * nx + x] = temp[threadIdx.x * (BLOCK_SIZE + 1) + threadIdx.y];
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Self-test driver: fills an nx x ny matrix with 0..nx*ny-1, transposes it on
 * the device and compares every element against the host input.
 *
 * Returns 0 and prints "All Correct!" on success; returns -1 after printing
 * the first mismatch, or after reporting an allocation / CUDA failure.
 * All host and device buffers are released on every path.
 */
int main() {
    int nx = 1 << 12, ny = 1 << 12;
    const size_t bytes = sizeof(int) * nx * ny;

    int* in_host = (int*)malloc(bytes);
    int* out_host = (int*)malloc(bytes);
    int* in_device = nullptr;
    int* out_device = nullptr;

    bool ok = (in_host != nullptr && out_host != nullptr);
    if (!ok) {
        fprintf(stderr, "Host allocation failed\n");
    }

    if (ok) {
        for (int i = 0; i < nx * ny; i++) {
            in_host[i] = i;
        }
    }

    // Each step only runs if everything before it succeeded (&& short-circuits).
    ok = ok && cudaOk(cudaMalloc(&in_device, bytes), "cudaMalloc(in_device)");
    ok = ok && cudaOk(cudaMalloc(&out_device, bytes), "cudaMalloc(out_device)");
    ok = ok && cudaOk(cudaMemcpy(in_device, in_host, bytes, cudaMemcpyHostToDevice),
                      "host-to-device copy");

    if (ok) {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
        // Ceil-division; for the sizes used here this equals nx / BLOCK_SIZE.
        dim3 grid((nx + BLOCK_SIZE - 1) / BLOCK_SIZE, (ny + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
        transpose<<<grid, block>>>(out_device, in_device, nx, ny);
        // A launch does not return a status; bad configurations show up here.
        ok = cudaOk(cudaGetLastError(), "transpose launch");
    }

    // Blocking copy: also waits for the kernel and reports its execution errors.
    ok = ok && cudaOk(cudaMemcpy(out_host, out_device, bytes, cudaMemcpyDeviceToHost),
                      "device-to-host copy");

    for (int y = 0; ok && y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out_host[y*nx + x] != in_host[x*ny + y]) {
                printf("Wrong At (%d, %d)!\n", x, y);
                printf("out: %d\n", out_host[y*nx + x]);
                printf("in: %d\n", in_host[x*ny + y]);
                ok = false;  // stop at the first mismatch, then fall through to cleanup
                break;
            }
        }
    }

    // cudaFree / free accept null pointers, so this is safe after a partial setup.
    cudaFree(in_device);
    cudaFree(out_device);
    free(in_host);
    free(out_host);

    if (!ok) {
        return -1;
    }
    printf("All Correct!\n");
    return 0;
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值