CUDA:消除bank conflict前后的矩阵转置

#include <iostream>
#include <cuda_runtime.h>

// Edge length, in elements, of the square tile each thread block stages in
// shared memory; the kernels in this file index their tile with it.
constexpr int TILE_DIM = 32;

// Element type of the matrices handled by the kernels and the host driver.
using real = float;

/**
 * Transposes the N x N row-major matrix A into B, staging each tile in
 * shared memory (baseline version, shared tile is NOT padded).
 *
 * The tile is written row-wise (S[threadIdx.y][threadIdx.x]) and read back
 * column-wise (S[threadIdx.x][threadIdx.y]). With TILE_DIM == 32 and a
 * 4-byte element type, consecutive threadIdx.x values read addresses that are
 * 32 words apart, which land in the same shared-memory bank on hardware with
 * 32 four-byte banks, so the read is serialized by bank conflicts.
 *
 * Launch layout: 2D grid with at least ceil(N / TILE_DIM) blocks in x and y,
 * and blocks of TILE_DIM x TILE_DIM threads (the tile is indexed directly by
 * threadIdx, so larger blocks would index outside the shared array).
 *
 * @param A  input matrix, N * N elements, row-major (device pointer)
 * @param B  output matrix, N * N elements, receives B[x][y] = A[y][x]
 * @param N  number of rows and columns; need not be a multiple of TILE_DIM
 */
__global__ void transpose1(const real* A, real* B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;

    // Load phase: thread (x, y) of the block reads element (row ny1, col nx1).
    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        // Widen before multiplying: ny1 * N in 32-bit int overflows as soon
        // as N * N exceeds INT_MAX (N > 46340).
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(ny1) * N + nx1];
    }
    // Barrier is outside the guard so every thread of the block reaches it.
    __syncthreads();

    // Store phase: roles of threadIdx.x / threadIdx.y are swapped so that
    // consecutive threadIdx.x values write consecutive addresses of B.
    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        // This guard matches the load guard of the thread that filled
        // S[threadIdx.x][threadIdx.y], so only written tile slots are read.
        B[static_cast<size_t>(nx2) * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}

/**
 * Transposes the N x N row-major matrix A into B, staging each tile in a
 * padded shared-memory array.
 *
 * The tile is declared TILE_DIM x (TILE_DIM + 1): the extra column changes
 * the row pitch from 32 to 33 words, so the column-wise read
 * S[threadIdx.x][threadIdx.y] touches addresses 33 words apart, which fall in
 * distinct banks on hardware with 32 four-byte banks. The padding column is
 * never read or written.
 *
 * Launch layout: 2D grid with at least ceil(N / TILE_DIM) blocks in x and y,
 * and blocks of TILE_DIM x TILE_DIM threads (the tile is indexed directly by
 * threadIdx, so larger blocks would index outside the shared array).
 *
 * @param A  input matrix, N * N elements, row-major (device pointer)
 * @param B  output matrix, N * N elements, receives B[x][y] = A[y][x]
 * @param N  number of rows and columns; need not be a multiple of TILE_DIM
 */
__global__ void transpose2(const real* A, real* B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM + 1];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;

    // Load phase: thread (x, y) of the block reads element (row ny1, col nx1).
    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        // Widen before multiplying: ny1 * N in 32-bit int overflows as soon
        // as N * N exceeds INT_MAX (N > 46340).
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(ny1) * N + nx1];
    }
    // Barrier is outside the guard so every thread of the block reaches it.
    __syncthreads();

    // Store phase: roles of threadIdx.x / threadIdx.y are swapped so that
    // consecutive threadIdx.x values write consecutive addresses of B.
    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        // This guard matches the load guard of the thread that filled
        // S[threadIdx.x][threadIdx.y], so only written tile slots are read.
        B[static_cast<size_t>(nx2) * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status`
// is cudaSuccess so calls can be chained with &&.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Host driver: fills an N x N matrix, transposes it on the device with
 * transpose2, copies the result back and checks it against the input.
 *
 * @return 0 when every CUDA call succeeded and the result is the exact
 *         transpose of the input, 1 otherwise.
 */
int main()
{
    const int N = 1024;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(real);

    real* h_A = new real[count];
    real* h_B = new real[count];

    // Initialize matrix A: every element holds its own linear index.
    for (size_t i = 0; i < count; ++i) {
        h_A[i] = static_cast<real>(i);
    }

    real* d_A = nullptr;
    real* d_B = nullptr;

    // Short-circuit chain: the first failing call is reported and the
    // remaining device work is skipped, but cleanup below still runs.
    bool ok = cudaSucceeded(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)")
        && cudaSucceeded(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)")
        && cudaSucceeded(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(host -> device)");

    if (ok)
    {
        // Ceil-divide so partial tiles at the matrix edge are still covered.
        dim3 dimGrid((N + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM, 1);
        dim3 dimBlock(TILE_DIM, TILE_DIM, 1);

        transpose2<<<dimGrid, dimBlock>>>(d_A, d_B, N);

        // A kernel launch returns nothing; configuration errors surface via
        // cudaGetLastError(), execution errors at the blocking copy below.
        ok = cudaSucceeded(cudaGetLastError(), "transpose2 launch")
            && cudaSucceeded(cudaMemcpy(h_B, d_B, bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(device -> host)");
    }

    if (ok)
    {
        // The kernel only copies values, so exact equality is the right test.
        for (int row = 0; row < N && ok; ++row)
        {
            for (int col = 0; col < N; ++col)
            {
                const size_t src = static_cast<size_t>(row) * N + col;
                const size_t dst = static_cast<size_t>(col) * N + row;
                if (h_B[dst] != h_A[src])
                {
                    std::cerr << "transpose mismatch at row " << row
                              << ", col " << col << std::endl;
                    ok = false;
                    break;
                }
            }
        }
    }

    // Cleanup (cudaFree accepts a null pointer, so this is safe on any path).
    delete[] h_A;
    delete[] h_B;
    cudaFree(d_A);
    cudaFree(d_B);

    return ok ? 0 : 1;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值