#include <iostream>
#include <cuda_runtime.h>
// Side length, in elements, of the square shared-memory tile used by the
// transpose kernels. main() also uses it as the thread-block size in x and y
// and to derive the grid size.
const int TILE_DIM = 32;
// Element type of the matrices on both host and device.
typedef float real;
// Transposes the N x N row-major matrix A into B, staging each
// TILE_DIM x TILE_DIM tile through shared memory.
//
// Launch layout: 2D thread blocks of exactly TILE_DIM x TILE_DIM threads and a
// 2D grid with ceil(N / TILE_DIM) blocks in x and y. blockIdx.x selects the
// tile's column range in A, blockIdx.y its row range.
// Shared memory: TILE_DIM * TILE_DIM elements of `real`, statically allocated.
//
// The tile is NOT padded, so the column-wise read S[threadIdx.x][threadIdx.y]
// strides by TILE_DIM elements between consecutive threadIdx.x values; this is
// the bank-conflicting baseline that transpose2 improves on.
//
// A: input matrix (device pointer, N * N elements, read-only).
// B: output matrix (device pointer, N * N elements); B[c][r] = A[r][c].
// N: matrix dimension; partial edge tiles are handled by the bounds checks.
__global__ void transpose1(const real* A, real* B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM];

    // Origin of this block's tile in A.
    const int tileCol0 = blockIdx.x * TILE_DIM;
    const int tileRow0 = blockIdx.y * TILE_DIM;

    // Load phase: consecutive threadIdx.x values read consecutive elements of a row of A.
    const int srcCol = tileCol0 + threadIdx.x;
    const int srcRow = tileRow0 + threadIdx.y;
    if (srcCol < N && srcRow < N)
    {
        // Widen to size_t before multiplying: srcRow * N overflows a 32-bit
        // int once N exceeds 46340.
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(srcRow) * N + srcCol];
    }

    // Every thread reaches the barrier (it is outside the bounds checks), so
    // the whole tile is populated before any thread reads another thread's element.
    __syncthreads();

    // Store phase: the thread indices swap roles so that consecutive
    // threadIdx.x values write consecutive elements of a row of B.
    const int dstRow = tileCol0 + threadIdx.y;
    const int dstCol = tileRow0 + threadIdx.x;
    if (dstRow < N && dstCol < N)
    {
        // S[threadIdx.x][threadIdx.y] was written by the thread whose
        // (srcCol, srcRow) equals (dstRow, dstCol), which passed the same
        // bounds check, so no uninitialized shared memory is read.
        B[static_cast<size_t>(dstRow) * N + dstCol] = S[threadIdx.x][threadIdx.y];
    }
}
// Transposes the N x N row-major matrix A into B, staging each
// TILE_DIM x TILE_DIM tile through a padded shared-memory buffer.
//
// Launch layout: 2D thread blocks of exactly TILE_DIM x TILE_DIM threads and a
// 2D grid with ceil(N / TILE_DIM) blocks in x and y. blockIdx.x selects the
// tile's column range in A, blockIdx.y its row range.
// Shared memory: TILE_DIM * (TILE_DIM + 1) elements of `real`, statically
// allocated.
//
// The extra column of padding makes the row pitch TILE_DIM + 1 elements, so
// the column-wise read S[threadIdx.x][threadIdx.y] no longer lands every
// thread of a warp in the same shared-memory bank.
//
// A: input matrix (device pointer, N * N elements, read-only).
// B: output matrix (device pointer, N * N elements); B[c][r] = A[r][c].
// N: matrix dimension; partial edge tiles are handled by the bounds checks.
__global__ void transpose2(const real* A, real* B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM + 1];

    // Origin of this block's tile in A.
    const int tileCol0 = blockIdx.x * TILE_DIM;
    const int tileRow0 = blockIdx.y * TILE_DIM;

    // Load phase: consecutive threadIdx.x values read consecutive elements of a row of A.
    const int srcCol = tileCol0 + threadIdx.x;
    const int srcRow = tileRow0 + threadIdx.y;
    if (srcCol < N && srcRow < N)
    {
        // Widen to size_t before multiplying: srcRow * N overflows a 32-bit
        // int once N exceeds 46340.
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(srcRow) * N + srcCol];
    }

    // Every thread reaches the barrier (it is outside the bounds checks), so
    // the whole tile is populated before any thread reads another thread's element.
    __syncthreads();

    // Store phase: the thread indices swap roles so that consecutive
    // threadIdx.x values write consecutive elements of a row of B.
    const int dstRow = tileCol0 + threadIdx.y;
    const int dstCol = tileRow0 + threadIdx.x;
    if (dstRow < N && dstCol < N)
    {
        // S[threadIdx.x][threadIdx.y] was written by the thread whose
        // (srcCol, srcRow) equals (dstRow, dstCol), which passed the same
        // bounds check, so no uninitialized shared memory is read.
        B[static_cast<size_t>(dstRow) * N + dstCol] = S[threadIdx.x][threadIdx.y];
    }
}
// Reports a failed CUDA runtime call on stderr.
// status: return code of the call being checked.
// what:   short label naming the call, used in the diagnostic.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver: fills an N x N matrix, transposes it on the device with
// transpose2, copies the result back and verifies it against the input.
// Returns 0 on success and 1 if any CUDA call fails or the result is wrong.
int main()
{
    const int N = 1024;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(real);

    real* h_A = new real[count];
    real* h_B = new real[count];

    // Initialize matrix A with each element's own linear index, so every
    // element is distinct and a misplaced value is detectable.
    for (size_t i = 0; i < count; ++i)
    {
        h_A[i] = static_cast<real>(i);
    }

    // Start as null so the cleanup below is safe even if an allocation fails.
    real* d_A = 0;
    real* d_B = 0;

    // Short-circuit evaluation stops at the first failing call.
    bool ok = cudaSucceeded(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)")
        && cudaSucceeded(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)")
        && cudaSucceeded(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(host to device)");

    if (ok)
    {
        // One TILE_DIM x TILE_DIM block per tile; ceil-divide so a partial
        // edge tile still gets a block.
        const dim3 dimGrid((N + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM, 1);
        const dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
        transpose2<<<dimGrid, dimBlock>>>(d_A, d_B, N);

        // A launch returns no status itself: cudaGetLastError() reports bad
        // launch configurations, and the blocking copy back reports faults
        // raised while the kernel was executing.
        ok = cudaSucceeded(cudaGetLastError(), "transpose2 launch")
            && cudaSucceeded(cudaMemcpy(h_B, d_B, bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(device to host)");
    }

    // Verify B[col][row] == A[row][col]. The transpose only moves values, so
    // an exact comparison is valid here.
    for (int row = 0; ok && row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            if (h_B[static_cast<size_t>(col) * N + row] != h_A[static_cast<size_t>(row) * N + col])
            {
                std::cerr << "Transpose mismatch at A(" << row << ", " << col << ")" << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Cleanup. Both frees always run; a failure in either is reported and
    // reflected in the exit status.
    delete[] h_A;
    delete[] h_B;
    ok = cudaSucceeded(cudaFree(d_A), "cudaFree(d_A)") && ok;
    ok = cudaSucceeded(cudaFree(d_B), "cudaFree(d_B)") && ok;

    return ok ? 0 : 1;
}
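// CUDA: matrix transpose before and after eliminating shared-memory bank conflicts
// (Source page footer: "latest recommended article published 2024-06-13 23:44:45")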