// 1. CUDA matrix transpose
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#define BLOCK_SIZE 32
/**
 * Tiled matrix transpose staged through padded shared memory.
 *
 * Data layout (row-major, matching the host-side verification):
 *   in  : nx rows of ny elements each (row stride ny)
 *   out : ny rows of nx elements each (row stride nx)
 * After the kernel, out[y * nx + x] == in[x * ny + y] for 0 <= x < nx, 0 <= y < ny.
 *
 * Launch requirements:
 *   - square thread block with blockDim.x == blockDim.y <= BLOCK_SIZE
 *   - grid of at least ceil(nx / blockDim.x) x ceil(ny / blockDim.y) blocks
 *   - nx and ny need not be multiples of the block size
 *   - `out` and `in` must not overlap
 *
 * Shared memory: BLOCK_SIZE * (BLOCK_SIZE + 1) ints, statically allocated.
 */
__global__ void transpose(int* out, int* const in, int nx, int ny) {
    // The extra column of padding keeps the column-wise reads below from
    // all landing in the same shared-memory bank.
    __shared__ int tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Coordinates of the output element this thread stores.
    const int x = blockDim.x * blockIdx.x + threadIdx.x;  // output column, [0, nx)
    const int y = blockDim.y * blockIdx.y + threadIdx.y;  // output row,    [0, ny)

    // Coordinates of the input element this thread loads. The block indices
    // are swapped so that threadIdx.x walks along an input row.
    const int rx = blockDim.y * blockIdx.y + threadIdx.x;  // input column, [0, ny)
    const int ry = blockDim.x * blockIdx.x + threadIdx.y;  // input row,    [0, nx)

    // The load is guarded by the *input* bounds, which differ from the output
    // bounds on partial edge tiles.
    if (rx < ny && ry < nx) {
        tile[threadIdx.y][threadIdx.x] = in[static_cast<size_t>(ry) * ny + rx];
    }

    // Every thread of the block must reach the barrier, so no thread returns
    // before this point even if it has nothing to load or store.
    __syncthreads();

    // Each in-range output element reads the tile entry written by the thread
    // with swapped (threadIdx.x, threadIdx.y), which passed the load guard
    // exactly when this thread passes the store guard.
    if (x < nx && y < ny) {
        out[static_cast<size_t>(y) * nx + x] = tile[threadIdx.x][threadIdx.y];
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

/**
 * Transposes a 4096 x 4096 int matrix on the GPU and verifies the result on
 * the host.
 *
 * Returns 0 when every element matches, -1 on the first mismatch (after
 * printing its location and values). Exits with EXIT_FAILURE if any CUDA
 * runtime call fails.
 */
int main() {
    const int nx = 1 << 12, ny = 1 << 12;
    const size_t count = static_cast<size_t>(nx) * ny;
    const size_t bytes = count * sizeof(int);

    // Host buffers are owned by vectors so they are released on every
    // return path.
    std::vector<int> in_host(count);
    std::vector<int> out_host(count);
    for (size_t i = 0; i < count; i++) {
        in_host[i] = static_cast<int>(i);
    }

    int* in_device = nullptr;
    int* out_device = nullptr;
    checkCuda(cudaMalloc(&in_device, bytes), "cudaMalloc(in_device)");
    checkCuda(cudaMalloc(&out_device, bytes), "cudaMalloc(out_device)");
    checkCuda(cudaMemcpy(in_device, in_host.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    // Ceil-division so that edge tiles are still covered if the sizes are
    // changed to something that is not a multiple of BLOCK_SIZE.
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
    const dim3 grid((nx + BLOCK_SIZE - 1) / BLOCK_SIZE,
                    (ny + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
    transpose<<<grid, block>>>(out_device, in_device, nx, ny);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "transpose kernel launch");

    // This blocking copy also waits for the kernel, so errors raised while
    // the kernel was running are reported here.
    checkCuda(cudaMemcpy(out_host.data(), out_device, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    // Device memory is no longer needed once the result is on the host;
    // releasing it here covers both the success and the mismatch path.
    checkCuda(cudaFree(in_device), "cudaFree(in_device)");
    checkCuda(cudaFree(out_device), "cudaFree(out_device)");

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const int got = out_host[static_cast<size_t>(y) * nx + x];
            const int want = in_host[static_cast<size_t>(x) * ny + y];
            if (got != want) {
                printf("Wrong At (%d, %d)!\n", x, y);
                printf("out: %d\n", got);
                printf("in: %d\n", want);
                return -1;
            }
        }
    }
    printf("All Correct!\n");
    return 0;
}