leetgpu

最新推荐文章于 2025-10-20 22:12:13 发布

原创最新推荐文章于 2025-10-20 22:12:13 发布 · 765 阅读

7 ·

CC 4.0 BY-SA版权

文章标签：

#算法

向量相加
1.1 题目
在GPU上实现一个程序，该程序能够对两个包含32位浮点数的向量进行逐元素相加。该程序应接受两个长度相等的输入向量，并生成一个包含它们之和的输出向量。

https://leetgpu.com/challenges/vector-addition
1.2 实施要求

不允许使用外部库
solve函数的签名必须保持不变
最终结果必须存储在向量C中

1.3 例子
[图片]

1.4 参考代码
#include <cuda_runtime.h>

// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
// Expects a 1D launch whose total thread count is at least N; surplus
// threads fall through the bounds check without touching memory.
__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    // Global index of this thread across the 1D grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so the tail must be guarded.
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
// Launches vector_add over N elements and blocks until the kernel has finished.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    // Nothing to add for an empty vector; a grid of zero blocks is not a
    // valid launch configuration.
    if (N <= 0) {
        return;
    }

    int threadsPerBlock = 256;
    // Ceiling division so that every element is covered by some thread.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

矩阵乘法
2.1 题目
编写一个程序，在GPU上将两个32位浮点数矩阵相乘。
[图片]

2.2 例子
[图片]

2.3 代码

#include <cuda_runtime.h>

// Naive matrix product C = A * B with row-major storage:
//   A is M x N, B is N x K, C is M x K.
// Expects a 2D launch in which x covers the K columns and y covers the M rows
// of C; each in-range thread computes one output element.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    // Row and column of the C element owned by this thread.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks in both dimensions.
    if (row < M && col < K) {
        float sum = 0.0f;
        // Dot product of row `row` of A with column `col` of B.
        for (int i = 0; i < N; ++i) {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
// Launches matrix_multiplication_kernel over the M x K output matrix and
// blocks until the kernel has finished.
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // An empty output matrix would produce a zero-sized grid, which is not a
    // valid launch configuration.
    if (M <= 0 || K <= 0) {
        return;
    }

    // 16x16 = 256 threads per block.
    dim3 threadsPerBlock(16, 16);

    // Ceiling division in each dimension so the grid covers all of C:
    // x spans the K columns, y spans the M rows.
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

#include <cuda_runtime.h>

// Naive matrix product C = A * B with row-major storage:
//   A is M x N, B is N x K, C is M x K.
// Expects a 2D launch in which x covers the K columns and y covers the M rows
// of C; each in-range thread computes one output element.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N, int K) {
    int outI = blockIdx.y * blockDim.y + threadIdx.y;  // output row
    int outJ = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    // Threads in the rounded-up part of the grid have nothing to compute.
    if (outI >= M || outJ >= K) return;

    // Float literal: a bare 0.0 would be a double.
    float sum{0.0f};
    // Sum over n of A[outI, n] * B[n, outJ].
    for (int n{0}; n < N; n++) {
        sum += A[outI * N + n] * B[n * K + outJ];
    }
    C[outI * K + outJ] = sum;
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
// Launches matrix_multiplication_kernel over the M x K output matrix and
// blocks until the kernel has finished.
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // An empty output matrix would produce a zero-sized grid, which is not a
    // valid launch configuration.
    if (M <= 0 || K <= 0) {
        return;
    }

    dim3 threadsPerBlock(16, 16);
    // Ceiling division: x spans the K columns of C, y spans its M rows.
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

矩阵转置
3.1 题目

[图片]

3.2 例子
[图片]

3.3 代码
#include <cuda_runtime.h>

// Starter template for the transpose challenge: the body is intentionally
// left empty, so launching this kernel does not write to `output`.
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    // TODO: write input[row * cols + col] to output[col * rows + row].
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
// Launches matrix_transpose_kernel with one thread per element of the
// rows x cols input and blocks until the kernel has finished.
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    // An empty matrix would produce a zero-sized grid, which is not a valid
    // launch configuration.
    if (rows <= 0 || cols <= 0) {
        return;
    }

    dim3 threadsPerBlock(16, 16);
    // Ceiling division: x spans the input columns, y spans the input rows.
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

#include <cuda_runtime.h>

// Transposes a row-major rows x cols matrix into a row-major cols x rows
// matrix: output[col][row] = input[row][col].
// Expects a 2D launch in which x covers the input columns and y covers the
// input rows; each in-range thread moves one element.
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    // Threads in the rounded-up part of the grid have nothing to move.
    if (row >= rows || col >= cols) {
        return;
    }
    // The output has `rows` elements per row, hence the col * rows stride.
    output[col * rows + row] = input[row * cols + col];
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
// Launches matrix_transpose_kernel with one thread per element of the
// rows x cols input and blocks until the kernel has finished.
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    // An empty matrix would produce a zero-sized grid, which is not a valid
    // launch configuration.
    if (rows <= 0 || cols <= 0) {
        return;
    }

    dim3 threadsPerBlock(16, 16);
    // Ceiling division: x spans the input columns, y spans the input rows.
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

颜色反转
4.1 题目
[图片]

4.2 例子
[图片]

4.3 代码
#include <cuda_runtime.h>

// Inverts an image in place, treating it as 4 bytes per pixel: the first
// three bytes of each pixel are replaced by 255 - value and the fourth byte
// is left untouched.
// Expects a 1D launch with at least width * height threads (one per pixel).
__global__ void invert_kernel(unsigned char* image, int width, int height) {
    // One thread per pixel.
    int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel < width * height) {
        // Byte offset of this pixel's first channel.
        int base = pixel * 4;
        image[base] = 255 - image[base];
        image[base + 1] = 255 - image[base + 1];
        image[base + 2] = 255 - image[base + 2];
    }
}
// image is a device pointer (i.e. a pointer to memory on the GPU); the image
// is inverted in place.
// Launches invert_kernel with one thread per pixel and blocks until the
// kernel has finished.
extern "C" void solve(unsigned char* image, int width, int height) {
    // An empty image would produce a zero-block grid, which is not a valid
    // launch configuration.
    if (width <= 0 || height <= 0) {
        return;
    }

    int threadsPerBlock = 256;
    // Ceiling division so that every pixel is covered by some thread.
    int blocksPerGrid = (width * height + threadsPerBlock - 1) / threadsPerBlock;

    invert_kernel<<<blocksPerGrid, threadsPerBlock>>>(image, width, height);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

1D卷积
5.1 题目
[图片]

5.2 例子
[图片]

5.3 代码
#include <cuda_runtime.h>

#define MAX_KERNEL_SIZE 2048
#define BLOCK_SIZE 256

// Filter coefficients in constant memory; the host must fill the first
// kernel_size entries (e.g. with cudaMemcpyToSymbol) before launching.
__constant__ float kernel_constant[MAX_KERNEL_SIZE];

// "Valid" 1D convolution (no padding, no kernel flip):
//   output[x] = sum_{i < kernel_size} input[x + i] * kernel_constant[i]
// for 0 <= x <= input_size - kernel_size.
//
// Launch requirements:
//   - 1D grid with at least input_size - kernel_size + 1 threads in total;
//   - dynamic shared memory of (blockDim.x + kernel_size - 1) * sizeof(float);
//   - kernel_constant already holds the kernel_size coefficients.
// The `kernel` argument is not read here; coefficients come from
// kernel_constant.
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    int tx = threadIdx.x;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    extern __shared__ float input_cache[];

    // Stage this block's input window (block span plus a kernel_size - 1 halo)
    // in shared memory; the threads of the block stride over the tile.
    int start_pos = blockIdx.x * blockDim.x;
    int tile_size = blockDim.x + kernel_size - 1;
    for (int i = tx; i < tile_size; i += blockDim.x) {
        int pos = start_pos + i;
        // Slots past the end of the input are never used by a valid output,
        // but are zero-filled so the tile holds no uninitialized values.
        input_cache[i] = (pos < input_size) ? input[pos] : 0.0f;
    }
    // Every thread reaches the barrier before any thread may exit below.
    __syncthreads();

    // Threads beyond the last valid output position have nothing to write.
    if (x > input_size - kernel_size) {
        return;
    }
    float res = 0.0f;
    for (int i = 0; i < kernel_size; ++i) {
        res += input_cache[tx + i] * kernel_constant[i];
    }
    output[x] = res;
}

// input, kernel, output are device pointers (i.e. pointers to memory on the GPU)
// Copies the filter into constant memory, launches convolution_1d_kernel over
// the input_size - kernel_size + 1 outputs, and blocks until it has finished.
// Returns without writing `output` when there is no valid output position or
// when the filter does not fit in kernel_constant.
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size, int kernel_size) {
    int output_size = input_size - kernel_size + 1;
    // No valid output position (or an empty filter): nothing to launch, and a
    // zero-block grid is not a valid launch configuration.
    if (kernel_size <= 0 || output_size <= 0) {
        return;
    }
    // Refuse filters larger than the constant-memory buffer rather than
    // copying past its end.
    size_t kernel_bytes = static_cast<size_t>(kernel_size) * sizeof(float);
    if (kernel_bytes > sizeof(kernel_constant)) {
        return;
    }

    int threadsPerBlock = BLOCK_SIZE;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;

    // `kernel` is a device pointer, so the copy kind must be device-to-device
    // (the default kind is host-to-device).
    cudaMemcpyToSymbol(kernel_constant, kernel, kernel_bytes, 0, cudaMemcpyDeviceToDevice);

    // Each block stages its own span of the input plus a kernel_size - 1 halo.
    size_t shared_size = (threadsPerBlock + kernel_size - 1) * sizeof(float);

    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock, shared_size>>>(input, kernel, output, input_size, kernel_size);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}

数组反转
6.1 题目
[图片]

6.2 代码
#include <cuda_runtime.h>

// Reverses `input` in place: thread idx swaps input[idx] with
// input[N - 1 - idx].
// Expects a 1D launch with at least N / 2 threads in total. Each pair of
// positions is touched by exactly one thread, so no synchronization is needed.
__global__ void reverse_array(float* input, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Only the first half performs swaps; for odd N the middle element stays
    // where it is.
    int half_N = N / 2;
    if (idx >= half_N) return;

    int mirror = N - idx - 1;
    float tmp = input[idx];
    input[idx] = input[mirror];
    input[mirror] = tmp;
}

// input is device pointer
// Reverses the N-element array in place by launching reverse_array with one
// thread per swap, then blocks until the kernel has finished.
extern "C" void solve(float* input, int N) {
    // Arrays with fewer than two elements are already reversed; this also
    // avoids launching a zero-block grid when N == 0.
    if (N < 2) {
        return;
    }

    int threadsPerBlock = 256;
    // Enough threads to cover the first half of the array (rounded up).
    int blocksPerGrid = ((N + 1) / 2 + threadsPerBlock - 1) / threadsPerBlock;

    reverse_array<<<blocksPerGrid, threadsPerBlock>>>(input, N);
    // Wait for the kernel to complete (the returned status is not inspected).
    cudaDeviceSynchronize();
}