NVIDIA CUDA 2023 Spring Training Camp (5): CUDA Vector Addition and Matrix Multiplication

Vector Addition with a 1D Grid and 1D Block

Basic implementation

#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;

__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global thread index
    const int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Grid-stride loop: the stride is the total number of threads in the grid
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}

// Verify the result: every element should equal 50 + 20 = 70
void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;

    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }

    printf("%s\n", error ? "Errors" : "Pass");
    printf("Max error: %f\n", maxError);
}

int main() {
    const size_t arraySize = sizeof(double) * N;

    // Allocate page-locked (pinned) host memory
    double *h_x, *h_y, *h_z;
    cudaMallocHost(&h_x, arraySize);
    cudaMallocHost(&h_y, arraySize);
    cudaMallocHost(&h_z, arraySize);

    // Initialize host data
    for (int i = 0; i < N; i++) {
        h_x[i] = 50;
        h_y[i] = 20;
    }

    // Allocate device memory
    double *d_x, *d_y, *d_z;
    cudaMalloc((void **)&d_x, arraySize);
    cudaMalloc((void **)&d_y, arraySize);
    cudaMalloc((void **)&d_z, arraySize);
    
    // Copy input data from host to device
    cudaMemcpy(d_x, h_x, arraySize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, arraySize, cudaMemcpyHostToDevice);

    // Kernel launch configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
    
    // Launch the kernel
    add<<<gridSize, blockSize>>>(d_x, d_y, d_z, N);

    // Copy the result back to the host (this cudaMemcpy also synchronizes with the kernel)
    cudaMemcpy(h_z, d_z, arraySize, cudaMemcpyDeviceToHost);
    
    // Verify the result
    check(h_z, N);

    // Free pinned host memory
    cudaFreeHost(h_x);
    cudaFreeHost(h_y);
    cudaFreeHost(h_z);
    
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    
    return 0;
}
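None of the CUDA runtime calls above check their return codes, so a failure (e.g. an allocation error) would go unnoticed. A common remedy is to wrap every call in a small checking macro; the sketch below is one minimal version (the macro name CHECK is my own, not part of the CUDA API):

#include <stdio.h>
#include <stdlib.h>

// Minimal CUDA error-checking macro (a hypothetical helper, not a CUDA API)
#define CHECK(call)                                                   \
do {                                                                  \
    const cudaError_t err = (call);                                   \
    if (err != cudaSuccess) {                                         \
        printf("CUDA error: %s at %s:%d\n",                           \
               cudaGetErrorString(err), __FILE__, __LINE__);          \
        exit(1);                                                      \
    }                                                                 \
} while (0)

// Usage in main() above, for example:
// CHECK(cudaMallocHost(&h_x, arraySize));
// CHECK(cudaMemcpy(d_x, h_x, arraySize, cudaMemcpyHostToDevice));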

Unified-memory implementation

#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;

__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global thread index
    const int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Grid-stride loop: the stride is the total number of threads in the grid
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}

void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;
    
    // Error check: every element should equal 50 + 20 = 70
    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }
    
    printf("%s\n", error ? "Errors" : "Pass");
    printf("最大误差: %lf\n", maxError);
}

int main() {
    // Allocate unified (managed) memory, accessible from both host and device
    const size_t arraySize = sizeof(double) * N;
    double *x, *y, *z;
    cudaMallocManaged((void**)&x, arraySize);
    cudaMallocManaged((void**)&y, arraySize);
    cudaMallocManaged((void**)&z, arraySize);

    // Initialize data (runs on the host)
    for (int i = 0; i < N; i++) {
        x[i] = 50;
        y[i] = 20;
    }

    // Kernel launch configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
    
    // Launch the kernel
    add<<<gridSize, blockSize>>>(x, y, z, N);

    // Synchronize: kernel launches are asynchronous, so wait before reading z on the host
    cudaDeviceSynchronize();
    
    // Verify the result
    check(z, N);

    // Free unified memory
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);
    
    return 0;
}
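With managed memory, the GPU's first touch of each page triggers a page migration, which can dominate the kernel time. On devices that support it, the data can be migrated explicitly instead. A minimal sketch of how the launch in main() above could be augmented (assuming a device that supports cudaMemPrefetchAsync; error handling omitted):

    int device = 0;
    cudaGetDevice(&device);

    // Migrate inputs and output to the GPU before the kernel runs
    cudaMemPrefetchAsync(x, arraySize, device);
    cudaMemPrefetchAsync(y, arraySize, device);
    cudaMemPrefetchAsync(z, arraySize, device);

    add<<<gridSize, blockSize>>>(x, y, z, N);

    // Migrate the result back to the host before check() reads it
    cudaMemPrefetchAsync(z, arraySize, cudaCpuDeviceId);
    cudaDeviceSynchronize();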

Matrix Multiplication with a 2D Grid and 2D Block

Basic implementation

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

// GPU matrix multiplication: C (m x k) = A (m x n) * B (n x k)
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {

    // Global thread coordinates, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    if (row < m && col < k) {
        for(int i = 0; i < n; i++) {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
} 

// CPU matrix multiplication (reference implementation)
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = sum;
        }
    }
}

// Compare CPU and GPU results; the matrices hold ints, so exact equality applies
void check(int *h_c_cpu, int *h_c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (h_c_gpu[i * k + j] != h_c_cpu[i * k + j]) {
                error = true;
            }
        }
    }
    printf("Check result: %s\n", error ? "Errors" : "Pass");
}

int main() {
    int m = 5;
    int n = 5;
    int k = 5;

    // Allocate unified (managed) memory
    int *a, *b, *c_gpu, *c_cpu;

    cudaMallocManaged((void **) &a, sizeof(int) * m * n);
    cudaMallocManaged((void **) &b, sizeof(int) * n * k);
    cudaMallocManaged((void **) &c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **) &c_cpu, sizeof(int) * m * k);

    // Initialize A (m x n) and B (n x k) with random values
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }

    // Kernel launch configuration: one thread per output element
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   
    // Launch the kernel
    gpu_matrix_mult<<<grid, block>>>(a, b, c_gpu, m, n, k);

    // Synchronize before reading the results on the host
    cudaDeviceSynchronize();

    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }

    // CPU matrix multiplication for reference
    cpu_matrix_mult(a, b, c_cpu, m, n, k);

    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }

    // Compare the results
    check(c_cpu, c_gpu, m, k);

    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);

    return 0;
}
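To compare the GPU kernel against the CPU loop quantitatively, the kernel can be timed with CUDA events. A minimal sketch that could be dropped into main() above (the event variable names are my own):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    gpu_matrix_mult<<<grid, block>>>(a, b, c_gpu, m, n, k);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
    printf("GPU kernel time: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

With m = n = k = 5 the GPU will not beat the CPU; the payoff only appears at sizes large enough to keep the device busy.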

Shared-memory optimized implementation

For matrix multiplication, suppose matrix M is m*n, matrix N is n*k, and the resulting matrix P is m*k.

For example, a single value M(row, col) of matrix M is read from global memory once by every thread in the grid whose index satisfies row = blockIdx.y * blockDim.y + threadIdx.y, i.e. once per column of the output, so it is read k times in total.

To avoid all this repeated traffic, each block can stage the values it needs in shared memory, which is far faster to access than global memory.
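A rough count of the savings (assuming m, n, and k are exact multiples of BLOCK_SIZE): the basic kernel launches m*k threads and each reads n elements of M, so M is loaded m*n*k times from global memory in total. With tiling, each block loads every element of M it needs exactly once, and each element of M is needed by k/BLOCK_SIZE blocks, so M is loaded only m*n*k/BLOCK_SIZE times: a factor of BLOCK_SIZE fewer global reads. The same argument applies to N.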

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

// GPU matrix multiplication using shared memory (tiled)
__global__ void gpu_matrix_mult_shared(int *a, int *b, int *c, int m, int n, int k) {

    // Global thread coordinates, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Allocate shared-memory tiles; each block gets its own copy
    __shared__ int s_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int s_b[BLOCK_SIZE][BLOCK_SIZE];

    int sum = 0;
    // Walk the tiles along the shared dimension n;
    // sub * BLOCK_SIZE is the starting offset of the current tile
    int numTiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int sub = 0; sub < numTiles; sub++) {
        /** index_a: index in matrix a of the element this thread loads
         *  - sub * BLOCK_SIZE + threadIdx.x: column of the element within a
         *  - row * n: offset of all complete rows before this thread's row
         **/
        int index_a = (sub * BLOCK_SIZE + threadIdx.x) + row * n;
        if (row < m && (sub * BLOCK_SIZE + threadIdx.x) < n) {
            s_a[threadIdx.y][threadIdx.x] = a[index_a];
        } else {
            s_a[threadIdx.y][threadIdx.x] = 0;  // zero-pad out-of-range elements
        }
        /** index_b: index in matrix b of the element this thread loads
         *  - col: column of the element within b
         *  - k * (sub * BLOCK_SIZE + threadIdx.y): offset of all complete rows
         *    before the current tile row (b has k columns per row)
         **/
        int index_b = col + k * (sub * BLOCK_SIZE + threadIdx.y);
        if (col < k && (sub * BLOCK_SIZE + threadIdx.y) < n) {
            s_b[threadIdx.y][threadIdx.x] = b[index_b];
        } else {
            s_b[threadIdx.y][threadIdx.x] = 0;  // zero-pad out-of-range elements
        }

        // Barrier: make sure both tiles are fully loaded before anyone reads them
        __syncthreads();

        // Accumulate the partial dot product from the shared tiles
        for (int i = 0; i < BLOCK_SIZE; i++) {
            sum += s_a[threadIdx.y][i] * s_b[i][threadIdx.x];
        }
        // Barrier again before the next iteration overwrites the tiles
        __syncthreads();
    }

    if (row < m && col < k) {
        c[row * k + col] = sum;
    }
} 

// CPU matrix multiplication (reference implementation)
void cpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += a[i * n + h] * b[h * k + j];
            }
            c[i * k + j] = sum;
        }
    }
}

// Compare CPU and GPU results; the matrices hold ints, so exact equality applies
void check(int *c_cpu, int *c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (c_gpu[i * k + j] != c_cpu[i * k + j]) {
                error = true;
            }
        }
    }
    printf("Check result: %s\n", error ? "Errors" : "Pass");
}

int main() {
    int m = 5;
    int n = 5;
    int k = 5;

    // Allocate unified (managed) memory
    int *a, *b, *c_gpu, *c_cpu;

    cudaMallocManaged((void **) &a, sizeof(int) * m * n);
    cudaMallocManaged((void **) &b, sizeof(int) * n * k);
    cudaMallocManaged((void **) &c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **) &c_cpu, sizeof(int) * m * k);

    // Initialize A (m x n) and B (n x k) with random values
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }

    // Kernel launch configuration: one thread per output element
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   
    // Launch the kernel
    gpu_matrix_mult_shared<<<grid, block>>>(a, b, c_gpu, m, n, k);

    // Synchronize before reading the results on the host
    cudaDeviceSynchronize();

    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }

    // CPU matrix multiplication for reference
    cpu_matrix_mult(a, b, c_cpu, m, n, k);

    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }

    // Compare the results
    check(c_cpu, c_gpu, m, k);

    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);

    return 0;
}
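One design point in the kernel above is worth calling out: threads whose (row, col) falls outside the result matrix must not return early, because __syncthreads() is a block-wide barrier that every thread of the block has to reach. That is why out-of-range loads are padded with 0 and the bounds check is applied only to the final write to c.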