CUDA实现矩阵相乘

最新推荐文章于 2024-08-29 11:54:24 发布

武乐乐~

最新推荐文章于 2024-08-29 11:54:24 发布

阅读量2.1k

点赞数 1

分类专栏： CUDA编程

本文链接：https://blog.csdn.net/wulele2/article/details/119007645

版权

c++ cuda

CUDA编程专栏收录该内容

5 篇文章 0 订阅

订阅专栏

文章目录

前言
1、简单思路
分析
2、优化
总结

前言

本文主要借助CUDA实现矩阵相乘。

1、简单思路

#include <stdio.h>

// Launch geometry and matrix dimensions.
// BLOCK_NUM  : number of thread blocks along one grid dimension.
// THREAD_NUM : number of threads along one block dimension.
// R_SIZE     : rows (== columns) of each square matrix.
// M_SIZE     : total element count of one matrix.
// The derived macros are fully parenthesized so they expand safely inside
// larger expressions such as `x / R_SIZE` or `x % M_SIZE`.
#define BLOCK_NUM  8
#define THREAD_NUM  32
#define R_SIZE ((BLOCK_NUM) * (THREAD_NUM))
#define M_SIZE ((R_SIZE) * (R_SIZE))

void __global__ matmul1(int *da, int *db, int *dres);

/*
 * Row-per-thread matrix multiply: dres += da * db.
 *
 * All three matrices are R_SIZE x R_SIZE, stored row-major in flat arrays.
 * Expects a 1D launch; the global thread index selects the result row that
 * the thread computes. Threads whose index is >= R_SIZE do nothing, so the
 * launch may contain more threads than rows.
 *
 * The product is ADDED to the existing contents of dres, so the caller must
 * zero dres beforehand to obtain a plain product.
 *
 *   da   - left operand  (read)
 *   db   - right operand (read)
 *   dres - result matrix (read and written)
 */
void __global__ matmul1(int *da, int *db, int *dres)
{
    const int n = (R_SIZE);

    // Global thread index == row of the result matrix handled by this thread.
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= n)
        return; // guard against launches with more threads than rows

    for (int col = 0; col < n; ++col)
    {
        // Accumulate the dot product in a local variable instead of doing a
        // read-modify-write on dres for every term.
        int acc = 0;
        for (int k = 0; k < n; ++k)
            acc += da[row * n + k] * db[k * n + col];

        dres[row * n + col] += acc;
    }
}


// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` signals an error, false on cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

/*
 * Multiplies two R_SIZE x R_SIZE all-ones matrices on the GPU with matmul1
 * and prints element [0] of the result (each element of the product of two
 * all-ones matrices equals R_SIZE).
 *
 * Returns 0 on success and 1 if any host allocation or CUDA call fails.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int count = (M_SIZE);
    const size_t bytes = sizeof(int) * count;
    int exit_code = 1;

    // Host buffers.
    int *ha = (int *)malloc(bytes);
    int *hb = (int *)malloc(bytes);
    int *hres = (int *)malloc(bytes);

    // Device buffers; kept NULL until allocated so cleanup is always safe.
    int *da = NULL, *db = NULL, *dres = NULL;

    if (ha == NULL || hb == NULL || hres == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
    }
    else
    {
        // Inputs are all ones; the result starts at zero because the kernel
        // accumulates into it.
        for (int i = 0; i < count; ++i)
        {
            ha[i] = 1;
            hb[i] = 1;
            hres[i] = 0;
        }

        // Allocate device memory and upload all three matrices. The chain
        // short-circuits at the first failing call.
        bool ok = !cudaFailed(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
               && !cudaFailed(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
               && !cudaFailed(cudaMalloc((void **)&dres, bytes), "cudaMalloc(dres)")
               && !cudaFailed(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(ha -> da)")
               && !cudaFailed(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(hb -> db)")
               && !cudaFailed(cudaMemcpy(dres, hres, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(hres -> dres)");

        if (ok)
        {
            // One thread per result row: BLOCK_NUM blocks of THREAD_NUM threads.
            matmul1<<<BLOCK_NUM, THREAD_NUM>>>(da, db, dres);

            // A launch does not return an error itself; query it explicitly.
            // The copy back then reports any error raised while the kernel ran.
            ok = !cudaFailed(cudaGetLastError(), "matmul1 launch")
              && !cudaFailed(cudaMemcpy(hres, dres, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dres -> hres)");
        }

        if (ok)
        {
            // Print one element as a quick check.
            printf("%d\n", hres[0]);
            exit_code = 0;
        }
    }

    // Release everything; free(NULL) and cudaFree(NULL) are no-ops.
    free(ha);
    free(hb);
    free(hres);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dres);

    return exit_code;
}

分析

首先定义了256个线程（8个线程块 × 每块32个线程），线程数量和矩阵的行数相等。在核函数中，变量tid获取到了每一个线程的全局ID，即[0~255]，对应结果矩阵的256行，即一个线程需要计算结果矩阵中一整行的元素。以tid = 0为例，再分析核函数中的两重循环：外层循环遍历结果矩阵的列c，内层循环将da矩阵第tid行的元素与db矩阵第c列的元素逐个相乘并累加求和，得到结果矩阵(tid, c)位置的值。
下面介绍矩阵乘法的优化：通过合理的线程安排（一个线程只计算结果矩阵的一个元素），去掉一层for循环。

2、优化

#include <stdio.h>

// Launch geometry and matrix dimensions.
// BLOCK_NUM  : number of thread blocks along one grid dimension.
// THREAD_NUM : number of threads along one block dimension.
// R_SIZE     : rows (== columns) of each square matrix.
// M_SIZE     : total element count of one matrix.
// The derived macros are fully parenthesized so they expand safely inside
// larger expressions such as `x / R_SIZE` or `x % M_SIZE`.
#define BLOCK_NUM  8
#define THREAD_NUM  32
#define R_SIZE ((BLOCK_NUM) * (THREAD_NUM))
#define M_SIZE ((R_SIZE) * (R_SIZE))

void __global__ matmul2(int *da, int *db, int *dres);

/*
 * Element-per-thread matrix multiply: dres += da * db.
 *
 * All three matrices are R_SIZE x R_SIZE, stored row-major in flat arrays.
 * Expects a 2D launch: the global y index selects the result row and the
 * global x index selects the result column. Threads that fall outside the
 * matrix do nothing.
 *
 * The product is ADDED to the existing contents of dres, so the caller must
 * zero dres beforehand to obtain a plain product.
 *
 *   da   - left operand  (read)
 *   db   - right operand (read)
 *   dres - result matrix (read and written)
 */
void __global__ matmul2(int *da, int *db, int *dres)
{
    const int n = (R_SIZE);

    // Global 2D thread index (row, col) == coordinates of one result element.
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= n || col >= n)
        return; // guard against launches larger than the matrix

    // Dot product of row `row` of da with column `col` of db.
    // Element (k, col) of row-major db lives at k * n + col; the row stride
    // is the matrix width n, not the thread's own row index.
    int acc = 0;
    for (int k = 0; k < n; ++k)
        acc += da[row * n + k] * db[k * n + col];

    dres[row * n + col] += acc;
}


// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` signals an error, false on cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

/*
 * Multiplies two R_SIZE x R_SIZE all-ones matrices on the GPU with matmul2
 * and prints element [0] of the result (each element of the product of two
 * all-ones matrices equals R_SIZE).
 *
 * Returns 0 on success and 1 if any host allocation or CUDA call fails.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int count = (M_SIZE);
    const size_t bytes = sizeof(int) * count;
    int exit_code = 1;

    // Host buffers.
    int *ha = (int *)malloc(bytes);
    int *hb = (int *)malloc(bytes);
    int *hres = (int *)malloc(bytes);

    // Device buffers; kept NULL until allocated so cleanup is always safe.
    int *da = NULL, *db = NULL, *dres = NULL;

    if (ha == NULL || hb == NULL || hres == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
    }
    else
    {
        // Inputs are all ones; the result starts at zero because the kernel
        // accumulates into it.
        for (int i = 0; i < count; ++i)
        {
            ha[i] = 1;
            hb[i] = 1;
            hres[i] = 0;
        }

        // Allocate device memory and upload all three matrices. The chain
        // short-circuits at the first failing call.
        bool ok = !cudaFailed(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
               && !cudaFailed(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
               && !cudaFailed(cudaMalloc((void **)&dres, bytes), "cudaMalloc(dres)")
               && !cudaFailed(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(ha -> da)")
               && !cudaFailed(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(hb -> db)")
               && !cudaFailed(cudaMemcpy(dres, hres, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(hres -> dres)");

        if (ok)
        {
            // One thread per result element: a BLOCK_NUM x BLOCK_NUM grid of
            // THREAD_NUM x THREAD_NUM blocks covers the R_SIZE x R_SIZE matrix.
            const dim3 grid_size(BLOCK_NUM, BLOCK_NUM);
            const dim3 block_size(THREAD_NUM, THREAD_NUM);

            matmul2<<<grid_size, block_size>>>(da, db, dres);

            // A launch does not return an error itself; query it explicitly
            // (e.g. a block size the device cannot run is reported here).
            // The copy back then reports any error raised while the kernel ran.
            ok = !cudaFailed(cudaGetLastError(), "matmul2 launch")
              && !cudaFailed(cudaMemcpy(hres, dres, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dres -> hres)");
        }

        if (ok)
        {
            // Print one element as a quick check.
            printf("%d\n", hres[0]);
            exit_code = 0;
        }
    }

    // Release everything; free(NULL) and cudaFree(NULL) are no-ops.
    free(ha);
    free(hb);
    free(hres);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dres);

    return exit_code;
}