cuda编程：怎么用2维网格2维块计算矩阵加法

小马敲马

于 2024-07-25 22:07:31 发布

阅读量838

点赞数 16

分类专栏： cuda编程指南---官方英文文档版文章标签：矩阵线性代数

本文链接：https://blog.csdn.net/buuliuda/article/details/140701117

版权

cuda编程指南---官方英文文档版专栏收录该内容

6 篇文章 0 订阅

订阅专栏

网格、块、线程的索引计算（二维）
代码

网格、块、线程的索引计算（二维）

在二维网格、二维线程块的组织方式下，一个线程在整个网格中的全局索引是 ix = threadIdx.x + blockIdx.x * blockDim.x 和 iy = threadIdx.y + blockIdx.y * blockDim.y；对于按行存储、每行 nx 个元素的矩阵，该线程对应的一维下标为 idx = iy * nx + ix。

代码

/******************************************************************
 * Author: Da Liu
 * Date: 2024-07-25
 * File: grid2D_block2D.cu
 * Description: 组织线程模型：二维网格二维线程块计算二维矩阵加法.
 *****************************************************************/

#include <stdio.h>
#include "../cudalearn/tools/common.cuh"

/**
 * Element-wise addition of two int matrices stored as flat arrays: c = a + b.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread handles the single
 * element at column ix / row iy, addressed as iy * nx + ix. Threads whose
 * (ix, iy) falls outside the nx-by-ny matrix do nothing, so the grid is
 * allowed to overshoot the matrix size.
 *
 * @param a   first input matrix  (nx * ny elements)
 * @param b   second input matrix (nx * ny elements)
 * @param c   output matrix       (nx * ny elements)
 * @param nx  number of elements per row (extent along x)
 * @param ny  number of rows (extent along y)
 */
__global__ void add_matrix(int *a, int *b, int *c, const int nx, const int ny) 
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix < nx && iy < ny) {
        // Widen before multiplying so that iy * nx cannot overflow 32-bit
        // arithmetic when the matrix holds more than 2^31 elements.
        const size_t idx = (size_t)iy * nx + ix;
        c[idx] = a[idx] + b[idx];
    }
}

/**
 * Demo driver: builds two nx-by-ny int matrices on the host, adds them on the
 * GPU with add_matrix using a 2D grid of 4x4 blocks, and prints the first few
 * results.
 *
 * @return 0 on success, 1 if any CUDA call reported an error. Exits with -1
 *         when host allocation fails.
 */
int main()
{
    setGPU();                                            // select the GPU device

    const int nx = 16, ny = 8;                           // matrix extent in x / y
    const int nxy = nx * ny;                             // total number of elements
    const size_t stBytesCount = nxy * sizeof(int);       // size of one matrix in bytes

    // Host buffers. C is zero-initialised by calloc.
    int *ipHost_A = (int *)malloc(stBytesCount);
    int *ipHost_B = (int *)malloc(stBytesCount);
    int *ipHost_C = (int *)calloc(nxy, sizeof(int));
    if (ipHost_A == NULL || ipHost_B == NULL || ipHost_C == NULL)
    {
        printf("Memory allocation failed!\n");
        free(ipHost_A);                                  // free(NULL) is a no-op
        free(ipHost_B);
        free(ipHost_C);
        exit(-1);
    }
    for (int i = 0; i < nxy; i++)
    {
        ipHost_A[i] = i;                                 // A holds 0 .. nxy-1
        ipHost_B[i] = i + 1;                             // B holds A + 1
    }

    // Device buffers start as NULL so that a failed cudaMalloc never leaves an
    // indeterminate pointer behind and the cleanup below is always safe.
    int *ipDevice_A = NULL, *ipDevice_B = NULL, *ipDevice_C = NULL;
    bool ok = ErrorCheck(cudaMalloc(&ipDevice_A, stBytesCount), __FILE__, __LINE__) == cudaSuccess;
    ok = ok && ErrorCheck(cudaMalloc(&ipDevice_B, stBytesCount), __FILE__, __LINE__) == cudaSuccess;
    ok = ok && ErrorCheck(cudaMalloc(&ipDevice_C, stBytesCount), __FILE__, __LINE__) == cudaSuccess;
    if (!ok)
    {
        printf("Device memory allocation failed!\n");
    }

    // Host -> device copies (skipped as soon as one step has failed).
    ok = ok && ErrorCheck(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__) == cudaSuccess;
    ok = ok && ErrorCheck(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__) == cudaSuccess;
    ok = ok && ErrorCheck(cudaMemcpy(ipDevice_C, ipHost_C, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__) == cudaSuccess;

    if (ok)
    {
        // Named block/grid rather than blockDim/gridDim to avoid shadowing the
        // CUDA built-in variables of the same name.
        const dim3 block(4, 4);
        const dim3 grid((nx + block.x - 1) / block.x,    // ceil-div so the grid covers the matrix
                        (ny + block.y - 1) / block.y);
        printf("Grid Dim: %u, %u\n", grid.x, grid.y);
        printf("Block Dim: %u, %u\n", block.x, block.y);

        add_matrix<<<grid, block>>>(ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
        // A kernel launch returns no status itself; launch errors must be
        // fetched explicitly.
        ok = ErrorCheck(cudaGetLastError(), __FILE__, __LINE__) == cudaSuccess;
        // The blocking copy back also surfaces errors raised while the kernel ran.
        ok = ok && ErrorCheck(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost), __FILE__, __LINE__) == cudaSuccess;
    }

    if (ok)
    {
        const int nShow = nxy < 20 ? nxy : 20;           // never read past the end of the matrices
        for (int i = 0; i < nShow; i++)
        {
            printf("id = %d, matrix_A = %d, matrix_B = %d, matrix_C = %d\n", i + 1, ipHost_A[i], ipHost_B[i], ipHost_C[i]);
        }
    }

    // Single cleanup path for both success and failure.
    free(ipHost_A);
    free(ipHost_B);
    free(ipHost_C);
    ErrorCheck(cudaFree(ipDevice_A), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_B), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_C), __FILE__, __LINE__);
    return ok ? 0 : 1;
}

这里代码中用到的 common.cuh 的代码是：
/******************************************************************
 * Author: Da Liu
 * Date: 2024-07-10
 * File: common.cuh
 *****************************************************************/
#pragma once
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>

cudaError_t ErrorCheck(cudaError_t error_code, const char* filename, int lineNumber);

// Queries the number of CUDA-capable devices and selects device 0.
// Progress is reported on stdout. The process exits with -1 when the device
// count cannot be obtained, when no device is present, or when device 0
// cannot be selected.
void setGPU(){
    int iDeviceCount = 0;
    cudaError_t err = ErrorCheck(cudaGetDeviceCount(&iDeviceCount), __FILE__, __LINE__);
    if (err != cudaSuccess || iDeviceCount == 0)
    {
        std::cout << "No CUDA-capable device found." << std::endl;
        exit(-1);
    }
    std::cout << "Number of CUDA-capable devices found: " << iDeviceCount << std::endl;

    const int iDevice = 0;                               // always use the first device
    err = ErrorCheck(cudaSetDevice(iDevice), __FILE__, __LINE__);
    if (err != cudaSuccess)
    {
        std::cout << "Failed to set device " << iDevice << std::endl;
        exit(-1);
    }
    std::cout << "Device " << iDevice << " set successfully." << std::endl;
}

// Error-check helper: when error_code is not cudaSuccess, prints the numeric
// code, its symbolic name, its description and the call site to stderr.
// The code is always handed back unchanged so callers can test it.
//
//   error_code  status returned by a CUDA runtime call
//   filename    source file of the call site (normally __FILE__)
//   lineNumber  source line of the call site (normally __LINE__)
cudaError_t ErrorCheck(cudaError_t error_code, const char* filename, int lineNumber)
{
    if (error_code != cudaSuccess)
    {
        // printf-style formatting: the placeholders are only substituted by
        // the printf family, not by stream insertion.
        fprintf(stderr,
                "CUDA error:\ncode=%d, name=%s, description=%s\nfile=%s, line=%d\n",
                static_cast<int>(error_code),
                cudaGetErrorName(error_code),
                cudaGetErrorString(error_code),
                filename,
                lineNumber);
    }
    return error_code;
}

此时需要将 grid2D_block2D.cu 编译成可执行文件：在代码所在目录的终端中输入下面的命令（把 name 替换成实际的文件名，例如 grid2D_block2D）：
```sh
nvcc name.cu -o name
./name          # Windows 下生成并运行的是 name.exe
```

这样就可以输出结果。
在这里插入图片描述

小马敲马

关注

16
点赞
踩
15

收藏

觉得还不错? 一键收藏
打赏
0
评论
cuda编程：怎么用2维网格2维块计算矩阵加法

一个线程在网格中的块的索引是ix = threadIdx.x + blockIdx.x * blockDim.x和iy = threadIdx.y + blockIdx.y * blockDim.y。//查看当前可用GPU设备数并将可用device设置为0。这样就可以输出结果。
复制链接

扫一扫