CUDA矩阵相加:分块处理与cudaMallocPitch/cudaMemcpy2D

该代码实现了矩阵相加计算:在GPU中将矩阵按16×16的线程块分块处理,使用cudaMallocPitch分配按行对齐的显存以优化矩阵访问,并用cudaMemcpy2D完成主机与设备之间的二维数据拷贝;同时对比了GPU版本相对于CPU版本的加速效果,并验证了计算结果的正确性。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <random>
#include "time.h"
// Matrix dimensions. The matrices below are declared as [H][W], so W is the
// number of columns (row length) and H is the number of rows.
#define W 2000// matrix dimensions
#define H 3000

// Host-side matrices with static storage duration (H * W ints each).
// NOTE(review): presumably kept at file scope because arrays this large would
// not fit on the stack -- confirm before moving them into main().
int a[H][W];// first input operand
int b[H][W];// second input operand
int c1[H][W];// holds the result computed on the CPU
int c2[H][W];// holds the result computed on the GPU

// GPU element-wise matrix addition: c = a + b for an H x W matrix of ints.
//
// Launch layout: a 2D grid of 2D blocks in which each thread handles exactly
// one element; the x dimension maps to columns and the y dimension to rows.
// The grid may overshoot the matrix, so out-of-range threads exit early.
//
// Parameters:
//   c     - output matrix
//   a, b  - input matrices
//   pitch - row stride expressed in int ELEMENTS (not bytes), because it is
//           used directly to index the int pointers; all three buffers are
//           indexed with this same stride.
__global__ void matrixAddGPU(int* c, int* a, int* b, size_t pitch)
{
    // Global column / row handled by this thread.
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads that fall outside the matrix have nothing to do.
    if (col >= W || row >= H) {
        return;
    }

    // Linear offset of element (row, col) in a pitched row-major buffer.
    const size_t idx = row * pitch + col;
    c[idx] = a[idx] + b[idx];
}

// CPU reference implementation of element-wise matrix addition: c = a + b.
// All three arguments are H x W row-major int matrices; every element of c
// is overwritten.
void matrixAddCPU(int c[][W], int a[][W], int b[][W]) {
    for (int row = 0; row < H; ++row) {
        // Work on one row at a time through plain row pointers.
        int* out = c[row];
        const int* lhs = a[row];
        const int* rhs = b[row];
        for (int col = 0; col < W; ++col) {
            out[col] = lhs[col] + rhs[col];
        }
    }
}

// Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two H x W matrices with pseudo-random values, adds them on the CPU and
// on the GPU, prints the time each version took, and reports whether both
// results agree.
//
// Returns 0 on normal completion; exits with EXIT_FAILURE if any CUDA call
// fails or the pitched allocations cannot share one row stride.
int main()
{
    // Fixed seed so every run works on the same input data.
    srand(0);
    for (int i = 0; i < H; ++i) {
        for (int j = 0; j < W; ++j) {
            a[i][j] = rand() % 1000;
            b[i][j] = rand() % 1000;
        }
    }

    // ---- CPU reference run ----
    clock_t start, end;
    double elapsedTime;
    start = clock();
    matrixAddCPU(c1, a, b);
    end = clock();
    // clock() counts in ticks of 1/CLOCKS_PER_SEC seconds; convert to
    // milliseconds so the printed unit is actually correct on every platform.
    elapsedTime = 1000.0 * (double)(end - start) / CLOCKS_PER_SEC;
    printf("time to generate CPU:% 5.3f ms\n", elapsedTime);// print CPU run time

    // ---- Device allocation and upload ----
    const size_t rowBytes = sizeof(int) * W;
    int* dev_a = NULL, * dev_b = NULL, * dev_c = NULL;
    // Each allocation reports its own pitch (row stride in bytes); keep them
    // separate instead of letting later calls overwrite earlier values.
    size_t pitchA = 0, pitchB = 0, pitchC = 0;

    checkCuda(cudaMallocPitch((void**)&dev_a, &pitchA, rowBytes, H), "cudaMallocPitch(dev_a)");
    checkCuda(cudaMemcpy2D(dev_a, pitchA,
        a, rowBytes,
        rowBytes, H, cudaMemcpyHostToDevice), "cudaMemcpy2D(a -> dev_a)");

    checkCuda(cudaMallocPitch((void**)&dev_b, &pitchB, rowBytes, H), "cudaMallocPitch(dev_b)");
    checkCuda(cudaMemcpy2D(dev_b, pitchB,
        b, rowBytes,
        rowBytes, H, cudaMemcpyHostToDevice), "cudaMemcpy2D(b -> dev_b)");

    checkCuda(cudaMallocPitch((void**)&dev_c, &pitchC, rowBytes, H), "cudaMallocPitch(dev_c)");

    // The kernel indexes all three buffers with ONE stride given in int
    // elements, so the pitches must be equal and a whole number of ints.
    // Identical requests normally yield identical pitches, but verify rather
    // than assume.
    if (pitchA != pitchB || pitchA != pitchC || pitchA % sizeof(int) != 0) {
        fprintf(stderr, "Incompatible pitches: a=%zu b=%zu c=%zu bytes\n",
            pitchA, pitchB, pitchC);
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        return EXIT_FAILURE;
    }
    const size_t pitch = pitchA;

    // ---- Launch configuration ----
    const int tile = 16;
    dim3 dimBlock(tile, tile);// threads per block (tile size)
    dim3 dimGrid((W + tile - 1) / tile, (H + tile - 1) / tile);// ceil-div so the grid covers the whole matrix

    // ---- Timed kernel run (events bracket the kernel only, not the copies) ----
    cudaEvent_t start1, stop1;
    checkCuda(cudaEventCreate(&start1), "cudaEventCreate(start1)");
    checkCuda(cudaEventCreate(&stop1), "cudaEventCreate(stop1)");
    checkCuda(cudaEventRecord(start1, 0), "cudaEventRecord(start1)");
    matrixAddGPU<<<dimGrid, dimBlock>>>(dev_c,
        dev_a, dev_b, pitch / sizeof(int));
    // A kernel launch returns nothing; configuration errors surface here.
    checkCuda(cudaGetLastError(), "matrixAddGPU launch");
    checkCuda(cudaEventRecord(stop1, 0), "cudaEventRecord(stop1)");
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    checkCuda(cudaEventSynchronize(stop1), "cudaEventSynchronize(stop1)");
    float elapsedTime1;
    checkCuda(cudaEventElapsedTime(&elapsedTime1, start1, stop1), "cudaEventElapsedTime");
    printf("time to generate GPU:% 5.3f ms\n", elapsedTime1);// print GPU run time
    checkCuda(cudaEventDestroy(start1), "cudaEventDestroy(start1)");
    checkCuda(cudaEventDestroy(stop1), "cudaEventDestroy(stop1)");

    // Copy the GPU result back to the host.
    checkCuda(cudaMemcpy2D(c2, rowBytes,
        dev_c, pitch, rowBytes, H, cudaMemcpyDeviceToHost), "cudaMemcpy2D(dev_c -> c2)");

    // ---- Verify the GPU result against the CPU reference ----
    bool flag = true;
    for (int i = 0; i < H && flag; ++i) {// stop scanning rows after the first mismatch
        for (int j = 0; j < W; ++j) {
            if (c1[i][j] != c2[i][j]) {
                flag = false;
                break;
            }
        }
    }
    if (flag) printf("Consistent!!!\n");
    else printf("Not consistent!!!\n");

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

    return 0;
}

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

赴星辰大海

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值