OpenACC 优化矩阵乘法

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Dimension of the statically allocated matrices below; the value can be changed.
#define N 1000

using namespace std;
using namespace std::chrono;

// File-scope N x N matrices: main reads a and b from stdin and accumulates
// their product into c.
double a[N][N], b[N][N], c[N][N];

int main()
{
    double tmp;
    int n;//矩阵大小
    scanf("%d",&n);

    //输入矩阵
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            scanf("%lf",&a[i][j]);
        }
    }
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            scanf("%lf",&b[i][j]);
        }
    }

#pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
    {
        for(int i=0; i<n; i++){//初始化数组c,每次都清零
            for(int j=0; j<n; j++){
                c[i][j] = 0;
            }
        }

    }

    high_resolution_clock::time_point t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)       // 最简单的,每层循环都 auto
    {
#pragma acc loop auto
        for (int i=0; i<n; i++)
        {
#pragma acc loop auto
            for (int j=0; j<n; j++)
            {
#pragma acc loop auto
                for (int k=0; k<n; k++){
                        c[i][j] += a[i][k] * b[k][j];//矩阵乘法
                    }
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }

            printf("\n");
        }
    }

    //可以计算出所需要的时间
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());


    
    
#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 2,外两层 independent,最里层串行
    {
#pragma acc loop independent
        for (int i=0; i<n; i++)
        {
#pragma acc loop independent
            for (int j=0; j<n; j++)
            {
#pragma acc loop independent
                for (int k=0; k<n; k++)
                    c[i][j] += a[i][k] * b[k][j];
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());
    
    
    


#pragma acc kernels present(c)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 3,外两层 independent,最里层规约
    {
#pragma acc loop independent
        for (int i=0; i<n; i++)
        {
#pragma acc loop independent
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }
             printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    
    
   

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 4,手动指定 gang 和 vector
    {
#pragma acc loop gang(32)
        for (int i=0; i<n; i++)
        {
#pragma acc loop vector(16)
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    
    
    

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 5,分块重排
    {
#pragma acc loop tile(32, 32)
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+ \
                           : tmp)
                for (int k=0; k<n; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 6,合并多层迭代
    {
#pragma acc loop collapse(2) independent
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//这里可以控制输出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());
    
    return 0;
}

参考链接:

https://www.cnblogs.com/cuancuancuanhao/p/9459007.html

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值