对矩阵向量乘法进行SIMD优化

使用simd和循环展开技术,对矩阵向量乘法dgemv进行改写,以实现y=Ax计算的性能优化。原始程序如下所示:

#include <iostream>
#include <ctime>
#include <omp.h>
using namespace std;

void dgemv(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    for (int i = 0; i < n; i++)
    {
        double y_tmp = 0.0;
        for (int j = 0; j < m; j++)
        {
            y_tmp += A[i * m + j] * x[j];
        }
        y[i] = y_tmp;
    }
    return;
}

int main()
{
    int nloop = 100;
    int n = 10240;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];

    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }

    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        dgemv(n, n, mat_A, vec_x, vec_y);
    }
    t = clock() - t;

    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;

    cout << "gemv ave cost time(clock):" << t / nloop << endl;
}

运行结果如下:

使用SIMD优化后,代码如下

#include <iostream>
#include <ctime>
#include <arm_neon.h>
using namespace std;

void dgemv_neon(const int n, const int m, const double *const A, const double *const x, double *y)
{
    // #pragma omp simd
    for (int i = 0; i < n; i+=2)
    {
        float64x2_t y_sum0 = vdupq_n_f64(0.0);
        float64x2_t y_sum1 = vdupq_n_f64(0.0);

        for (int j = 0; j < m; j += 2)
        {
            float64x2_t v_x = vld1q_f64(&x[j]);
            y_sum0 = vfmaq_f64(y_sum0, vld1q_f64(&A[i * m + j]), v_x);
            y_sum1 = vfmaq_f64(y_sum1, vld1q_f64(&A[(i + 1) * m + j]), v_x);
        }
        y[i] = vaddvq_f64(y_sum0);
        y[i + 1] = vaddvq_f64(y_sum1);
    }
    return;
}

int main()
{
    int nloop = 100;
    int n = 10240;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];

    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }

    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        dgemv_neon(n, n, mat_A, vec_x, vec_y);
    }
    t = clock() - t;

    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;

    cout << "gemv ave cost time(clock):" << t / nloop << endl;
}

运行结果如下:

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值