对矩阵向量乘法进行SIMD优化

最新推荐文章于 2024-08-14 19:09:14 发布

板牙奇奇

最新推荐文章于 2024-08-14 19:09:14 发布

阅读量249

点赞数

文章标签：矩阵线性代数

本文链接：https://blog.csdn.net/Fallen250/article/details/132032299

版权

使用simd和循环展开技术，对矩阵向量乘法dgemv进行改写，以实现y=Ax计算的性能优化。原始程序如下所示：

#include <iostream>
#include <ctime>
#include <omp.h>
using namespace std;

void dgemv(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    for (int i = 0; i < n; i++)
    {
        double y_tmp = 0.0;
        for (int j = 0; j < m; j++)
        {
            y_tmp += A[i * m + j] * x[j];
        }
        y[i] = y_tmp;
    }
    return;
}

int main()
{
    int nloop = 100;
    int n = 10240;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];

    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }

    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        dgemv(n, n, mat_A, vec_x, vec_y);
    }
    t = clock() - t;

    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;

    cout << "gemv ave cost time(clock):" << t / nloop << endl;
}

运行结果如下：

使用SIMD优化后，代码如下

#include <iostream>
#include <ctime>
#include <arm_neon.h>
using namespace std;

void dgemv_neon(const int n, const int m, const double *const A, const double *const x, double *y)
{
    // #pragma omp simd
    for (int i = 0; i < n; i+=2)
    {
        float64x2_t y_sum0 = vdupq_n_f64(0.0);
        float64x2_t y_sum1 = vdupq_n_f64(0.0);

        for (int j = 0; j < m; j += 2)
        {
            float64x2_t v_x = vld1q_f64(&x[j]);
            y_sum0 = vfmaq_f64(y_sum0, vld1q_f64(&A[i * m + j]), v_x);
            y_sum1 = vfmaq_f64(y_sum1, vld1q_f64(&A[(i + 1) * m + j]), v_x);
        }
        y[i] = vaddvq_f64(y_sum0);
        y[i + 1] = vaddvq_f64(y_sum1);
    }
    return;
}

int main()
{
    int nloop = 100;
    int n = 10240;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];

    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }

    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        dgemv_neon(n, n, mat_A, vec_x, vec_y);
    }
    t = clock() - t;

    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;

    cout << "gemv ave cost time(clock):" << t / nloop << endl;
}

运行结果如下：