cpu GEMM 之微数据复用寄存器优化

Eloudy

已于 2022-09-05 10:43:01 修改

阅读量198

点赞数

文章标签： c++ 算法

于 2022-09-05 10:37:43 首次发布

本文链接：https://blog.csdn.net/eloudy/article/details/126698509

版权

一，三重for循环方式

存储为 helloGemm_1x1.cpp

编译：

g++ hello_gemm_1x1.cpp -o hello_gemm_1x1.out -O3

#include<iostream>
#include<stdlib.h>


using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

        register float a1, a2, b1, b2;

        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 1024;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
//      gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

        cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
//      cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

二，block2x2 register 优化

存储为 helloGemm_2x2.cpp

编译：

g++ hello_gemm_2x2.cpp -o hello_gemm_2x2.out -O3

#include<iostream>
#include<stdlib.h>


using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

        register float a1, a2, b1, b2;

        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 1024;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

//      gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

//      cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
        cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

(m, n, k) = (1024, 1024, 1024)

加速效果明显，效果图： 2 second VS 5 second

加入 openmp后，并行效果更加明显：

三，使用openmp对三重for循环优化

将测试 size 全部改成： ((m, n, k) = (1024, 1024, 4096))

在上面个的两个版本基础上，加入openmp加速指令后的代码为：

存储为 helloGemm_1x1_omp.cpp

编译：

 g++ hello_gemm_1x1_omp.cpp -o hello_gemm_1x1_omp.out -O3 -fopenmp

#include<iostream>
#include<stdlib.h>
#include <omp.h>

using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

//      register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                register float a1, a2, b1, b2;

                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 4096;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
//      gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

        cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
//      cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

四，对block2x2 register 优化加入 openmp加速

存储为 helloGemm_2x2_omp.cpp

g++ hello_gemm_2x2_omp.cpp -o hello_gemm_2x2_omp.out -O3 -fopenmp

#include<iostream>
#include<stdlib.h>
#include <omp.h>

using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

//      register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                register float a1, a2, b1, b2;

                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;//2048;
        int N = 1024;//2048;
        int K = 4096;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

//      gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

//      cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
        cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

编译图：