cpu GEMM 之 微数据复用 寄存器优化

一,三重for循环方式

存储为 helloGemm_1x1.cpp

编译:

g++ hello_gemm_1x1.cpp -o hello_gemm_1x1.out -O3

#include<iostream>
#include<stdlib.h>


using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

        register float a1, a2, b1, b2;

        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 1024;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
//      gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

        cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
//      cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

二,block2x2 register 优化

存储为 helloGemm_2x2.cpp

编译:

g++ hello_gemm_2x2.cpp -o hello_gemm_2x2.out -O3

#include<iostream>
#include<stdlib.h>


using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

        register float a1, a2, b1, b2;

        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 1024;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

//      gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

//      cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
        cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

(m, n, k) = (1024, 1024, 1024)

加速效果明显,  效果图: 2 second    VS   5 second

加入 openmp后,并行效果更加明显:

三,使用openmp对三重for循环优化

将测试 size 全部改成:  ((m, n, k) = (1024, 1024, 4096))

在上面个的两个版本基础上,加入openmp加速指令后的代码为:

存储为 helloGemm_1x1_omp.cpp

编译:

 g++ hello_gemm_1x1_omp.cpp -o hello_gemm_1x1_omp.out -O3 -fopenmp

#include<iostream>
#include<stdlib.h>
#include <omp.h>

using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

//      register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                register float a1, a2, b1, b2;

                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;
        int N = 1024;
        int K = 4096;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
//      gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

        cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
//      cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

四,对block2x2 register 优化加入 openmp加速

存储为 helloGemm_2x2_omp.cpp

g++ hello_gemm_2x2_omp.cpp -o hello_gemm_2x2_omp.out -O3 -fopenmp

#include<iostream>
#include<stdlib.h>
#include <omp.h>

using namespace std;

void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){

#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
                for(int j=0; j<N_; j++){
                        for(int k=0; k<K_; k++){
                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}

void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){

//      register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
                for(int j=0; j<N_/2; j++){
                        for(int k=0; k<K_; k++){
                                register float a1, a2, b1, b2;

                                a1 = A[2*i     + k*M_   ];//2*i   k
                                a2 = A[(2*i+1) + k*M_   ];//2*i+1 k
                                b1 = B[k*N_    + 2*j    ];//k     2*j
                                b2 = B[k*N_    + (2*j+1)];//k     2*j+1

                                C[2*i     + 2*j*M_    ] += a1*b1;
                                C[2*i     + (2*j+1)*M_] += a1*b2;
                                C[(2*i+1) + 2*j*M_    ] += a2*b1;
                                C[2*i+1   + (2*j+1)*M_] += a2*b2;
//                                C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                        }
                }
        }

}



void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
                A[idx] = idx%mod + 1;
        }
}


void print_mat(float* A, int M, int N, bool colMajor){

        cout<<endl<<endl;

        for(int i=0; i<M; i++){
                for(int j=0; j<N; j++){
                        cout<<"  "<<(colMajor? A[i + j*M]: A[i*N + j]);
                }
                cout<<endl;
        }
}



int main(){


        int M = 1024;//2048;
        int N = 1024;//2048;
        int K = 4096;

        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;

        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));

        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);

//      print_mat(A, M, K, true);
//      print_mat(B, K, N, false);

//      gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);

        //print_mat(C_1x1, M, N, true);
        //print_mat(C_2x2, M, N, true);

//      cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
        cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;





        return 0;
}

编译图:

效果图:

优化效果统计:

三重for循环:                       43 s

block2x2 register:              11 s

openmp 三重for循环:          2 s

openmp block2x2 register:    1s

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值