一,三重for循环方式
存储为 helloGemm_1x1.cpp
编译:
g++ hello_gemm_1x1.cpp -o hello_gemm_1x1.out -O3
#include<iostream>
#include<stdlib.h>
using namespace std;
void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
for(int i=0; i<M_; i++){
for(int j=0; j<N_; j++){
for(int k=0; k<K_; k++){
C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
register float a1, a2, b1, b2;
for(int i=0; i<M_/2; i++){
for(int j=0; j<N_/2; j++){
for(int k=0; k<K_; k++){
a1 = A[2*i + k*M_ ];//2*i k
a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
b1 = B[k*N_ + 2*j ];//k 2*j
b2 = B[k*N_ + (2*j+1)];//k 2*j+1
C[2*i + 2*j*M_ ] += a1*b1;
C[2*i + (2*j+1)*M_] += a1*b2;
C[(2*i+1) + 2*j*M_ ] += a2*b1;
C[2*i+1 + (2*j+1)*M_] += a2*b2;
// C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void init_mat(float* A, int count, int mod){
for(int idx=0; idx<count; idx++){
A[idx] = idx%mod + 1;
}
}
void print_mat(float* A, int M, int N, bool colMajor){
cout<<endl<<endl;
for(int i=0; i<M; i++){
for(int j=0; j<N; j++){
cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
}
cout<<endl;
}
}
int main(){
int M = 1024;
int N = 1024;
int K = 1024;
float* A = nullptr;
float* B = nullptr;
float* C_1x1 = nullptr;
float* C_2x2 = nullptr;
A = (float*)malloc(M*K*sizeof(float));
B = (float*)malloc(K*N*sizeof(float));
C_1x1 = (float*)malloc(M*N*sizeof(float));
C_2x2 = (float*)malloc(M*N*sizeof(float));
init_mat(A, M*K, 3);
init_mat(B, K*N, 4);
// print_mat(A, M, K, true);
// print_mat(B, K, N, false);
gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
// gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
//print_mat(C_1x1, M, N, true);
//print_mat(C_2x2, M, N, true);
cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
// cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;
return 0;
}
二,block2x2 register 优化
存储为 helloGemm_2x2.cpp
编译:
g++ hello_gemm_2x2.cpp -o hello_gemm_2x2.out -O3
#include<iostream>
#include<stdlib.h>
using namespace std;
void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
for(int i=0; i<M_; i++){
for(int j=0; j<N_; j++){
for(int k=0; k<K_; k++){
C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
register float a1, a2, b1, b2;
for(int i=0; i<M_/2; i++){
for(int j=0; j<N_/2; j++){
for(int k=0; k<K_; k++){
a1 = A[2*i + k*M_ ];//2*i k
a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
b1 = B[k*N_ + 2*j ];//k 2*j
b2 = B[k*N_ + (2*j+1)];//k 2*j+1
C[2*i + 2*j*M_ ] += a1*b1;
C[2*i + (2*j+1)*M_] += a1*b2;
C[(2*i+1) + 2*j*M_ ] += a2*b1;
C[2*i+1 + (2*j+1)*M_] += a2*b2;
// C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void init_mat(float* A, int count, int mod){
for(int idx=0; idx<count; idx++){
A[idx] = idx%mod + 1;
}
}
void print_mat(float* A, int M, int N, bool colMajor){
cout<<endl<<endl;
for(int i=0; i<M; i++){
for(int j=0; j<N; j++){
cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
}
cout<<endl;
}
}
int main(){
int M = 1024;
int N = 1024;
int K = 1024;
float* A = nullptr;
float* B = nullptr;
float* C_1x1 = nullptr;
float* C_2x2 = nullptr;
A = (float*)malloc(M*K*sizeof(float));
B = (float*)malloc(K*N*sizeof(float));
C_1x1 = (float*)malloc(M*N*sizeof(float));
C_2x2 = (float*)malloc(M*N*sizeof(float));
init_mat(A, M*K, 3);
init_mat(B, K*N, 4);
// print_mat(A, M, K, true);
// print_mat(B, K, N, false);
// gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
//print_mat(C_1x1, M, N, true);
//print_mat(C_2x2, M, N, true);
// cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;
return 0;
}
(m, n, k) = (1024, 1024, 1024)
加速效果明显, 效果图: 2 second VS 5 second
加入 openmp后,并行效果更加明显:
三,使用openmp对三重for循环优化
将测试 size 全部改成: ((m, n, k) = (1024, 1024, 4096))
在上面个的两个版本基础上,加入openmp加速指令后的代码为:
存储为 helloGemm_1x1_omp.cpp
编译:
g++ hello_gemm_1x1_omp.cpp -o hello_gemm_1x1_omp.out -O3 -fopenmp
#include<iostream>
#include<stdlib.h>
#include <omp.h>
using namespace std;
void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
#pragma omp parallel for num_threads(omp_get_num_procs())
for(int i=0; i<M_; i++){
for(int j=0; j<N_; j++){
for(int k=0; k<K_; k++){
C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
// register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
for(int i=0; i<M_/2; i++){
for(int j=0; j<N_/2; j++){
for(int k=0; k<K_; k++){
register float a1, a2, b1, b2;
a1 = A[2*i + k*M_ ];//2*i k
a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
b1 = B[k*N_ + 2*j ];//k 2*j
b2 = B[k*N_ + (2*j+1)];//k 2*j+1
C[2*i + 2*j*M_ ] += a1*b1;
C[2*i + (2*j+1)*M_] += a1*b2;
C[(2*i+1) + 2*j*M_ ] += a2*b1;
C[2*i+1 + (2*j+1)*M_] += a2*b2;
// C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void init_mat(float* A, int count, int mod){
for(int idx=0; idx<count; idx++){
A[idx] = idx%mod + 1;
}
}
void print_mat(float* A, int M, int N, bool colMajor){
cout<<endl<<endl;
for(int i=0; i<M; i++){
for(int j=0; j<N; j++){
cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
}
cout<<endl;
}
}
int main(){
int M = 1024;
int N = 1024;
int K = 4096;
float* A = nullptr;
float* B = nullptr;
float* C_1x1 = nullptr;
float* C_2x2 = nullptr;
A = (float*)malloc(M*K*sizeof(float));
B = (float*)malloc(K*N*sizeof(float));
C_1x1 = (float*)malloc(M*N*sizeof(float));
C_2x2 = (float*)malloc(M*N*sizeof(float));
init_mat(A, M*K, 3);
init_mat(B, K*N, 4);
// print_mat(A, M, K, true);
// print_mat(B, K, N, false);
gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
// gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
//print_mat(C_1x1, M, N, true);
//print_mat(C_2x2, M, N, true);
cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
// cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;
return 0;
}
四,对block2x2 register 优化加入 openmp加速
存储为 helloGemm_2x2_omp.cpp
g++ hello_gemm_2x2_omp.cpp -o hello_gemm_2x2_omp.out -O3 -fopenmp
#include<iostream>
#include<stdlib.h>
#include <omp.h>
using namespace std;
void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
#pragma omp parallel for num_threads(omp_get_num_procs())
for(int i=0; i<M_; i++){
for(int j=0; j<N_; j++){
for(int k=0; k<K_; k++){
C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
// register float a1, a2, b1, b2;
#pragma omp parallel for num_threads(omp_get_num_procs())
for(int i=0; i<M_/2; i++){
for(int j=0; j<N_/2; j++){
for(int k=0; k<K_; k++){
register float a1, a2, b1, b2;
a1 = A[2*i + k*M_ ];//2*i k
a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
b1 = B[k*N_ + 2*j ];//k 2*j
b2 = B[k*N_ + (2*j+1)];//k 2*j+1
C[2*i + 2*j*M_ ] += a1*b1;
C[2*i + (2*j+1)*M_] += a1*b2;
C[(2*i+1) + 2*j*M_ ] += a2*b1;
C[2*i+1 + (2*j+1)*M_] += a2*b2;
// C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
}
}
}
}
void init_mat(float* A, int count, int mod){
for(int idx=0; idx<count; idx++){
A[idx] = idx%mod + 1;
}
}
void print_mat(float* A, int M, int N, bool colMajor){
cout<<endl<<endl;
for(int i=0; i<M; i++){
for(int j=0; j<N; j++){
cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
}
cout<<endl;
}
}
int main(){
int M = 1024;//2048;
int N = 1024;//2048;
int K = 4096;
float* A = nullptr;
float* B = nullptr;
float* C_1x1 = nullptr;
float* C_2x2 = nullptr;
A = (float*)malloc(M*K*sizeof(float));
B = (float*)malloc(K*N*sizeof(float));
C_1x1 = (float*)malloc(M*N*sizeof(float));
C_2x2 = (float*)malloc(M*N*sizeof(float));
init_mat(A, M*K, 3);
init_mat(B, K*N, 4);
// print_mat(A, M, K, true);
// print_mat(B, K, N, false);
// gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
//print_mat(C_1x1, M, N, true);
//print_mat(C_2x2, M, N, true);
// cout<<endl<<"C(M,N)"<< C_1x1[M*N-1]<<endl;
cout<<endl<<"C(M,N)"<< C_2x2[M*N-1]<<endl;
return 0;
}
编译图:
效果图:
优化效果统计:
三重for循环: 43 s
block2x2 register: 11 s
openmp 三重for循环: 2 s
openmp block2x2 register: 1s