简介
本文章对六种矩阵相乘算法进行了结果精度对比与耗时统计分析。六种矩阵相乘分别为Fortran的内置函数matmul(),blas库的dgemm(),Fortran编写的常规矩阵相乘,DCU并行矩阵相乘,C编写的常规矩阵相乘,C编写的优化后矩阵相乘。DCU使用1000个线程做矩阵乘法,每个线程计算最终矩阵的一行结果。C编写的优化后的矩阵相乘为通过提高缓存命中率从而提高效率。
矩阵为随机生成的1000*1000规模的双精度浮点数,矩阵乘法的规则为矩阵A乘矩阵A的转置,但是矩阵A与矩阵A的转置存储在内存地址不连续的两块内存中。
实验过程
执行python文件,生成双精度浮点随机数矩阵
python makedata.py
申请DCU节点
salloc -p kshdnormal -N 1 --gres=dcu:1
切换运行环境及编译环境
module switch compiler/rocm/4.0.1
编译及运行程序
make
./main.out
结果分析
(1)Fortran的内置函数matmul()的效率最高,但是却存在10的-13次方的平均精度误差;
(2)blas库的dgemm()的计算结果准确且耗时较少,但是使用起来较为麻烦;
(3)Fortran实现的矩阵乘法非常慢,但在编译Fortran文件时使用-O3进行代码优化,最后的计算时间可以缩减为1.350秒;
(4)DCU实现的矩阵乘法效率不高,这需要优化DCU矩阵乘法的算法架构,且DCU计算存在10的-14次方的平均精度误差;
(5)CPU实现的矩阵乘法与Fortran实现的算法逻辑相同,耗时却更短,可能是编译器的原因?自动优化了嘛?
(6)改变矩阵相乘的计算过程,提高缓存命中率,有明显的效率提升。
平均误差 | 最大误差 | 计算耗时(s) | |
matmul | 1.597E-013 | 1.597E-007 | 0.370 |
dgemm | 0 | 0 | 1.093 |
mm | 0 | 0 | 4.465 |
DCUMM | 1.336e-14 | 1.336e-08 | 1.27 |
CPUMM | 0 | 0 | 1.47 |
CPUMMUP | 0 | 0 | 0.62 |
日志打印
-------------------------------------------
matrixA 与 matrixB 初始矩阵精度对比 :
Accuracy Error : 0.0000000000000000
transpose_fort 与 transpose_auto 的计算结果对比 :
Accuracy Error : 0.0000000000000000
matmul 与 mm 的计算结果对比 :
Accuracy Error : 1.5970101685525150E-013
-------------------------------------------
matrixA 与 matrixB 初始矩阵精度对比 :
Accuracy Error : 0.0000000000000000
transpose_fort 与 transpose_auto 的计算结果对比 :
Accuracy Error : 0.0000000000000000
dgemm 与 mm 的计算结果对比 :
Accuracy Error : 0.0000000000000000
-------------------------------------------
transpose_fort time (s) is : 4.9999998882412910E-003
transpose_auto time (s) is : 9.9999997764825821E-003
dgemm calculation time (s) is : 1.0909999608993530
matmul calculation time (s) is : 0.37000000476837158
mm calculation time (s) is : 4.4580001831054688
-------------------------------------------
-------------------------------------------
transpose_fort 与 transpose_c 的计算结果对比 :
Accuracy Error : 0
Max Accuracy Error : 0
-------------------------------------------
dgemm 与 DCUMM 的计算结果对比 :
Accuracy Error : 1.33633e-14
Max Accuracy Error : 1.33633e-08
-------------------------------------------
dgemm 与 CPUMM 的计算结果对比 :
Accuracy Error : 0
Max Accuracy Error : 0
-------------------------------------------
dgemm 与 CPUMMUP 的计算结果对比 :
Accuracy Error : 0
Max Accuracy Error : 0
-------------------------------------------
DCUMM calculation time (s) is 1.27
CPUMM calculation time (s) is 1.46
CPUMMUP calculation time (s) is 0.72
transpose_c time (us) is 255.031957
-------------------------------------------
附录
文件名:makedata.py
import numpy as np
np.random.seed(10)
filename = "random_double1000_1000.bin"
fileobj = open(filename, mode='wb')
b = np.random.rand(1000 * 1000)
b.tofile(fileobj)
fileobj.close
print(b)
文件名:fMain.F90
program main
implicit none
! c矩阵相乘
external :: cmatmul
! dgemm,matmul测试矩阵
real*8, allocatable, dimension(:,:) :: matrixA, matrixAT, matrixC
! mm测试矩阵
real*8, allocatable, dimension(:,:) :: matrixB, matrixBT, matrixD
! 矩阵大小
integer, parameter :: row=1000, col=1000
real*8 :: alpha, beta
! 文件名
character(len=255) filename
integer :: i, j, k
! 误差
real*8 :: accuracyError, elD
! 统计计算耗时
integer :: time_start, time_end
real*8 :: dgemm_time, matmul_time, mm_time
real*8 :: transpose_fort_time, transpose_auto_time
filename = 'random_double1000_1000.bin'
allocate(matrixA(row, col))
allocate(matrixC(row, col))
allocate(matrixB(row, col))
allocate(matrixD(row, col))
allocate(matrixAT(row, col))
allocate(matrixBT(row, col))
! 读取矩阵数据
open(10,file=filename, status='old', form='unformatted', access='stream', action='read')
read(10, pos=1)matrixA
close(10)
open(10,file=filename, status='old', form='unformatted', access='stream', action='read')
read(10, pos=1)matrixB
close(10)
! matmul矩阵相乘
call system_clock(time_start)
matrixAT = transpose(matrixA)
call system_clock(time_end)
transpose_fort_time = (time_end - time_start) / 1000.
call system_clock(time_start)
matrixC = matmul(matrixA, matrixAT)
call system_clock(time_end)
matmul_time = (time_end - time_start) / 1000.
! transpose_auto, 矩阵B乘矩阵B的转置,结果为矩阵D
call system_clock(time_start)
matrixBT = transpose(matrixB)
do i=1, col
do j=1, row
matrixBT(j, i) = matrixB(i, j)
end do
end do
call system_clock(time_end)
transpose_auto_time = (time_end - time_start) / 1000.
! mm矩阵相乘
call system_clock(time_start)
do i=1, row
do j=1, col
elD = 0
do k=1, col
elD = elD + matrixB(i, k) * matrixBT(k, j)
end do
matrixD(i, j) = elD
end do
end do
call system_clock(time_end)
mm_time = (time_end - time_start) / 1000.
! 精度对比
print *, ' ------------------------------------------- '
print *, 'matrixA 与 matrixB 初始矩阵精度对比 : '
call checkaccuracy(matrixA, matrixB, row, col, accuracyError)
print *, 'transpose_fort 与 transpose_auto 的计算结果对比 : '
call checkaccuracy(matrixAT, matrixBT, row, col, accuracyError)
print *, 'matmul 与 mm 的计算结果对比 : '
call checkaccuracy(matrixC, matrixD, row, col, accuracyError)
! dgemm矩阵相乘,矩阵A乘矩阵A的转置,结果为矩阵C
! 双精度科学计数法表示,否则dgemm计算结果错误
alpha = 1.d0
beta = 0.d0
matrixC = 0.d0
matrixAT = transpose(matrixA)
call system_clock(time_start)
call dgemm('n', 'n', row, col, row, alpha, matrixA, row, matrixAT, row, beta, matrixC, row)
call system_clock(time_end)
dgemm_time = (time_end - time_start) / 1000.
! 精度对比
print *, ' ------------------------------------------- '
print *, 'matrixA 与 matrixB 初始矩阵精度对比 : '
call checkaccuracy(matrixA, matrixB, row, col, accuracyError)
print *, 'transpose_fort 与 transpose_auto 的计算结果对比 : '
call checkaccuracy(matrixAT, matrixBT, row, col, accuracyError)
print *, 'dgemm 与 mm 的计算结果对比 : '
call checkaccuracy(matrixC, matrixD, row, col, accuracyError)
! 输出打印
print *, ' ------------------------------------------- '
print *, 'transpose_fort time (s) is : ', transpose_fort_time
print *, 'transpose_auto time (s) is : ', transpose_auto_time
print *, 'dgemm calculation time (s) is : ', dgemm_time
print *, 'matmul calculation time (s) is : ', matmul_time
print *, 'mm calculation time (s) is : ', mm_time
print *, ' ------------------------------------------- '
! 调用C以及DCU
call cmatmul(matrixC, matrixA, matrixAT, row, col)
end program
subroutine checkaccuracy(matrixA, matrixB, row, col, accuracyError)
real*8, dimension(100,100), intent(in) :: matrixA, matrixB
integer, intent(in) :: row, col
real*8, intent(inout) :: accuracyError
real*8 :: maxAE
! 精度对比
accuracyError = 0
maxAE = 0
do i=1, row
do j=1, col
if (matrixA(j, i) < matrixB(j, i)) then
accuracyError = accuracyError + matrixB(j, i) - matrixA(j, i)
else
accuracyError = accuracyError + matrixA(j, i) - matrixB(j, i)
end if
if (accuracyError > maxAE) then
maxAE = accuracyError
end if
end do
end do
print *, 'Accuracy Error :', accuracyError / (row * col)
print *, 'Max Accuracy Error :', maxAE
end subroutine
文件名:cMain.cpp
#include <iostream>
#include <stdio.h>
#include <hip/hip_runtime.h>
#include <time.h>
#include <sys/time.h>
using namespace std;
// DUC计算矩阵相乘
__global__ void MatrixM(double *pfMatrix, double *pfMatrixT, double *pfMatrixM, int row, int col)
{
int tid;
tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i=0; i < col; i++)
{
double elD = 0;
for (int j = 0; j < row; j++)
{
elD += pfMatrix[tid * col + j] * pfMatrixT[j * row + i];
}
pfMatrixM[tid * col + i] = elD;
}
}
// 精度对比
void checkaccuracy(double *pfMatrixA, double *pfMatrixM, int ROW, int COL)
{
double accuracyError = 0;
double maxAE = 0;
for (int i=0; i < ROW; i++)
{
for (int j=0; j < COL; j++)
{
if (pfMatrixA[i * COL + j] < pfMatrixM[i * COL + j])
{
accuracyError = accuracyError + pfMatrixM[i * COL + j] - pfMatrixA[i * COL + j];
}
else
{
accuracyError = accuracyError + pfMatrixA[i * COL + j] - pfMatrixM[i * COL + j];
}
if (accuracyError > maxAE)
{
maxAE = accuracyError;
}
}
}
std::cout << "Accuracy Error : " << accuracyError / (ROW * COL) << endl;
std::cout << "Max Accuracy Error : " << maxAE << endl;
return;
}
// 矩阵转置
double* transpose(double *pfMatrix, int row, int col)
{
double *pfMatrixT;
pfMatrixT = (double*)malloc(sizeof(double) * row * col);
for (int i=0; i < row; i++)
{
for(int j=0; j < col; j++)
{
pfMatrixT[i * col + j] = pfMatrix[j * row + i];
}
}
return pfMatrixT;
}
extern "C" void cmatmul_(double *pfMatrixA, double *pfMatrixB, double *pfMatrixBT, int *row, int *col)
{
// 矩阵规模
const int ROW = *row;
const int COL = *col;
// 计时
clock_t time_start, time_end;
double DCUMM_time, CPUMM_time, CPUMMUP_time;
long transpose_c_time;
struct timeval start;
struct timeval end;
double *pfMatrixB2;
pfMatrixB2 = (double*)malloc(sizeof(double) * ROW * COL);
// 转置
pfMatrixA = transpose(pfMatrixA, ROW, COL);
gettimeofday(&start, NULL);
pfMatrixB2 = transpose(pfMatrixB, ROW, COL);
gettimeofday(&end, NULL);
transpose_c_time = ((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));
// 精度对比
std::cout << " ------------------------------------------- " << endl;
std::cout << "transpose_fort 与 transpose_c 的计算结果对比 : " << endl;
checkaccuracy(pfMatrixB2, pfMatrixBT, ROW, COL);
double * pfMatrixM;
pfMatrixM = (double*)malloc(sizeof(double) * ROW * COL);
// 为DCU矩阵分配显存
double *pfDevMatrix, *pfDevMatrixT, *pfDevMatrixM;
hipMalloc((void**)&pfDevMatrix, sizeof(double) * ROW * COL);
hipMalloc((void**)&pfDevMatrixT, sizeof(double) * ROW *COL);
hipMalloc((void**)&pfDevMatrixM, sizeof(double) * ROW * COL);
// 内存数据拷贝到显存
hipMemcpy(pfDevMatrix, pfMatrixB2, sizeof(double) * ROW * COL, hipMemcpyHostToDevice);
hipMemcpy(pfDevMatrixT, pfMatrixB, sizeof(double) * ROW * COL, hipMemcpyHostToDevice);
// 启动核函数,每个线程处理矩阵的一行数据
time_start = clock();
hipLaunchKernelGGL(MatrixM, dim3(1), dim3(ROW), 0, 0, pfDevMatrix, pfDevMatrixT, pfDevMatrixM, ROW, COL);
hipDeviceSynchronize();
time_end = clock();
DCUMM_time = (double)(time_end - time_start) / CLOCKS_PER_SEC;
// 显存数据拷贝回内存
hipMemcpy(pfMatrixM, pfDevMatrixM, sizeof(double) * ROW * COL, hipMemcpyDeviceToHost);
// 精度对比
std::cout << " ------------------------------------------- " << endl;
std::cout << "dgemm 与 DCUMM 的计算结果对比 : " << endl;
checkaccuracy(pfMatrixA, pfMatrixM, ROW, COL);
// CPUMM矩阵相乘
memset(pfMatrixM, 0, sizeof(double) * ROW * COL);
time_start = clock();
double elD;
for (int i=0; i < ROW; i++)
{
for(int j=0; j < COL; j++)
{
elD = 0;
for(int k=0; k < ROW; k++)
{
elD = elD + pfMatrixB2[i * COL + k] * pfMatrixB[k * COL + j];
}
pfMatrixM[i * COL + j] = elD;
}
}
time_end = clock();
CPUMM_time = (double)(time_end - time_start) / CLOCKS_PER_SEC;
// 精度对比
std::cout << " ------------------------------------------- " << endl;
std::cout << "dgemm 与 CPUMM 的计算结果对比 : " << endl;
checkaccuracy(pfMatrixA, pfMatrixM, ROW, COL);
// CPUMMUP矩阵相乘
memset(pfMatrixM, 0, sizeof(double) * ROW * COL);
time_start = clock();
for(int i=0; i < ROW; i++)
{
for(int j=0; j < COL; j++)
{
for(int k=0; k < COL; k++)
{
pfMatrixM[i * COL + k] += pfMatrixB2[i * COL + j] * pfMatrixB[j * COL + k];
}
}
}
time_end = clock();
CPUMMUP_time = (double)(time_end - time_start) / CLOCKS_PER_SEC;
// 精度对比
std::cout << " ------------------------------------------- " << endl;
std::cout << "dgemm 与 CPUMMUP 的计算结果对比 : " << endl;
checkaccuracy(pfMatrixA, pfMatrixM, ROW, COL);
// 时间打印
std::cout << " ------------------------------------------- " << endl;
std::cout << "DCUMM calculation time (s) is " << DCUMM_time << endl;
std::cout << "CPUMM calculation time (s) is " << CPUMM_time << endl;
std::cout << "CPUMMUP calculation time (s) is " << CPUMMUP_time << endl;
printf( "transpose_c time (us) is %f\n", transpose_c_time);
std::cout << " ------------------------------------------- " << endl;
return;
}
文件名:makefile
LPACKHOME=/public/software/mathlib/lapack/gnu/3.8.0
HIP_INSTALL_PATH = /public/software/compiler/rocm/rocm-4.0.1
# 编译器
CC = hipcc
FC = gfortran
# 链接器
LD = gfortran
# 编译C程序的指令
CFLAGS = --offload-arch=gfx906
# 编译Fortran程序的指令
FFLAGS =
# 链接指令
LIBS = -lstdc++ -lamdhip64 -lhsa-runtime64 -lamd_comgr -lhsakmt -L$(HIP_INSTALL_PATH)/lib64/ -L$(HIP_INSTALL_PATH)/hip/lib -L$(HIP_INSTALL_PATH)/lib/ -L$(HIP_INSTALL_PATH)/hipfort/lib/ -L$(LPACKHOME)/lib64 -llapack -lblas
# Fortran源文件
FFILES = fMain.F90
# C/HIP源文件
CFILES = cMain.cpp
# 中间文件
OBJECTS = fMain.o cMain.o
# 目标文件
TARGET = main.out
all:
$(FC) $(FFLAGS) -c $(FFILES)
$(CC) $(CFLAGS) -c $(CFILES)
$(LD) $(LIBS) $(OBJECTS) -o $(TARGET)
clean:
rm -f *.o $(TARGET)