python 加速计算矩阵乘法
小记
2022.4.30
做的毕设要使用C++对python做加速,用的时pybind11接口。
原以为即使用C++加速了,但自己写的代码总不上库里的,加速不了,结果确是真的可以大大加速!
暂时不用Cache Blocking,目前用这个调换循环顺序的算法就可以加速很多了。
参考网站: https://zhuanlan.zhihu.com/p/362854057
方案1:用C++实现朴素算法
//2维矩阵相乘,朴素算法
py::array_t<double> matmul_2d(py::array_t<double>& input1, py::array_t<double>& input2)
{
// 获取input1, input2的信息
py::buffer_info buf1 = input1.request();
py::buffer_info buf2 = input2.request();
if (buf1.ndim != 2 || buf2.ndim != 2)
{
throw std::runtime_error("Number of dimensions must be two!");
}
if (buf1.shape[1] != buf2.shape[0])
{
throw std::runtime_error("Input shape must match");
}
const int buf1_r = buf1.shape[0];
const int buf1_c = buf1.shape[1];
const int buf2_r = buf2.shape[0];
const int buf2_c = buf2.shape[1];
const int buf3_r = buf1.shape[0];
const int buf3_c = buf2.shape[1];
//申请空间,确定结果矩阵的形状
auto result = py::array_t<double>({ buf3_r,buf3_c });
py::buffer_info buf3 = result.request();
//获取numpy.ndarray 数据指针
double* ptr1 = (double*)buf1.ptr;
double* ptr2 = (double*)buf2.ptr;
double* ptr3 = (double*)buf3.ptr;
//指针访问numpy.ndarray
//loc1,loc3的提出会影响代码易读性,但是可以加速
for (int i = 0; i < buf3_r; ++i)
{
int loc3 = i * buf3_c;
for (int j = 0; j < buf3_c;++j)
{
double sum = 0;
int loc1 = i * buf1_c;
for (int k = 0; k < buf1_c; ++k)
{
sum += ptr1[ loc1+ k] * ptr2[k * buf2_c + j];
}
ptr3[loc3 + j] = sum;
}
}
return result;
}
方案二:C++实现,调换循环顺序的算法
算法思想来自于这里 : https://zhuanlan.zhihu.com/p/146250334
/// 2d矩阵相乘,调换循环顺序的算法
py::array_t<double> matmul_2d2 (py::array_t<double> & input1, py::array_t<double> & input2)
{
// 获取input1, input2的信息
py::buffer_info buf1 = input1.request();
py::buffer_info buf2 = input2.request();
if (buf1.ndim != 2 || buf2.ndim != 2)
{
throw std::runtime_error("Number of dimensions must be two!");
}
if (buf1.shape[1] != buf2.shape[0])
{
throw std::runtime_error("Input shape must match");
}
const int buf1_r = buf1.shape[0];
const int buf1_c = buf1.shape[1];
const int buf2_r = buf2.shape[0];
const int buf2_c = buf2.shape[1];
const int buf3_r = buf1.shape[0];
const int buf3_c = buf2.shape[1];
//申请空间,确定结果矩阵的形状
auto result = py::array_t<double>({ buf3_r,buf3_c });
py::buffer_info buf3 = result.request();
//获取numpy.ndarray 数据指针
double* ptr1 = (double*)buf1.ptr;
double* ptr2 = (double*)buf2.ptr;
double* ptr3 = (double*)buf3.ptr;
for (int i = 0; i < buf3_r; ++i)
for (int j = 0; j < buf3_c; ++j)
ptr3[i * buf3_c + j] = 0.0;
//return result;
//指针访问numpy.ndarray
for (int i = 0; i < buf3_r; ++i)
{
for (int k = 0; k < buf1_c; ++k)
{
double v = ptr1[i * buf1_c+k];
int loc3 = i * buf3_c;
int loc2 = k * buf2_c;
for (int j = 0; j < buf3_c; ++j)
{
ptr3[loc3+j]+= v* ptr2[loc2 + j];
}
}
}
return result;
}
方案三:使用numpy
测速代码
import numpy as np
from icecream import ic
import example
import time
import platform
#生成矩阵
a=np.random.randint(2,4,(400,300))
b=np.random.randint(2,4,(300,500))
#测试
T1 = time.perf_counter()
c1=example.matmul_2d(a,b)
T2 =time.perf_counter()
print('算法1(朴素C++)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))
#算法2
T1 = time.perf_counter()
c2=example.matmul_2d2(a,b)
T2 =time.perf_counter()
print('算法2(调换循环顺序C++)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))
#使用python算
T1 = time.perf_counter()
c3=np.matmul(a,b)
T2 =time.perf_counter()
print('算法3(numpy)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))
#确保计算结果相同
print((c1==c2).all())
print((c1==c3).all())
"""
算法1(朴素C++)——程序运行时间:74.98659999999995毫秒
算法2(调换循环顺序C++)——程序运行时间:44.84659999999996毫秒
算法3(numpy)——程序运行时间:80.46960000000003毫秒
True
True
"""