Accelerating Matrix Multiplication
My matrix multiplication tuning went through four steps:
- a plain triple loop with no tuning at all;
- transposing B and reducing the work to vector dot products;
- turning on OpenMP for multithreading;
- going full CUDA: wrapping cuBLAS and passing the result off as my own.
I used the Eigen3 linear algebra library as the benchmark. Rounds 1, 2 and 3 were crushing defeats; round 4 fought it to a draw. Which goes to show: there is no end to tuning.
I hadn't touched linear algebra in years; picking it back up took a day. The basic implementation of multiplying matrix A by matrix B is: dot a row of A with a column of B. That is one plain vector dot product, and its value becomes one element of the result matrix C. The row count of C comes from A, the column count from B.
Writing the triple loop I was still somewhat fuzzy on it; things only clicked once I transposed B and did row-against-row dot products.
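For reference, here is a minimal sketch of that untuned triple loop (step 1); the function name naiveMult is mine, and the MatD alias is the same one defined later in this post. Note how the innermost loop walks b column by column, which is exactly the cache-unfriendly access pattern the transpose trick removes:
#include <vector>
using MatD = std::vector<std::vector<double>>;

MatD naiveMult(const MatD &a, const MatD &b)
{
    const size_t rowA = a.size(), colA = a[0].size(), colB = b[0].size();
    MatD c(rowA, std::vector<double>(colB, 0.0));
    for (size_t i = 0; i != rowA; ++i)
        for (size_t j = 0; j != colB; ++j)
            for (size_t k = 0; k != colA; ++k)
                c[i][j] += a[i][k] * b[k][j]; // b[k][j] strides down a column: bad locality
    return c;
}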
Here is the OpenMP version. It is essentially identical to the transpose-and-dot version; all it adds is one directive in front of the for loop:
#pragma omp parallel for num_threads(8) schedule(dynamic)
You also have to enable OpenMP when compiling, e.g.:
g++ -fopenmp
// Vector dot product
template <typename T>
T dot(const std::vector<T> &x, const std::vector<T> &y)
{
    T matdot = 0;
    if (x.size() == y.size())
    {
        const size_t xysize = x.size();
        for (size_t i = 0; i != xysize; ++i)
        {
            matdot += x[i] * y[i];
        }
    }
    return matdot; // 0 if the sizes do not match
}
// Matrix transpose
template <typename T>
T transpose(const T &a)
{
    if (!a.empty() && !a[0].empty())
    {
        const int resultCol = a.size();    // rows become columns
        const int resultRow = a[0].size(); // columns become rows
        auto tp = a[0][0];                 // deduce the element type
        T result(resultRow, std::vector<decltype(tp)>(resultCol));
#pragma omp parallel for num_threads(8) schedule(dynamic)
        for (int i = 0; i < resultRow; ++i) // int and <: OpenMP requires a canonical loop form
        {
            for (int j = 0; j < resultCol; ++j)
            {
                result[i][j] = a[j][i];
            }
        }
        return result;
    }
    return T();
}
// Matrix multiply: transpose b, then dot rows of a with rows of transB
template <typename T>
T mult(const T &a, const T &b)
{
    if ((!a.empty() && !b.empty()) && (a[0].size() == b.size()))
    {
        const T transB = transpose(b); // rows of transB are contiguous columns of b
        const int aRow = a.size();
        const int transBRow = transB.size();
        auto tp = a[0][0]; // deduce the element type
        T ab(aRow, std::vector<decltype(tp)>(transBRow));
#pragma omp parallel for num_threads(8) schedule(dynamic)
        for (int i = 0; i < aRow; ++i)
        {
            for (int j = 0; j < transBRow; ++j)
            {
                ab[i][j] = dot(a[i], transB[j]);
            }
        }
        return ab;
    }
    return T();
}
To use the algorithms above, you need data structures like these:
using MatD = std::vector<std::vector<double>>;
using MatF = std::vector<std::vector<float>>;
C++ Primer says modern C++ should stay away from raw C arrays. Sorry, but honestly I was just being lazy.
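A quick usage sketch (the values are made up for illustration):
MatD a = {{1.0, 2.0},
          {3.0, 4.0}};
MatD b = {{5.0, 6.0},
          {7.0, 8.0}};
MatD c = mult(a, b); // {{19, 22}, {43, 50}}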
Couldn't beat Eigen, so I put a CUDA coat on it
Transposing B turns the dot products into row-major, sequential memory reads, which is why it is faster. OpenMP squeezes out the CPU's multithreaded compute, which is why that is faster too. Yet there was still at least an order of magnitude between me and Eigen, which I could not explain. SIMD instruction-set magic? No idea; I never dug into it.
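For context, the Eigen baseline I benchmarked against amounts to just this (the matrix size is arbitrary):
#include <Eigen/Dense>

int main()
{
    const int n = 2000; // arbitrary benchmark size
    Eigen::MatrixXd a = Eigen::MatrixXd::Random(n, n);
    Eigen::MatrixXd b = Eigen::MatrixXd::Random(n, n);
    Eigen::MatrixXd c = a * b; // Eigen's vectorized, cache-blocked GEMM
    return c(0, 0) > 0;        // use the result so the multiply is not optimized away
}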
So how to beat it? To beat one big guy, you bring in another big guy, and that is how I arrived at cuBLAS. There is an NVIDIA card sitting here idle; surely it's not just for gaming?
Install CUDA, start writing cuBLAS.
What follows is a long, ugly pile of code. If you have never looked at CUDA or cuBLAS you probably won't follow it, but it is actually simple; a few sentences cover it:
- Flatten the matrix into a memory-contiguous array. This is not a throwaway remark: the data structure I use is not contiguous, and if you miss this you get inexplicable bugs. GPU debugging is nothing like CPU debugging.
- Copy that contiguous array to the GPU. Host memory to device memory matters: the GPU cannot use host memory.
- Run the multiplication with cuBLAS's ready-made GEMM routine.
- Copy the finished contiguous array from device memory back into a host-side matrix.
A few notes:
- The data you hand to CUDA must be parallelizable, with mutually independent values; otherwise you have no idea what it is computing.
- Some budget players, me for example, cannot afford a card with huge VRAM. With RAM several times larger than VRAM, a big product such as 10000 x 10000 can crash the GPU path for lack of device memory even though the CPU path is fine. So I added a limit on the rows of A: split the matrix and compute it in batches. Works well; no more crashes. Of course, once you blow past host RAM it crashes no matter what.
- CUDA is much better at float multiplication than double. The evidence: on large matrices my float path is roughly twice as fast as Eigen's float multiply, while my double path is roughly half the speed of Eigen's double multiply.
- Going from no optimization to the transpose version gains about 50%. Multithreading gains about 10x, depending on the CPU's thread count. CUDA adds roughly another 5x, or at least 10x with float. All told that is about a 100x speedup, nowhere near the thousands-fold gains some gurus report. Maybe I still haven't grasped the essence of the algorithm.
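One subtlety worth spelling out before the code: cuBLAS assumes column-major storage, while my flattened arrays are row-major. A row-major matrix reinterpreted as column-major is simply its transpose, so instead of transposing anything I lean on the identity (A·B)^T = B^T·A^T: pass B as the first operand and A as the second, and the column-major result cuBLAS writes out is (A·B)^T, which read back row-major is exactly A·B. That is why the gemm calls below pass the dimensions as colB, rowA, colA and put d_B before d_A.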
Here is the code:
// cuBLAS matrix multiply, double precision
MatD cuMultD(const MatD &vecA, const MatD &vecB, const int rowALmt)
{
    int devID = 0;
    cudaDeviceProp deviceProp;
    cudaError_t error;
    cublasStatus_t status;
    cublasHandle_t handle;
    error = cudaGetDeviceProperties(&deviceProp, devID);
    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    int block_size = (deviceProp.major < 2) ? 8 : 16; // originally 16 : 32; crashed past 8000 rows, so halved for stability
    MatD result;
    int rowA = vecA.size();
    int rowABuf = 0;
    int rowANum = 0;
    const int colA = vecA[0].size();
    const int rowB = vecB.size();
    const int colB = vecB[0].size();
    dim3 threads(block_size, block_size);
    dim3 grid(colB / threads.x, (rowA > rowALmt ? rowALmt : rowA) / threads.y); // note: never passed to cuBLAS, which picks its own launch configuration
    cublasCreate(&handle);
    // Device buffers are sized for one batch of at most rowALmt rows of A.
    size_t size_A = colA * (size_t)(rowA > rowALmt ? rowALmt : rowA);
    size_t mem_size_A = sizeof(double) * size_A;
    size_t size_B = (size_t)colB * rowB;
    size_t mem_size_B = sizeof(double) * size_B;
    size_t size_C = colB * (size_t)(rowA > rowALmt ? rowALmt : rowA);
    size_t mem_size_C = sizeof(double) * size_C;
    double *d_A, *d_B, *d_C;
    error = cudaMalloc((void **)&d_A, mem_size_A);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_B, mem_size_B);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_C, mem_size_C);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    double *h_CUBLAS = (double *)malloc(mem_size_C);
    std::vector<double> B; // B never changes, so flatten it once, outside the batching loop
    B.reserve(size_B);
    for (int i = 0; i != rowB; ++i)
    {
        B.insert(B.end(), vecB[i].begin(), vecB[i].end());
    }
    auto h_B = B.data();
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); // B is constant, so this copy is hoisted out of the loop too
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    while (rowA > 0)
    {
        rowABuf = rowA;
        if (rowA > rowALmt)
        {
            rowA = rowALmt; // clamp this batch to the row limit
        }
        std::vector<double> A;
        A.reserve((size_t)rowA * colA);
        int loopbeg = rowANum * rowALmt;
        int loopend = rowA + loopbeg;
        for (int i = loopbeg; i != loopend; ++i)
        {
            A.insert(A.end(), vecA[i].begin(), vecA[i].end());
        }
        auto h_A = A.data();
        error = cudaMemcpy(d_A, h_A, sizeof(double) * rowA * colA, cudaMemcpyHostToDevice); // rowA is already clamped here
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        {
            const double alpha = 1.0;
            const double beta = 0.0;
            // cuBLAS is column-major, so compute (A*B)^T = B^T * A^T by passing
            // B first; the column-major result is exactly A*B in row-major order.
            status = cublasDgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    colB, rowA, colA,
                                    &alpha,
                                    d_B, colB,
                                    d_A, colA,
                                    &beta,
                                    d_C, colB);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasDgemm_v2 returned error code %d, line(%d)\n", status, __LINE__);
                exit(EXIT_FAILURE);
            }
            error = cudaDeviceSynchronize();
            if (error != cudaSuccess)
            {
                printf("cudaDeviceSynchronize returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
            error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); // copies the whole batch buffer; only the first rowA rows are used below
            if (error != cudaSuccess)
            {
                printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
            std::vector<double> buff(colB);
            for (int i = 0; i != rowA; ++i)
            {
                buff.assign(h_CUBLAS + i * colB, h_CUBLAS + (i + 1) * colB);
                result.push_back(buff);
            }
        }
        rowA = rowABuf - rowALmt; // rows left for the next batch
        ++rowANum;
    }
    {
        status = cublasDestroy_v2(handle);
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasDestroy_v2 returned error code %d, line(%d)\n", status, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_A);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_B);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_C);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        free(h_CUBLAS);
    }
    return result;
}
// cuBLAS matrix multiply, single precision
MatF cuMultF(const MatF &vecA, const MatF &vecB, const int rowALmt)
{
    int devID = 0;
    cudaDeviceProp deviceProp;
    cudaError_t error;
    cublasStatus_t status;
    cublasHandle_t handle;
    error = cudaGetDeviceProperties(&deviceProp, devID);
    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    int block_size = (deviceProp.major < 2) ? 8 : 16; // originally 16 : 32; crashed past 8000 rows, so halved for stability
    MatF result;
    int rowA = vecA.size();
    int rowABuf = 0;
    int rowANum = 0;
    const int colA = vecA[0].size();
    const int rowB = vecB.size();
    const int colB = vecB[0].size();
    dim3 threads(block_size, block_size);
    dim3 grid(colB / threads.x, (rowA > rowALmt ? rowALmt : rowA) / threads.y); // note: never passed to cuBLAS, which picks its own launch configuration
    cublasCreate(&handle);
    // Device buffers are sized for one batch of at most rowALmt rows of A.
    size_t size_A = colA * (size_t)(rowA > rowALmt ? rowALmt : rowA);
    size_t mem_size_A = sizeof(float) * size_A;
    size_t size_B = (size_t)colB * rowB;
    size_t mem_size_B = sizeof(float) * size_B;
    size_t size_C = colB * (size_t)(rowA > rowALmt ? rowALmt : rowA);
    size_t mem_size_C = sizeof(float) * size_C;
    float *d_A, *d_B, *d_C;
    error = cudaMalloc((void **)&d_A, mem_size_A);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_B, mem_size_B);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_C, mem_size_C);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    float *h_CUBLAS = (float *)malloc(mem_size_C);
    std::vector<float> B; // B never changes, so flatten it once, outside the batching loop
    B.reserve(size_B);
    for (int i = 0; i != rowB; ++i)
    {
        B.insert(B.end(), vecB[i].begin(), vecB[i].end());
    }
    auto h_B = B.data();
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); // B is constant, so this copy is hoisted out of the loop too
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    while (rowA > 0)
    {
        rowABuf = rowA;
        if (rowA > rowALmt)
        {
            rowA = rowALmt; // clamp this batch to the row limit
        }
        std::vector<float> A;
        A.reserve((size_t)rowA * colA);
        int loopbeg = rowANum * rowALmt;
        int loopend = rowA + loopbeg;
        for (int i = loopbeg; i != loopend; ++i)
        {
            A.insert(A.end(), vecA[i].begin(), vecA[i].end());
        }
        auto h_A = A.data();
        error = cudaMemcpy(d_A, h_A, sizeof(float) * rowA * colA, cudaMemcpyHostToDevice); // rowA is already clamped here
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        {
            const float alpha = 1.0f;
            const float beta = 0.0f;
            // Same column-major trick as the double version: B first, so the
            // column-major (A*B)^T reads back as row-major A*B.
            status = cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    colB, rowA, colA,
                                    &alpha,
                                    d_B, colB,
                                    d_A, colA,
                                    &beta,
                                    d_C, colB);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasSgemm_v2 returned error code %d, line(%d)\n", status, __LINE__);
                exit(EXIT_FAILURE);
            }
            error = cudaDeviceSynchronize();
            if (error != cudaSuccess)
            {
                printf("cudaDeviceSynchronize returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
            error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); // copies the whole batch buffer; only the first rowA rows are used below
            if (error != cudaSuccess)
            {
                printf("cudaMemcpy returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
            std::vector<float> buff(colB);
            for (int i = 0; i != rowA; ++i)
            {
                buff.assign(h_CUBLAS + i * colB, h_CUBLAS + (i + 1) * colB);
                result.push_back(buff);
            }
        }
        rowA = rowABuf - rowALmt; // rows left for the next batch
        ++rowANum;
    }
    {
        status = cublasDestroy_v2(handle);
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasDestroy_v2 returned error code %d, line(%d)\n", status, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_A);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_B);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        error = cudaFree(d_C);
        if (error != cudaSuccess)
        {
            printf("cudaFree returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
        free(h_CUBLAS);
    }
    return result;
}
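Finally, a hypothetical call site; the sizes are toy values, and the batch limit of 2048 rows is just a number that happened to fit my card's VRAM, so tune it to yours. Linking needs cuBLAS, e.g. nvcc -O2 -lcublas.
MatF a(4096, std::vector<float>(4096, 1.0f)); // toy inputs
MatF b(4096, std::vector<float>(4096, 2.0f));
MatF c = cuMultF(a, b, 2048); // send A to the GPU at most 2048 rows per batch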