底层直接用vector模板,对于double和float可以直接开cublas,也可以用普通的三层循环,小规模矩阵乘法,向量乘法都可以。当然,功能可以说是比较简陋,够用就成
对于cublas的调优,基本就是他了。大矩阵能用,小矩阵灵活,80008000的不惧,44的也可。
#ifndef MATRIX
#define MATRIX
#include <cuda/cuda_runtime.h>
#include <cuda/cublas_v2.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <typeinfo>
namespace Lnr
{
template <typename T>
struct Vec;
template <typename T>
struct Mat
{
Mat() = delete;
Mat(unsigned r = 1, unsigned c = 1, T d = 0)
: mat(std::vector<T>(r * c, d)), row(r), col(c) {}
Mat(const std::vector<T> &rhs, unsigned r, unsigned c)
: mat(rhs), row(r), col(c) {}
Mat(std::vector<T> &&rhs, unsigned r, unsigned c)
: mat(rhs), row(r), col(c) {}
Mat(const Mat &rhs)
: mat(rhs.mat), row(rhs.row), col(rhs.col) {}
Mat(Mat &&rhs)
: mat(rhs.mat), row(rhs.row), col(rhs.col) {}
Mat &operator=(const Mat &rhs)
{
mat = rhs.mat;
row = rhs.row;
col = rhs.col;
return *this;
}
Mat &operator=(Mat &&rhs)
{
if (this != &rhs)
{
mat = std::move(rhs.mat);
row = rhs.row;
col = rhs.col;
}
return *this;
}
~Mat() {}
const T *data() const
{
return mat.data();
}
T *data()
{
return mat.data();
}
const std::vector<T> getdata() const { return mat; }
//矩阵转置
void transpose()
{
if (row * col != 0)
{
Mat<T> result(col, row);
#pragma omp parallel for num_threads(8) schedule(dynamic)
for (size_t i = 0; i != row; ++i)
{
for (size_t j = 0; j != col; ++j)
{
std::swap(result(j, i), (*this)(i, j));
}
}
*this = std::move(result);
}
}
inline const T &operator()(unsigned r, unsigned c) const
{
if (r < row && c < col)
{
return mat[r * col + c];
}
else
{
std::cout << " out of row or col at " << __LINE__ << std::endl;
}
return mat[0];
}
inline T &operator()(unsigned r, unsigned c)
{
if (r < row && c < col)
{
return mat[r * col + c];
}
else
{
std::cout << " out of row or col at " << __LINE__ << std::endl;
}
return mat[0];
}
//矩阵向量乘积
Vec<T> operator*(const Vec<T> &rhs) const
{
if (col == rhs.row)
{
Vec<T> result(row);
#pragma omp parallel for num_threads(8) schedule(dynamic)
for (size_t i = 0; i != row; ++i)
{
for (size_t j = 0; j != col; ++j)
{
result(i) += (*this)(i, j) * rhs(j);
}
}
return result;
}
return Vec<T>(1);
}
//矩阵相乘
Mat<T> operator*(const Mat<T> &rhs) const
{
if (col == rhs.row)
{
Mat<T> result(row, rhs.col);
Mat<T> buff(rhs);
buff.transpose();
#pragma omp parallel for num_threads(8) schedule(dynamic)
for (size_t i = 0; i != row; ++i)
{
for (size_t j = 0; j != buff.row; ++j)
{
for (size_t k = 0; k != col; ++k)
{
result(i, j) += (*this)(i, k) * buff(j, k);
}
}
}
return result;
}
return Mat<T>(1, 1);
}
//CUDA初始化
void cuInit(int &devID)
{
devID = 0;
cudaError_t error;
cudaDeviceProp deviceProp;
error = cudaSetDevice(devID);
err(error);
error = cudaGetDevice(&devID);
err(error);
error = cudaGetDeviceProperties(&deviceProp, devID);
err(error);
}
//cublas矩阵相乘
Mat<T> cuMult(const Mat<T> &rhs, const unsigned &rowLmt) const
{
std::vector<T> result;
result.reserve(row * rhs.col);
cudaError_t error;
cublasStatus_t status;
cublasHandle_t handle;
cudaDeviceProp deviceProp;
int devID = 0;
error = cudaGetDeviceProperties(&deviceProp, devID);
err(error);
int block_size = (deviceProp.major < 2) ? 8 : 16; //原来为16:32,过8000崩溃,设置小一倍,增加稳定性。
dim3 threads(block_size, block_size);
dim3 grid(rhs.col / threads.x, (row > rowLmt ? rowLmt : row) / threads.y);
cublasCreate(&handle);
unsigned int mem_size_A = sizeof(T) * col * (row > rowLmt ? rowLmt : row);
unsigned int mem_size_B = sizeof(T) * rhs.col * rhs.row;
unsigned int mem_size_C = sizeof(T) * rhs.col * (row > rowLmt ? rowLmt : row);
T *d_A, *d_B, *d_C;
error = cudaMalloc((void **)&d_A, mem_size_A);
err(error);
error = cudaMalloc((void **)&d_B, mem_size_B);
err(error);
error = cudaMalloc((void **)&d_C, mem_size_C);
err(error);
T *h_CUBLAS = (T *)malloc(mem_size_C);
const T *h_B = rhs.data();
//B不变所以提到循环之前。
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
err(error);
int loopNum = 0;
for (int i = row; i > 0; i -= rowLmt)
{
int rowLoop = i > rowLmt ? rowLmt : i;
const T *h_A = (*this).data() + loopNum * rowLmt * col; //易出错地方,每循环一次,
//指针移动 “循环次数 * 限定行数 * 左矩阵列数。
error = cudaMemcpy(d_A, h_A, col * rowLoop * sizeof(T), cudaMemcpyHostToDevice);
err(error);
{
cublasCore(handle, error, status, rhs.col, rowLoop, d_B, d_A, d_C, T(0));
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
err(error);
result.insert(result.end(), h_CUBLAS, h_CUBLAS + rowLoop * rhs.col); //输出尺寸:计算行 X 后矩阵列数
}
++loopNum;
}
{
status = cublasDestroy_v2(handle);
stat(status);
error = cudaFree(d_A);
err(error);
error = cudaFree(d_B);
err(error);
error = cudaFree(d_C);
err(error);
free(h_CUBLAS);
}
return Mat<T>(std::move(result), row, rhs.col);
}
private:
friend struct Vec<T>;
void err(cudaError_t &error) const
{
if (error != cudaSuccess)
{
printf("cuda returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
void stat(cublasStatus_t &status) const
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf(" cublas returned error code %d, line(%d)\n", status, __LINE__);
exit(EXIT_FAILURE);
}
}
void cublasCore(cublasHandle_t &handle, cudaError_t &error, cublasStatus_t &status, unsigned rhscol, unsigned rowLoop, T *d_B, T *d_A, T *d_C, float f) const
{
const T alpha = 1.0;
const T beta = 0.0;
status = cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
rhscol, rowLoop, col,
&alpha,
d_B, rhscol,
d_A, col,
&beta,
d_C, rhscol);
stat(status);
error = cudaDeviceSynchronize();
err(error);
}
void cublasCore(cublasHandle_t &handle, cudaError_t &error, cublasStatus_t &status, unsigned rhscol, unsigned rowLoop, T *d_B, T *d_A, T *d_C, double d) const
{
const T alpha = 1.0;
const T beta = 0.0;
status = cublasDgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
rhscol, rowLoop, col,
&alpha,
d_B, rhscol,
d_A, col,
&beta,
d_C, rhscol);
stat(status);
error = cudaDeviceSynchronize();
err(error);
}
std::vector<T> mat;
unsigned row;
unsigned col;
};
template <typename T>
struct Vec
{
Vec() = delete;
Vec(unsigned r = 1, T d = 0)
: vec(std::vector<T>(r, d)), row(r) {}
Vec(const Vec &rhs)
: vec(rhs.vec), row(rhs.row) {}
Vec(Vec &&rhs)
: vec(rhs.vec), row(rhs.row) {}
Vec &operator=(const Vec &rhs)
{
vec = rhs.vec;
row = rhs.row;
return *this;
}
Vec &operator=(Vec &&rhs)
{
if (this != &rhs)
{
vec = std::move(rhs.vec);
row = rhs.row;
}
return *this;
}
~Vec() {}
//向量取值
T &operator()(unsigned num)
{
if (num < row)
{
return vec[num];
}
}
//const向量取值
const T &operator()(unsigned num) const
{
if (num < row)
{
return vec[num];
}
}
//向量取首元素指针
inline T *data()
{
return vec.data();
}
//向量取首元素指针
inline const T *data() const
{
return vec.data();
}
//向量点乘
T operator*(const Vec &x) const
{
T matdot = 0;
if (row == x.row)
{
for (size_t i = 0; i != row; ++i)
{
matdot += x(i) * (*this)(i);
}
}
return matdot;
}
//向量矩阵乘积
Vec<T> operator=(const Mat<T> &rhs) const
{
if (row = rhs.row)
{
Mat<T> result(rhs);
result.transpose();
return result * (*this);
}
}
private:
friend struct Mat<T>;
std::vector<T> vec;
unsigned row;
};
} // namespace Lnr
#endif