Here is a matrix-multiplication wrapper around the cuBLAS library.

The storage underneath is just a std::vector. For double and float it can call straight into cuBLAS, or fall back to a plain triple loop, so both small matrix products and matrix-vector products are covered. The feature set is admittedly bare-bones, but it gets the job done.

As far as cuBLAS tuning goes, this is basically it: large matrices work, small ones stay flexible, and it handles 8000x8000 as readily as 4x4. A short usage sketch follows the header at the end of the post.

#ifndef MATRIX
#define MATRIX

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <utility>
#include <vector>
#include <typeinfo>

namespace Lnr
{
    template <typename T>
    struct Vec;

    template <typename T>
    struct Mat
    {
        Mat() = delete;

        Mat(unsigned r, unsigned c, T d = 0)
            : mat(std::vector<T>(r * c, d)), row(r), col(c) {}

        Mat(const std::vector<T> &rhs, unsigned r, unsigned c)
            : mat(rhs), row(r), col(c) {}

        Mat(std::vector<T> &&rhs, unsigned r, unsigned c)
            : mat(std::move(rhs)), row(r), col(c) {}

        Mat(const Mat &rhs)
            : mat(rhs.mat), row(rhs.row), col(rhs.col) {}

        Mat(Mat &&rhs)
            : mat(std::move(rhs.mat)), row(rhs.row), col(rhs.col) {}

        Mat &operator=(const Mat &rhs)
        {
            mat = rhs.mat;
            row = rhs.row;
            col = rhs.col;
            return *this;
        }

        Mat &operator=(Mat &&rhs)
        {
            if (this != &rhs)
            {
                mat = std::move(rhs.mat);
                row = rhs.row;
                col = rhs.col;
            }
            return *this;
        }

        ~Mat() {}

        const T *data() const
        {
            return mat.data();
        }

        T *data()
        {
            return mat.data();
        }

        const std::vector<T> &getdata() const { return mat; }

        // matrix transpose
        void transpose()
        {
            if (row * col != 0)
            {
                Mat<T> result(col, row);
#pragma omp parallel for num_threads(8) schedule(dynamic)
                for (size_t i = 0; i < row; ++i)
                {
                    for (size_t j = 0; j < col; ++j)
                    {
                        result(j, i) = (*this)(i, j);
                    }
                }
                *this = std::move(result);
            }
        }

        inline const T &operator()(unsigned r, unsigned c) const
        {
            if (r < row && c < col)
            {
                return mat[r * col + c];
            }
            else
            {
                std::cout << "  out of row or col at " << __LINE__ << std::endl;
            }
            return mat[0];
        }

        inline T &operator()(unsigned r, unsigned c)
        {
            if (r < row && c < col)
            {
                return mat[r * col + c];
            }
            else
            {
                std::cout << "  out of row or col  at " << __LINE__ << std::endl;
            }
            return mat[0];
        }

        // matrix-vector product
        Vec<T> operator*(const Vec<T> &rhs) const
        {
            if (col == rhs.row)
            {
                Vec<T> result(row);

#pragma omp parallel for num_threads(8) schedule(dynamic)
                for (size_t i = 0; i < row; ++i)
                {
                    for (size_t j = 0; j < col; ++j)
                    {
                        result(i) += (*this)(i, j) * rhs(j);
                    }
                }

                return result;
            }
            return Vec<T>(1);
        }

        // matrix-matrix product
        Mat<T> operator*(const Mat<T> &rhs) const
        {
            if (col == rhs.row)
            {
                Mat<T> result(row, rhs.col);
                Mat<T> buff(rhs);
                buff.transpose(); // transpose rhs so both operands are read row-wise in the inner loop

#pragma omp parallel for num_threads(8) schedule(dynamic)
                for (size_t i = 0; i < row; ++i)
                {
                    for (size_t j = 0; j < buff.row; ++j)
                    {
                        for (size_t k = 0; k < col; ++k)
                        {
                            result(i, j) += (*this)(i, k) * buff(j, k);
                        }
                    }
                }

                return result;
            }

            return Mat<T>(1, 1);
        }

        // CUDA initialization: select device 0 and query its properties
        void cuInit(int &devID)
        {
            devID = 0;
            cudaError_t error;
            cudaDeviceProp deviceProp;

            error = cudaSetDevice(devID);
            err(error);

            error = cudaGetDevice(&devID);
            err(error);

            error = cudaGetDeviceProperties(&deviceProp, devID);
            err(error);
        }

        // cuBLAS matrix-matrix product; rowLmt caps how many rows of the left matrix are processed per device pass
        Mat<T> cuMult(const Mat<T> &rhs, const unsigned &rowLmt) const
        {
            std::vector<T> result;
            result.reserve(row * rhs.col);
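            // Strategy: B is copied to the device once; A is streamed through in blocks of
            // at most rowLmt rows, each block is multiplied on the GPU, and the partial
            // results are appended to `result`. This keeps device memory bounded even for
            // a very tall left matrix.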

            cudaError_t error;
            cublasStatus_t status;
            cublasHandle_t handle;
            cudaDeviceProp deviceProp;
            int devID = 0;
            error = cudaGetDeviceProperties(&deviceProp, devID);
            err(error);

            int block_size = (deviceProp.major < 2) ? 8 : 16; // originally 16 : 32; crashed beyond 8000x8000, so halved for stability

            dim3 threads(block_size, block_size);
            dim3 grid(rhs.col / threads.x, (row > rowLmt ? rowLmt : row) / threads.y); // not used by the cuBLAS call below
            cublasCreate(&handle);

            unsigned int mem_size_A = sizeof(T) * col * (row > rowLmt ? rowLmt : row);
            unsigned int mem_size_B = sizeof(T) * rhs.col * rhs.row;
            unsigned int mem_size_C = sizeof(T) * rhs.col * (row > rowLmt ? rowLmt : row);

            T *d_A, *d_B, *d_C;

            error = cudaMalloc((void **)&d_A, mem_size_A);
            err(error);

            error = cudaMalloc((void **)&d_B, mem_size_B);
            err(error);

            error = cudaMalloc((void **)&d_C, mem_size_C);
            err(error);

            T *h_CUBLAS = (T *)malloc(mem_size_C);

            const T *h_B = rhs.data();

            // B does not change, so copy it to the device once, before the loop.
            error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
            err(error);

            int loopNum = 0;
            for (int i = row; i > 0; i -= rowLmt)
            {
                int rowLoop = i > rowLmt ? rowLmt : i;

                const T *h_A = (*this).data() + loopNum * rowLmt * col; // easy to get wrong: each pass the pointer
                                                                        // advances by passes * rowLmt * columns of A

                error = cudaMemcpy(d_A, h_A, col * rowLoop * sizeof(T), cudaMemcpyHostToDevice);
                err(error);

                {
                    cublasCore(handle, error, status, rhs.col, rowLoop, d_B, d_A, d_C, T(0));

                    error = cudaMemcpy(h_CUBLAS, d_C, rowLoop * rhs.col * sizeof(T), cudaMemcpyDeviceToHost);
                    err(error);

                    result.insert(result.end(), h_CUBLAS, h_CUBLAS + rowLoop * rhs.col); // output size: rows in this pass x columns of rhs
                }

                ++loopNum;
            }

            {
                status = cublasDestroy_v2(handle);
                stat(status);

                error = cudaFree(d_A);
                err(error);

                error = cudaFree(d_B);
                err(error);

                error = cudaFree(d_C);
                err(error);

                free(h_CUBLAS);
            }
            return Mat<T>(std::move(result), row, rhs.col);
        }

    private:
        friend struct Vec<T>;

        void err(cudaError_t &error) const
        {
            if (error != cudaSuccess)
            {
                printf("cuda returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        void stat(cublasStatus_t &status) const
        {
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                printf(" cublas returned error code %d, line(%d)\n", status, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        void cublasCore(cublasHandle_t &handle, cudaError_t &error, cublasStatus_t &status, unsigned rhscol, unsigned rowLoop, T *d_B, T *d_A, T *d_C, float f) const
        {
            const T alpha = 1.0;
            const T beta = 0.0;
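            // Mat stores its data row-major while cuBLAS assumes column-major. Passing the
            // operands in the order B, A computes C^T = B^T * A^T in column-major terms,
            // which read back row-major is exactly A * B, so no explicit transpose or copy
            // is needed. (The trailing float/double tag parameter only selects the overload;
            // its value is ignored.)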
            status = cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    rhscol, rowLoop, col,
                                    &alpha,
                                    d_B, rhscol,
                                    d_A, col,
                                    &beta,
                                    d_C, rhscol);
            stat(status);
            error = cudaDeviceSynchronize();
            err(error);
        }

        void cublasCore(cublasHandle_t &handle, cudaError_t &error, cublasStatus_t &status, unsigned rhscol, unsigned rowLoop, T *d_B, T *d_A, T *d_C, double d) const
        {
            const T alpha = 1.0;
            const T beta = 0.0;
            status = cublasDgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    rhscol, rowLoop, col,
                                    &alpha,
                                    d_B, rhscol,
                                    d_A, col,
                                    &beta,
                                    d_C, rhscol);
            stat(status);
            error = cudaDeviceSynchronize();
            err(error);
        }

        std::vector<T> mat;
        unsigned row;
        unsigned col;
    };

    template <typename T>
    struct Vec
    {
        Vec() = delete;

        Vec(unsigned r, T d = 0)
            : vec(std::vector<T>(r, d)), row(r) {}

        Vec(const Vec &rhs)
            : vec(rhs.vec), row(rhs.row) {}

        Vec(Vec &&rhs)
            : vec(std::move(rhs.vec)), row(rhs.row) {}

        Vec &operator=(const Vec &rhs)
        {
            vec = rhs.vec;
            row = rhs.row;
            return *this;
        }

        Vec &operator=(Vec &&rhs)
        {
            if (this != &rhs)
            {
                vec = std::move(rhs.vec);
                row = rhs.row;
            }
            return *this;
        }

        ~Vec() {}

        // element access
        T &operator()(unsigned num)
        {
            if (num < row)
            {
                return vec[num];
            }
            std::cout << "  out of range at " << __LINE__ << std::endl;
            return vec[0];
        }

        // const element access
        const T &operator()(unsigned num) const
        {
            if (num < row)
            {
                return vec[num];
            }
            std::cout << "  out of range at " << __LINE__ << std::endl;
            return vec[0];
        }

        // pointer to the first element
        inline T *data()
        {
            return vec.data();
        }

        // pointer to the first element (const)
        inline const T *data() const
        {
            return vec.data();
        }

        // dot product
        T operator*(const Vec &x) const
        {
            T matdot = 0;
            if (row == x.row)
            {
                for (size_t i = 0; i != row; ++i)
                {
                    matdot += x(i) * (*this)(i);
                }
            }
            return matdot;
        }

        // vector-matrix product: x * M, computed as M^T * x
        Vec<T> operator*(const Mat<T> &rhs) const
        {
            if (row == rhs.row)
            {
                Mat<T> result(rhs);
                result.transpose();
                return result * (*this);
            }
            return Vec<T>(1);
        }

    private:
        friend struct Mat<T>;

        std::vector<T> vec;
        unsigned row;
    };

} // namespace Lnr

#endif
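
For reference, a minimal usage sketch. This is only an illustration under assumptions: the file name matrix.cuh, the 4x3 / 3x2 sizes and the rowLmt of 2048 are placeholders; build with nvcc (or g++ plus the CUDA include/library paths), link -lcublas -lcudart, and enable OpenMP.

#include "matrix.cuh" // assumed name for the header above
#include <iostream>

int main()
{
    using Lnr::Mat;
    using Lnr::Vec;

    Mat<double> A(4, 3, 1.0); // 4x3 matrix filled with 1.0
    Mat<double> B(3, 2, 2.0); // 3x2 matrix filled with 2.0

    Mat<double> C = A * B; // plain OpenMP triple loop on the CPU
    std::cout << "C(0,0) = " << C(0, 0) << std::endl; // 6

    Vec<double> x(3, 1.0); // length-3 vector of ones
    Vec<double> y = A * x; // matrix-vector product
    std::cout << "y(0) = " << y(0) << std::endl; // 3

    int devID = 0;
    A.cuInit(devID); // select and query the CUDA device
    Mat<double> D = A.cuMult(B, 2048); // cuBLAS path, at most 2048 rows of A per pass
    std::cout << "D(0,0) = " << D(0, 0) << std::endl; // same as C(0,0)
    return 0;
}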