CUDA cusolverDn＜t＞gesvd()函数认知

不正道

已于 2022-11-03 17:14:35 修改

阅读量1.2k

点赞数 2

分类专栏：踩坑记录文章标签： matlab 开发语言 c++ 经验分享

于 2022-10-28 11:32:08 首次发布

本文链接：https://blog.csdn.net/weixin_49069890/article/details/127557125

版权

踩坑记录专栏收录该内容

9 篇文章 0 订阅

订阅专栏

函数说明
cusolverDn<t>gesvd()函数用于矩阵的奇异值分解，和Matlab的svd函数对应。
数学简介
- 奇异值分解的公式为：
  
  这里用S代表西格玛，即A = U*S*VH。我们的最终目的就是计算出右边三个矩阵。
- 矩阵尺寸：
  设：A为m行n列的矩阵，则U为m行m列，S为m行n列，VH为n行n列。注：后面三个矩阵的秩和A的秩相等。
- 因为矩阵A、U、S、VH都代表了一次线性变换，又因为公式右边等于左边。所以，我们可以说右面三个矩阵对应的线性变换等价于左边，即将A的线性变换分解为三步，U、S、VH分别代表了旋转、拉伸、旋转。
- U、S、VH求解过程
- SVD的意义和用途

matlab计算
对矩阵A做svd分解，matlab代码如下：

% 生成矩阵A
A = [
    1 2
    2 5
    3 6
] + [
    0i 3i
    -3i 0i
    -5i -2i
];
% svd分解
[U,S,V] = svd(A);
% 打印结果
A       %原矩阵
S       %奇异值
U       %左奇异向量
VH = V' %右奇异向量的共轭转置（这样打印为了和CUDA比较）

matlab执行结果：

A =
   1.0000 + 0.0000i   2.0000 + 3.0000i
   2.0000 - 3.0000i   5.0000 + 0.0000i
   3.0000 - 5.0000i   6.0000 - 2.0000i
S =
   11.0864         0
         0    1.7583
         0         0
U =
  -0.3085 - 0.0443i  -0.7870 - 0.2161i   0.4850 - 0.0398i
  -0.3564 + 0.4239i  -0.3013 - 0.0884i  -0.6875 + 0.3494i
  -0.3575 + 0.6844i   0.4126 - 0.2554i   0.3774 - 0.1612i
VH =
  -0.6122 + 0.0000i  -0.5453 - 0.5726i
   0.7907 + 0.0000i  -0.4222 - 0.4433i

CUDA函数原型（以cuDoubleComplex类型举例）
cusolverDnZgesvd (

cusolverStatus_t				//状态值
cusolverDnZgesvd (				//函数名，D代表稠密矩阵，Z代表双精度复数
    cusolverDnHandle_t handle,	//句柄，上下文相关，估计和FILE指针类似
    signed char jobu,			//U的保存方式。'A'-all，不化简，以m行m列保存，不论A是否满秩；'S'-simplify，化简；
    signed char jobvh,			//同jobu
    int m,						//A的行数
    int n,						//A的列数
    cuDoubleComplex *A,			//A的指针
    int lda,					//A的主维度
    double *S,					//S的指针，结果从大到小排序，即S[i]>S[i+1]
    cuDoubleComplex *U,			//U的指针
    int ldu,					//U的主维度
    cuDoubleComplex *VH,		//VH的指针
    int ldvh,					//VH的主维度
    cuDoubleComplex *work,		//工作空间指针，用于函数计算的缓存空间，需要用户申请
    int lwork,					//工作空间的大小，由cusolverDnZgesvd_bufferSize()计算
    double *rwork,				//不知道做啥用的
    int *devInfo				//计算结果标志位，=0时正常
);

示例代码
cusolver_utils.h，来自Nvidia官方库。

/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <cmath>
#include <functional>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>

#include <cuComplex.h>
#include <cuda_runtime_api.h>
#include <cublas_api.h>
#include <cusolverDn.h>
#include <library_types.h>

// CUDA API error checking
#define CUDA_CHECK(err)                                                                            \
    do {                                                                                           \
        cudaError_t err_ = (err);                                                                  \
        if (err_ != cudaSuccess) {                                                                 \
            printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);                          \
            throw std::runtime_error("CUDA error");                                                \
        }                                                                                          \
    } while (0)

// cusolver API error checking
#define CUSOLVER_CHECK(err)                                                                        \
    do {                                                                                           \
        cusolverStatus_t err_ = (err);                                                             \
        if (err_ != CUSOLVER_STATUS_SUCCESS) {                                                     \
            printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
            throw std::runtime_error("cusolver error");                                            \
        }                                                                                          \
    } while (0)

// cublas API error checking
#define CUBLAS_CHECK(err)                                                                          \
    do {                                                                                           \
        cublasStatus_t err_ = (err);                                                               \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                                       \
            printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);                        \
            throw std::runtime_error("cublas error");                                              \
        }                                                                                          \
    } while (0)

// cublas API error checking
#define CUSPARSE_CHECK(err)                                                                        \
    do {                                                                                           \
        cusparseStatus_t err_ = (err);                                                             \
        if (err_ != CUSPARSE_STATUS_SUCCESS) {                                                     \
            printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
            throw std::runtime_error("cusparse error");                                            \
        }                                                                                          \
    } while (0)

// memory alignment
#define ALIGN_TO(A, B) (((A + B - 1) / B) * B)

// device memory pitch alignment
static const size_t device_alignment = 32;

// type traits
template <typename T> struct traits;

template <> struct traits<float> {
    // scalar type
    typedef float T;
    typedef T S;

    static constexpr T zero = 0.f;
    static constexpr cudaDataType cuda_data_type = CUDA_R_32F;
#if CUDART_VERSION >= 11000
    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F;
#endif

    inline static S abs(T val) { return fabs(val); }

    template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); }

    inline static T add(T a, T b) { return a + b; }

    inline static T mul(T v, double f) { return v * f; }
};

template <> struct traits<double> {
    // scalar type
    typedef double T;
    typedef T S;

    static constexpr T zero = 0.;
    static constexpr cudaDataType cuda_data_type = CUDA_R_64F;
#if CUDART_VERSION >= 11000
    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F;
#endif

    inline static S abs(T val) { return fabs(val); }

    template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); }

    inline static T add(T a, T b) { return a + b; }

    inline static T mul(T v, double f) { return v * f; }
};

template <> struct traits<cuFloatComplex> {
    // scalar type
    typedef float S;
    typedef cuFloatComplex T;

    static constexpr T zero = {0.f, 0.f};
    static constexpr cudaDataType cuda_data_type = CUDA_C_32F;
#if CUDART_VERSION >= 11000
    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F;
#endif

    inline static S abs(T val) { return cuCabsf(val); }

    template <typename RNG> inline static T rand(RNG &gen) {
        return make_cuFloatComplex((S)gen(), (S)gen());
    }

    inline static T add(T a, T b) { return cuCaddf(a, b); }
    inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); }

    inline static T mul(T v, double f) { return make_cuFloatComplex(v.x * f, v.y * f); }
};

template <> struct traits<cuDoubleComplex> {
    // scalar type
    typedef double S;
    typedef cuDoubleComplex T;

    static constexpr T zero = {0., 0.};
    static constexpr cudaDataType cuda_data_type = CUDA_C_64F;
#if CUDART_VERSION >= 11000
    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F;
#endif

    inline static S abs(T val) { return cuCabs(val); }

    template <typename RNG> inline static T rand(RNG &gen) {
        return make_cuDoubleComplex((S)gen(), (S)gen());
    }

    inline static T add(T a, T b) { return cuCadd(a, b); }
    inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); }

    inline static T mul(T v, double f) { return make_cuDoubleComplex(v.x * f, v.y * f); }
};

template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda);

template <> void print_matrix(const int &m, const int &n, const float *A, const int &lda) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::printf("%0.2f ", A[j * lda + i]);
        }
        std::printf("\n");
    }
}

template <> void print_matrix(const int &m, const int &n, const double *A, const int &lda) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::printf("%0.2f ", A[j * lda + i]);
        }
        std::printf("\n");
    }
}

template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
        }
        std::printf("\n");
    }
}

template <>
void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
        }
        std::printf("\n");
    }
}

template <typename T>
void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0);
    auto rand_gen = std::bind(dis, gen);

    *lda = n;

    size_t matrix_mem_size = static_cast<size_t>(*lda * m * sizeof(T));
    // suppress gcc 7 size warning
    if (matrix_mem_size <= PTRDIFF_MAX)
        *A = (T *)malloc(matrix_mem_size);
    else
        throw std::runtime_error("Memory allocation size is too large");

    if (*A == NULL)
        throw std::runtime_error("Unable to allocate host matrix");

    // random matrix and accumulate row sums
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            T *A_row = (*A) + *lda * i;
            A_row[j] = traits<T>::rand(rand_gen);
        }
    }
}

// Makes matrix A of size mxn and leading dimension lda diagonal dominant
template <typename T>
void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) {
    for (int i = 0; i < std::min(m, n); ++i) {
        T *A_row = A + lda * i;
        auto row_sum = traits<typename traits<T>::S>::zero;
        for (int j = 0; j < n; ++j) {
            row_sum += traits<T>::abs(A_row[j]);
        }
        A_row[i] = traits<T>::add(A_row[i], row_sum);
    }
}

// Returns cudaDataType value as defined in library_types.h for the string containing type name
cudaDataType get_cuda_library_type(std::string type_string) {
    if (type_string.compare("CUDA_R_16F") == 0)
        return CUDA_R_16F;
    else if (type_string.compare("CUDA_C_16F") == 0)
        return CUDA_C_16F;
    else if (type_string.compare("CUDA_R_32F") == 0)
        return CUDA_R_32F;
    else if (type_string.compare("CUDA_C_32F") == 0)
        return CUDA_C_32F;
    else if (type_string.compare("CUDA_R_64F") == 0)
        return CUDA_R_64F;
    else if (type_string.compare("CUDA_C_64F") == 0)
        return CUDA_C_64F;
    else if (type_string.compare("CUDA_R_8I") == 0)
        return CUDA_R_8I;
    else if (type_string.compare("CUDA_C_8I") == 0)
        return CUDA_C_8I;
    else if (type_string.compare("CUDA_R_8U") == 0)
        return CUDA_R_8U;
    else if (type_string.compare("CUDA_C_8U") == 0)
        return CUDA_C_8U;
    else if (type_string.compare("CUDA_R_32I") == 0)
        return CUDA_R_32I;
    else if (type_string.compare("CUDA_C_32I") == 0)
        return CUDA_C_32I;
    else if (type_string.compare("CUDA_R_32U") == 0)
        return CUDA_R_32U;
    else if (type_string.compare("CUDA_C_32U") == 0)
        return CUDA_C_32U;
    else
        throw std::runtime_error("Unknown CUDA datatype");
}

// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing
// solver name
cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) {
    if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0)
        return CUSOLVER_IRS_REFINE_NONE;
    else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0)
        return CUSOLVER_IRS_REFINE_CLASSICAL;
    else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0)
        return CUSOLVER_IRS_REFINE_GMRES;
    else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0)
        return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES;
    else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0)
        return CUSOLVER_IRS_REFINE_GMRES_GMRES;
    else
        printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str());

    return CUSOLVER_IRS_REFINE_NOT_SET;
}

cusolver_gesvd_example.cu，参考官方库编写。

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "cusolver_utils.h"

void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &ldx);

/* matlab矩阵A为：
    1.0000 + 0.0000i   2.0000 + 3.0000i
    2.0000 - 3.0000i   5.0000 + 0.0000i
    3.0000 - 5.0000i   6.0000 - 2.0000i

    gesvd中应该对应A的转置（非共轭转置），内存中存储顺序如下：
    1.0000 + 0.0000i   2.0000 - 3.0000i   3.0000 - 5.0000i    2.0000 + 3.0000i   5.0000 + 0.0000i   6.0000 - 2.0000i
*/
// 输入矩阵A应为列存储格式，即matlab中A.'
const int m = 3;   //矩阵行
const int n = 2;   //矩阵列
const int lda = m; /* lda >= m */
const std::vector<cuDoubleComplex> h_A = {        //列存储
    {1, 0}, {2, -3}, {3, -5},
    {2, 3}, {5, 0}, {6, -2},
};


int main(int argc, char *argv[]) {    
    // 步骤1：创建句柄
    cusolverDnHandle_t cusolverH = NULL;
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

    // 步骤2：申请空间
    cuDoubleComplex *A = nullptr;
    double *S = nullptr;
    cuDoubleComplex *U = nullptr;       // 左奇异矩阵
    cuDoubleComplex *VH = nullptr;      // 又奇异矩阵的复共轭转置
    const int ldu = m;                  // 根据公式，U为m行m列的方阵
    const int ldvh = n;                 // 根据公式，VH为n行n列的仿真
    cuDoubleComplex *W = nullptr;       // W = S*VH 没看懂啥意思
    int *devInfo = nullptr;             // 函数运行状态返回值
    int lwork = 0;                      // 工作空间大小
    cuDoubleComplex *Work = nullptr;    // 工作空间指针
    double *rwork = nullptr;
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&A), sizeof(cuDoubleComplex) * m * n));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&S), sizeof(double) * n));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&U), sizeof(cuDoubleComplex) * ldu * n));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&VH), sizeof(cuDoubleComplex) * ldvh * n));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&W), sizeof(cuDoubleComplex) * lda * n));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&devInfo), sizeof(int)));
    CUSOLVER_CHECK(cusolverDnZgesvd_bufferSize(cusolverH, m, n, &lwork));
    CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&Work), sizeof(cuDoubleComplex) * lwork));
    CUDA_CHECK(cudaMemcpy(A, h_A.data(), sizeof(cuDoubleComplex) * h_A.size(), cudaMemcpyHostToDevice));
    std::printf("A = \n");
    print_matrix(m, n, A, lda);
    std::printf("=====\n");

    // 步骤3：SVD计算
    signed char jobu = 'A';  // all m columns of U
    signed char jobvt = 'A'; // all n columns of VH
    CUSOLVER_CHECK(
        cusolverDnZgesvd(
            cusolverH, jobu, jobvt,
            m, n, A, lda,
            S, 
            U, ldu, // ldu
            VH, ldvh, // ldvt,
            Work, lwork, rwork,
            devInfo
        )
    );
    CUDA_CHECK(cudaDeviceSynchronize());

    std::printf("after gesvd: *devInfo = %d\n", *devInfo);
    if (0 == *devInfo) {
        std::printf("gesvd converges \n");
    } else if (0 > *devInfo) {
        std::printf("%d-th parameter is wrong \n", -*devInfo);
        exit(1);
    } else {
        std::printf("WARNING: info = %d : gesvd does not converge \n", *devInfo);
    }

    std::printf("S = 奇异向量值\n");
    print_matrix(n, 1, S, n);
    std::printf("=====\n");

    std::printf("U = 左奇异向量\n");
    print_matrix(m, m, U, ldu);
    std::printf("=====\n");

    std::printf("VH = 右奇异向量\n");
    print_matrix(n, n, VH, ldvh);
    std::printf("=====\n");

    // 步骤4：释放资源
    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(U));
    CUDA_CHECK(cudaFree(VH));
    CUDA_CHECK(cudaFree(S));
    CUDA_CHECK(cudaFree(W));
    CUDA_CHECK(cudaFree(devInfo));
    CUDA_CHECK(cudaFree(Work));
    CUDA_CHECK(cudaFree(rwork));

    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

// 打印矩阵（列存储格式的矩阵）
void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &ldx) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            if(A[j * ldx + i].y < 0){
                if(A[j * ldx + i].x < 0){
                    std::printf("\t-%0.4f - %0.4fj ", -A[j * ldx + i].x, -A[j * ldx + i].y);
                }else{
                    std::printf("\t %0.4f - %0.4fj ", A[j * ldx + i].x, -A[j * ldx + i].y);
                }
                
            }else{
                if(A[j * ldx + i].x < 0){
                    std::printf("\t-%0.4f + %0.4fj ", -A[j * ldx + i].x, A[j * ldx + i].y);
                }else{
                    std::printf("\t %0.4f + %0.4fj ", A[j * ldx + i].x, A[j * ldx + i].y);
                }
            }
        }
        std::printf("\n");
    }
}

程序运行结果：

A = 
         1.0000 + 0.0000j        2.0000 + 3.0000j 
         2.0000 - 3.0000j        5.0000 + 0.0000j 
         3.0000 - 5.0000j        6.0000 - 2.0000j 
=====
after gesvd: *devInfo = 0
gesvd converges 
S = 奇异向量值
11.09 
1.76 
=====
U = 左奇异向量
        -0.3085 - 0.0443j       -0.7870 - 0.2161j        0.4850 - 0.0398j 
        -0.3564 + 0.4239j       -0.3013 - 0.0884j       -0.6875 + 0.3494j 
        -0.3575 + 0.6844j        0.4126 - 0.2554j        0.3774 - 0.1612j 
=====
VH = 右奇异向量
        -0.6122 + -0.0000j      -0.5453 - 0.5726j 
         0.7907 + 0.0000j       -0.4222 - 0.4433j 
=====