-
函数说明
cusolverDn<t>gesvd()
函数用于矩阵的奇异值分解,和Matlab的svd
函数对应。 -
数学简介
-
奇异值分解的公式为:
这里用S代表西格玛,即
A = U*S*VH
。我们的最终目的就是计算出右边三个矩阵。 -
矩阵尺寸:
设:A为m行n列的矩阵,则U为m行m列,S为m行n列,VH为n行n列。注:后面三个矩阵的秩和A的秩相等。 -
因为矩阵A、U、S、VH都代表了一次线性变换,又因为公式右边等于左边。所以,我们可以说右面三个矩阵对应的线性变换等价于左边,即将A的线性变换分解为三步,U、S、VH分别代表了旋转、拉伸、旋转。
-
-
matlab计算
对矩阵A做svd分解,matlab代码如下:% 生成矩阵A A = [ 1 2 2 5 3 6 ] + [ 0i 3i -3i 0i -5i -2i ]; % svd分解 [U,S,V] = svd(A); % 打印结果 A %原矩阵 S %奇异值 U %左奇异向量 VH = V' %右奇异向量的共轭转置(这样打印为了和CUDA比较)
matlab执行结果:
A = 1.0000 + 0.0000i 2.0000 + 3.0000i 2.0000 - 3.0000i 5.0000 + 0.0000i 3.0000 - 5.0000i 6.0000 - 2.0000i S = 11.0864 0 0 1.7583 0 0 U = -0.3085 - 0.0443i -0.7870 - 0.2161i 0.4850 - 0.0398i -0.3564 + 0.4239i -0.3013 - 0.0884i -0.6875 + 0.3494i -0.3575 + 0.6844i 0.4126 - 0.2554i 0.3774 - 0.1612i VH = -0.6122 + 0.0000i -0.5453 - 0.5726i 0.7907 + 0.0000i -0.4222 - 0.4433i
-
CUDA函数原型(以cuDoubleComplex类型举例)
cusolverDnZgesvd (cusolverStatus_t //状态值 cusolverDnZgesvd ( //函数名,D代表稠密矩阵,Z代表双精度复数 cusolverDnHandle_t handle, //句柄,上下文相关,估计和FILE指针类似 signed char jobu, //U的保存方式。'A'-all,不化简,以m行m列保存,不论A是否满秩;'S'-simplify,化简; signed char jobvh, //同jobu int m, //A的行数 int n, //A的列数 cuDoubleComplex *A, //A的指针 int lda, //A的主维度 double *S, //S的指针,结果从大到小排序,即S[i]>S[i+1] cuDoubleComplex *U, //U的指针 int ldu, //U的主维度 cuDoubleComplex *VH, //VH的指针 int ldvh, //VH的主维度 cuDoubleComplex *work, //工作空间指针,用于函数计算的缓存空间,需要用户申请 int lwork, //工作空间的大小,由cusolverDnZgesvd_bufferSize()计算 double *rwork, //不知道做啥用的 int *devInfo //计算结果标志位,=0时正常 );
-
示例代码
cusolver_utils.h,来自Nvidia官方库。/* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include <cmath> #include <functional> #include <iostream> #include <random> #include <stdexcept> #include <string> #include <cuComplex.h> #include <cuda_runtime_api.h> #include <cublas_api.h> #include <cusolverDn.h> #include <library_types.h> // CUDA API error checking #define CUDA_CHECK(err) \ do { \ cudaError_t err_ = (err); \ if (err_ != cudaSuccess) { \ printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__); \ throw std::runtime_error("CUDA error"); \ } \ } while (0) // cusolver API error checking #define CUSOLVER_CHECK(err) \ do { \ cusolverStatus_t err_ = (err); \ if (err_ != CUSOLVER_STATUS_SUCCESS) { \ printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__); \ throw std::runtime_error("cusolver error"); \ } \ } while (0) // cublas API error checking #define CUBLAS_CHECK(err) \ do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ throw std::runtime_error("cublas error"); \ } \ } while (0) // cublas API error checking #define CUSPARSE_CHECK(err) \ do { \ cusparseStatus_t err_ = (err); \ if (err_ != CUSPARSE_STATUS_SUCCESS) { \ printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__); \ throw std::runtime_error("cusparse error"); \ } \ } while (0) // memory alignment #define ALIGN_TO(A, B) (((A + B - 1) / B) * B) // device memory pitch alignment static const size_t device_alignment = 32; // type traits template <typename T> struct traits; template <> struct traits<float> { // scalar type typedef float T; typedef T S; static constexpr T zero = 0.f; static constexpr cudaDataType cuda_data_type = CUDA_R_32F; #if CUDART_VERSION >= 11000 static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F; #endif inline static S abs(T val) { return fabs(val); } template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); } inline static T add(T a, T b) { return a + b; } inline static T mul(T v, double f) { return v * f; } }; template <> struct traits<double> { // scalar type typedef double T; typedef T S; static constexpr T zero = 0.; static constexpr cudaDataType cuda_data_type = CUDA_R_64F; #if CUDART_VERSION >= 11000 static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F; #endif inline static S abs(T val) { return fabs(val); } template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); } inline static T add(T a, T b) { return a + b; } inline static T mul(T v, double f) { return v * f; } }; template <> struct traits<cuFloatComplex> { // scalar type typedef float S; typedef cuFloatComplex T; static constexpr T zero = {0.f, 0.f}; static constexpr cudaDataType cuda_data_type = CUDA_C_32F; #if CUDART_VERSION >= 11000 static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F; #endif inline static S abs(T val) { return cuCabsf(val); } template <typename RNG> inline static T rand(RNG &gen) { return make_cuFloatComplex((S)gen(), (S)gen()); } inline static T add(T a, T b) { return cuCaddf(a, b); } inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); } inline static T mul(T v, double f) { return make_cuFloatComplex(v.x * f, v.y * f); } }; template <> struct traits<cuDoubleComplex> { // scalar type typedef double S; typedef cuDoubleComplex T; static constexpr T zero = {0., 0.}; static constexpr cudaDataType cuda_data_type = CUDA_C_64F; #if CUDART_VERSION >= 11000 static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F; #endif inline static S abs(T val) { return cuCabs(val); } template <typename RNG> inline static T rand(RNG &gen) { return make_cuDoubleComplex((S)gen(), (S)gen()); } inline static T add(T a, T b) { return cuCadd(a, b); } inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); } inline static T mul(T v, double f) { return make_cuDoubleComplex(v.x * f, v.y * f); } }; template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda); template <> void print_matrix(const int &m, const int &n, const float *A, const int &lda) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { std::printf("%0.2f ", A[j * lda + i]); } std::printf("\n"); } } template <> void print_matrix(const int &m, const int &n, const double *A, const int &lda) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { std::printf("%0.2f ", A[j * lda + i]); } std::printf("\n"); } } template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); } std::printf("\n"); } } template <> void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); } std::printf("\n"); } } template <typename T> void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0); auto rand_gen = std::bind(dis, gen); *lda = n; size_t matrix_mem_size = static_cast<size_t>(*lda * m * sizeof(T)); // suppress gcc 7 size warning if (matrix_mem_size <= PTRDIFF_MAX) *A = (T *)malloc(matrix_mem_size); else throw std::runtime_error("Memory allocation size is too large"); if (*A == NULL) throw std::runtime_error("Unable to allocate host matrix"); // random matrix and accumulate row sums for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { T *A_row = (*A) + *lda * i; A_row[j] = traits<T>::rand(rand_gen); } } } // Makes matrix A of size mxn and leading dimension lda diagonal dominant template <typename T> void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) { for (int i = 0; i < std::min(m, n); ++i) { T *A_row = A + lda * i; auto row_sum = traits<typename traits<T>::S>::zero; for (int j = 0; j < n; ++j) { row_sum += traits<T>::abs(A_row[j]); } A_row[i] = traits<T>::add(A_row[i], row_sum); } } // Returns cudaDataType value as defined in library_types.h for the string containing type name cudaDataType get_cuda_library_type(std::string type_string) { if (type_string.compare("CUDA_R_16F") == 0) return CUDA_R_16F; else if (type_string.compare("CUDA_C_16F") == 0) return CUDA_C_16F; else if (type_string.compare("CUDA_R_32F") == 0) return CUDA_R_32F; else if (type_string.compare("CUDA_C_32F") == 0) return CUDA_C_32F; else if (type_string.compare("CUDA_R_64F") == 0) return CUDA_R_64F; else if (type_string.compare("CUDA_C_64F") == 0) return CUDA_C_64F; else if (type_string.compare("CUDA_R_8I") == 0) return CUDA_R_8I; else if (type_string.compare("CUDA_C_8I") == 0) return CUDA_C_8I; else if (type_string.compare("CUDA_R_8U") == 0) return CUDA_R_8U; else if (type_string.compare("CUDA_C_8U") == 0) return CUDA_C_8U; else if (type_string.compare("CUDA_R_32I") == 0) return CUDA_R_32I; else if (type_string.compare("CUDA_C_32I") == 0) return CUDA_C_32I; else if (type_string.compare("CUDA_R_32U") == 0) return CUDA_R_32U; else if (type_string.compare("CUDA_C_32U") == 0) return CUDA_C_32U; else throw std::runtime_error("Unknown CUDA datatype"); } // Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing // solver name cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) { if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0) return CUSOLVER_IRS_REFINE_NONE; else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0) return CUSOLVER_IRS_REFINE_CLASSICAL; else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0) return CUSOLVER_IRS_REFINE_GMRES; else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0) return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES; else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0) return CUSOLVER_IRS_REFINE_GMRES_GMRES; else printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str()); return CUSOLVER_IRS_REFINE_NOT_SET; }
cusolver_gesvd_example.cu,参考官方库编写。
#include <cstdio> #include <cstdlib> #include <vector> #include <cuda_runtime.h> #include <cusolverDn.h> #include "cusolver_utils.h" void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &ldx); /* matlab矩阵A为: 1.0000 + 0.0000i 2.0000 + 3.0000i 2.0000 - 3.0000i 5.0000 + 0.0000i 3.0000 - 5.0000i 6.0000 - 2.0000i gesvd中应该对应A的转置(非共轭转置),内存中存储顺序如下: 1.0000 + 0.0000i 2.0000 - 3.0000i 3.0000 - 5.0000i 2.0000 + 3.0000i 5.0000 + 0.0000i 6.0000 - 2.0000i */ // 输入矩阵A应为列存储格式,即matlab中A.' const int m = 3; //矩阵行 const int n = 2; //矩阵列 const int lda = m; /* lda >= m */ const std::vector<cuDoubleComplex> h_A = { //列存储 {1, 0}, {2, -3}, {3, -5}, {2, 3}, {5, 0}, {6, -2}, }; int main(int argc, char *argv[]) { // 步骤1:创建句柄 cusolverDnHandle_t cusolverH = NULL; CUSOLVER_CHECK(cusolverDnCreate(&cusolverH)); // 步骤2:申请空间 cuDoubleComplex *A = nullptr; double *S = nullptr; cuDoubleComplex *U = nullptr; // 左奇异矩阵 cuDoubleComplex *VH = nullptr; // 又奇异矩阵的复共轭转置 const int ldu = m; // 根据公式,U为m行m列的方阵 const int ldvh = n; // 根据公式,VH为n行n列的仿真 cuDoubleComplex *W = nullptr; // W = S*VH 没看懂啥意思 int *devInfo = nullptr; // 函数运行状态返回值 int lwork = 0; // 工作空间大小 cuDoubleComplex *Work = nullptr; // 工作空间指针 double *rwork = nullptr; CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&A), sizeof(cuDoubleComplex) * m * n)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&S), sizeof(double) * n)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&U), sizeof(cuDoubleComplex) * ldu * n)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&VH), sizeof(cuDoubleComplex) * ldvh * n)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&W), sizeof(cuDoubleComplex) * lda * n)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&devInfo), sizeof(int))); CUSOLVER_CHECK(cusolverDnZgesvd_bufferSize(cusolverH, m, n, &lwork)); CUDA_CHECK(cudaMallocManaged(reinterpret_cast<void **>(&Work), sizeof(cuDoubleComplex) * lwork)); CUDA_CHECK(cudaMemcpy(A, h_A.data(), sizeof(cuDoubleComplex) * h_A.size(), cudaMemcpyHostToDevice)); std::printf("A = \n"); print_matrix(m, n, A, lda); std::printf("=====\n"); // 步骤3:SVD计算 signed char jobu = 'A'; // all m columns of U signed char jobvt = 'A'; // all n columns of VH CUSOLVER_CHECK( cusolverDnZgesvd( cusolverH, jobu, jobvt, m, n, A, lda, S, U, ldu, // ldu VH, ldvh, // ldvt, Work, lwork, rwork, devInfo ) ); CUDA_CHECK(cudaDeviceSynchronize()); std::printf("after gesvd: *devInfo = %d\n", *devInfo); if (0 == *devInfo) { std::printf("gesvd converges \n"); } else if (0 > *devInfo) { std::printf("%d-th parameter is wrong \n", -*devInfo); exit(1); } else { std::printf("WARNING: info = %d : gesvd does not converge \n", *devInfo); } std::printf("S = 奇异向量值\n"); print_matrix(n, 1, S, n); std::printf("=====\n"); std::printf("U = 左奇异向量\n"); print_matrix(m, m, U, ldu); std::printf("=====\n"); std::printf("VH = 右奇异向量\n"); print_matrix(n, n, VH, ldvh); std::printf("=====\n"); // 步骤4:释放资源 CUDA_CHECK(cudaFree(A)); CUDA_CHECK(cudaFree(U)); CUDA_CHECK(cudaFree(VH)); CUDA_CHECK(cudaFree(S)); CUDA_CHECK(cudaFree(W)); CUDA_CHECK(cudaFree(devInfo)); CUDA_CHECK(cudaFree(Work)); CUDA_CHECK(cudaFree(rwork)); CUSOLVER_CHECK(cusolverDnDestroy(cusolverH)); CUDA_CHECK(cudaDeviceReset()); return EXIT_SUCCESS; } // 打印矩阵(列存储格式的矩阵) void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &ldx) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { if(A[j * ldx + i].y < 0){ if(A[j * ldx + i].x < 0){ std::printf("\t-%0.4f - %0.4fj ", -A[j * ldx + i].x, -A[j * ldx + i].y); }else{ std::printf("\t %0.4f - %0.4fj ", A[j * ldx + i].x, -A[j * ldx + i].y); } }else{ if(A[j * ldx + i].x < 0){ std::printf("\t-%0.4f + %0.4fj ", -A[j * ldx + i].x, A[j * ldx + i].y); }else{ std::printf("\t %0.4f + %0.4fj ", A[j * ldx + i].x, A[j * ldx + i].y); } } } std::printf("\n"); } }
程序运行结果:
A = 1.0000 + 0.0000j 2.0000 + 3.0000j 2.0000 - 3.0000j 5.0000 + 0.0000j 3.0000 - 5.0000j 6.0000 - 2.0000j ===== after gesvd: *devInfo = 0 gesvd converges S = 奇异向量值 11.09 1.76 ===== U = 左奇异向量 -0.3085 - 0.0443j -0.7870 - 0.2161j 0.4850 - 0.0398j -0.3564 + 0.4239j -0.3013 - 0.0884j -0.6875 + 0.3494j -0.3575 + 0.6844j 0.4126 - 0.2554j 0.3774 - 0.1612j ===== VH = 右奇异向量 -0.6122 + -0.0000j -0.5453 - 0.5726j 0.7907 + 0.0000j -0.4222 - 0.4433j =====
-
注意
- gesvd只能计算m>=n的矩阵。
- gesvd的输出右奇异向量VH而不是V。
- 工作空间Work,需要用户申请设备内存供函数使用。
- 奇异值S是按从大到小排序的,即S[i] > S[i+1]。
- 行存储和列存储为转置关系,matlab表达为
A = A.'
。
CUDA cusolverDn<t>gesvd()函数认知
于 2022-10-28 11:32:08 首次发布