改写 cublas LU分解为 cublas matinv的示例

最新推荐文章于 2022-11-29 20:13:12 发布
Eloudy
最新推荐文章于 2022-11-29 20:13:12 发布
阅读量372
点赞数
分类专栏： blas 文章标签： cuda
本文链接：https://blog.csdn.net/eloudy/article/details/125202352
版权
blas 专栏收录该内容
65 篇文章 2 订阅
订阅专栏
将nv官方的LU分解sample 改写成 matinv的sample


#include <stdio.h>
#include <stdlib.h>

// cuda libraries and helpers 
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define checkCudaErrors(val) (val);

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

static const char *_cudaGetErrorEnum(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";

    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";

    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";

    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";

    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";

    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";

    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";

    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";

    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";

    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
  }

  return "<unknown>";
}

// configurable parameters
// dimension of matrix
#define N           14
#define BATCH_SIZE  3

// use double precision data type
//#define DOUBLE_PRECISION /* comment this to use single precision */
#ifdef DOUBLE_PRECISION
#define DATA_TYPE double
#define MAX_ERROR 1e-15
#else
#define DATA_TYPE float
#define MAX_ERROR 1e-6
#endif /* DOUBLE_PRCISION */

// use pivot vector while decomposing
#define PIVOT /* comment this to disable pivot use */


// helper functions

// wrapper around cublas<t>getrfBatched()
cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n, DATA_TYPE* const A[], int lda, int* P, int* info, int batchSize)
{
#ifdef DOUBLE_PRECISION
    return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
#else
    return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
#endif
}

//cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float *A[], int lda, float *Ainv[], int lda_inv, int *info, int batchSize);
// wrapper around cublas<t>matinvBatched()
cublasStatus_t cublasXmatinvBatched(cublasHandle_t handle, int n, DATA_TYPE* const A[], int lda, DATA_TYPE* const Ainv[], int lda_inv, int* info, int batchSize)
{
#ifdef DOUBLE_PRECISION
    return cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
#else
    return cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
#endif
}


// wrapper around malloc
// clears the allocated memory to 0
// terminates the program if malloc fails
void* xmalloc(size_t size)
{
    void* ptr = malloc(size);
    if (ptr == NULL)
    {
        printf("> ERROR: malloc for size %zu failed..\n", size);
        exit(EXIT_FAILURE);
    }
    memset(ptr, 0, size);
    return ptr;
}

// initalize identity matrix
void initIdentityMatrix(DATA_TYPE* mat)
{
    // clear the matrix
    memset(mat, 0, N * N * sizeof(DATA_TYPE));

    // set all diagonals to 1
    for (int i = 0; i < N; i++)
    {
        mat[(i * N) + i] = 1.0;
    }
}

// initialize matrix with all elements as 0
void initZeroMatrix(DATA_TYPE* mat)
{
    memset(mat, 0, N * N * sizeof(DATA_TYPE));
}

// fill random value in column-major matrix
void initRandomMatrix(DATA_TYPE* mat)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            mat[(j * N) + i] = (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX);
        }
    }

    // diagonal dominant matrix to insure it is invertible matrix
    for (int i = 0; i < N; i++)
    {
        mat[(i * N) + i] += (DATA_TYPE)N;
    }
}

// print column-major matrix
void printMatrix(DATA_TYPE* mat)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%20.16f ", mat[(j * N) + i]);
        }
        printf("\n");
    }
    printf("\n");
}

// matrix mulitplication
void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2)
{
    initZeroMatrix(res);

    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            for (int k = 0; k < N; k++)
            {
                res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k];
            }
        }
    }
}

// check matrix equality
bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError)
{
    DATA_TYPE err         = (DATA_TYPE) 0.0;
    DATA_TYPE refNorm     = (DATA_TYPE) 0.0;
    DATA_TYPE relError    = (DATA_TYPE) 0.0;
    DATA_TYPE relMaxError = (DATA_TYPE) 0.0;

    for (int i = 0; i < N * N; i++)
    {
        refNorm = abs(mat1[i]);
        err = abs(mat1[i] - mat2[i]);

        if (refNorm != 0.0 && err > 0.0)
        {
            relError = err / refNorm;
            relMaxError = MAX(relMaxError, relError);
        }

        if (relMaxError > maxError)
            return false;

    }
    return true;
}

// decode lower and upper matrix from single matrix 
// returned by getrfBatched()
void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U)
{
    // init L as identity matrix
    initIdentityMatrix(L);

    // copy lower triangular values from mat to L (skip diagonal)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < i; j++)
        {
            L[(j * N) + i] = mat[(j * N) + i];
        }
    }

    // init U as all zero
    initZeroMatrix(U);

    // copy upper triangular values from mat to U
    for (int i = 0; i < N; i++)
    {
        for (int j = i; j < N; j++)
        {
            U[(j * N) + i] = mat[(j * N) + i];
        }
    }
}

// generate permutation matrix from pivot vector
void getPmatFromPivot(DATA_TYPE* Pmat, int* P)
{
    int pivot[N];

    // pivot vector in base-1
    // convert it to base-0
    for (int i = 0; i < N; i++)
    {
        P[i]--;
    }

    // generate permutation vector from pivot
    // initialize pivot with identity sequence
    for (int k = 0; k < N; k++)
    {
        pivot[k] = k;
    }

    // swap the indices according to pivot vector
    for (int k = 0; k < N; k++)
    {
        int q = P[k];

        // swap pivot(k) and pivot(q)
        int s = pivot[k];
        int t = pivot[q];
        pivot[k] = t;
        pivot[q] = s;
    }

    // generate permutation matrix from pivot vector
    initZeroMatrix(Pmat);
    for (int i = 0; i < N; i++)
    {
        int j = pivot[i];
        Pmat[(j * N) + i] = (DATA_TYPE)1.0;
    }
}


int main(int argc, char **argv) {
    printf("N = %d , BATCH_SIZE = %d\n", N, BATCH_SIZE);
    // cuBLAS variables
    cublasStatus_t status;
    cublasHandle_t handle;

    // host variables 
    size_t matSize = N * N * sizeof(DATA_TYPE);

    DATA_TYPE* h_AarrayInput;
    DATA_TYPE* h_AarrayInvOutput;//LL:: 
    DATA_TYPE* h_ptr_array[BATCH_SIZE];
    DATA_TYPE* h_ptr_array_inv[BATCH_SIZE];//LL::

    int* h_infoArray;

    // device variables
    DATA_TYPE*  d_Aarray;
    DATA_TYPE*  d_Aarray_inv;//LL::
    DATA_TYPE** d_ptr_array;
    DATA_TYPE** d_ptr_array_inv;//LL::

    int* d_infoArray;
    int err_count = 0;

    // seed the rand() function with time
    srand(12345);

    // find cuda device
    printf("> initializing..\n");

    // initialize cuBLAS
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cuBLAS initialization failed..\n");
        return(EXIT_FAILURE);
    }

#ifdef DOUBLE_PRECISION
    printf("> using DOUBLE precision..\n");
#else
    printf("> using SINGLE precision..\n");
#endif


    // allocate memory for host variables
    h_AarrayInput  = (DATA_TYPE*) xmalloc(BATCH_SIZE * matSize);
    h_AarrayInvOutput = (DATA_TYPE*) xmalloc(BATCH_SIZE * matSize);
    h_infoArray    = (int*) xmalloc(BATCH_SIZE * sizeof(int));

    // allocate memory for device variables
    checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize));
    checkCudaErrors(cudaMalloc((void**)&d_Aarray_inv, BATCH_SIZE * matSize));//LL:: 
    checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int)));
    checkCudaErrors(cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*)));
    checkCudaErrors(cudaMalloc((void**)&d_ptr_array_inv, BATCH_SIZE * sizeof(DATA_TYPE*)));//LL::

    // fill matrix with random data
    printf("> generating random matrices..\n");
    for (int i = 0; i < BATCH_SIZE; i++)
    {
        initRandomMatrix(h_AarrayInput + (i * N * N));
    }

    // copy data to device from host
    printf("> copying data from host memory to GPU memory..\n");
    checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, cudaMemcpyHostToDevice));

    // create pointer array for matrices
    for (int i = 0; i < BATCH_SIZE; i++){
        // d_Aarray_inv  ->  h_ptr_array_inv  =>  d_ptr_array_inv  
        h_ptr_array[i]     = d_Aarray + (i * N * N);
        h_ptr_array_inv[i] = d_Aarray_inv + (i*N*N);//LL::
    }

    // copy pointer array to device memory
    checkCudaErrors(cudaMemcpy(d_ptr_array,     h_ptr_array,     BATCH_SIZE * sizeof(DATA_TYPE*), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_ptr_array_inv, h_ptr_array_inv, BATCH_SIZE * sizeof(DATA_TYPE*), cudaMemcpyHostToDevice));//LL::

    // perform matrix inversing
    printf("> performing matrix inversing..\n");

    status = cublasXmatinvBatched(handle, N, d_ptr_array, N, d_ptr_array_inv, N, d_infoArray, BATCH_SIZE);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cublasXmatinvBatched() failed with error %s..\n", _cudaGetErrorEnum(status));
        return(EXIT_FAILURE);
    }

    printf("> performing matrix inversing finished.\n");

    // copy data to host from device
    printf("> copying data from GPU memory to host memory..\n");
    checkCudaErrors(cudaMemcpy(h_AarrayInvOutput, d_Aarray_inv, BATCH_SIZE * matSize,     cudaMemcpyDeviceToHost));  //LL::  
    //h_infoArray <- d_infoArray
    checkCudaErrors(cudaMemcpy(h_infoArray,       d_infoArray,  BATCH_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

//  d_Aarray_inv  ->  h_ptr_array_inv  =>  d_ptr_array_inv  
    printf("> verifying the inversion result..\n");
    for(int i=0; i< BATCH_SIZE; i++){
        if(h_infoArray[i] == 0){
            DATA_TYPE* A  = h_AarrayInput  + (i * N * N);
            DATA_TYPE* A_inv = h_AarrayInvOutput + (i * N * N);
            DATA_TYPE  A_x_Ainv[N*N];

            matrixMultiply(A_x_Ainv, A, A_inv);
            printMatrix(A_x_Ainv);
        }
    }

    // free device variables
    checkCudaErrors(cudaFree(d_ptr_array));
    checkCudaErrors(cudaFree(d_ptr_array_inv));//LL::
    checkCudaErrors(cudaFree(d_infoArray));
    checkCudaErrors(cudaFree(d_Aarray));
    checkCudaErrors(cudaFree(d_Aarray_inv));//LL::

    // free host variables
    if (h_infoArray)       free(h_infoArray);
    if (h_AarrayInput)     free(h_AarrayInput);
    if (h_AarrayInvOutput) free(h_AarrayInvOutput);//LL::

    // destroy cuBLAS handle
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cuBLAS uninitialization failed..\n");
        return(EXIT_FAILURE);
    }

    if (err_count > 0)
    {
        printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count, MAX_ERROR);
        return(EXIT_FAILURE);
    }

    printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR);
    return(EXIT_SUCCESS);
}
复用的懒惰的方式：放在cuda sample对应文件夹中编译运行，或自己写makefile 也行