cublasSgetriBatched的input matrix A 的值，在計算之後是否被改變或叫做污染，答案是No-CSDN博客

本文链接：https://blog.csdn.net/eloudy/article/details/128770954
基於Nvidia的sample源文件改寫；可以發現，Sgetri的輸入矩陣A的元素值，并沒有改變；
編譯的話，在cudaSample對應的blas文件夾中置入如下cu文件，并且修改對應的makefile裏的變量名字來編譯運行：
/*
 * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * This example demonstrates how to use the cuBLAS library API
 * for lower-upper (LU) decomposition of a matrix. LU decomposition
 * factors a matrix as the product of upper triangular matrix and
 * lower trianglular matrix.
 * 
 * https://en.wikipedia.org/wiki/LU_decomposition
 *
 * This sample uses 10000 matrices of size 4x4 and performs 
 * LU decomposition of them using batched decomposition API
 * of cuBLAS library. To test the correctness of upper and lower
 * matrices generated, they are multiplied and compared with the
 * original input matrix.
 * 
 */


#include <stdio.h>
#include <stdlib.h>

// cuda libraries and helpers 
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

// configurable parameters
// dimension of matrix
#define N 170
#define BATCH_SIZE  1

// use double precision data type
//LL: #define DOUBLE_PRECISION /* comment this to use single precision */
#ifdef DOUBLE_PRECISION
#define DATA_TYPE double
#define MAX_ERROR 1e-15
#else
#define DATA_TYPE float
#define MAX_ERROR 1e-6
#endif /* DOUBLE_PRCISION */

// use pivot vector while decomposing
#define PIVOT /* comment this to disable pivot use */


// helper functions

// wrapper around cublas<t>getrfBatched()
cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n, DATA_TYPE* const A[], int lda, int* P, int* info, int batchSize)
{
#ifdef DOUBLE_PRECISION
    return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
#else
    return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
#endif
}

// wrapper around malloc
// clears the allocated memory to 0
// terminates the program if malloc fails
void* xmalloc(size_t size)
{
    void* ptr = malloc(size);
    if (ptr == NULL)
    {
        printf("> ERROR: malloc for size %zu failed..\n", size);
        exit(EXIT_FAILURE);
    }
    memset(ptr, 0, size);
    return ptr;
}

// initalize identity matrix
void initIdentityMatrix(DATA_TYPE* mat)
{
    // clear the matrix
    memset(mat, 0, N * N * sizeof(DATA_TYPE));

    // set all diagonals to 1
    for (int i = 0; i < N; i++)
    {
        mat[(i * N) + i] = 1.0;
    }
}

// initialize matrix with all elements as 0
void initZeroMatrix(DATA_TYPE* mat)
{
    memset(mat, 0, N * N * sizeof(DATA_TYPE));
}

// fill random value in column-major matrix
void initRandomMatrix(DATA_TYPE* mat)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            mat[(j * N) + i] = (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX);
        }
    }

    // diagonal dominant matrix to insure it is invertible matrix
    for (int i = 0; i < N; i++)
    {
        mat[(i * N) + i] += (DATA_TYPE)N;
    }
}

// print column-major matrix
void printMatrix(DATA_TYPE* mat)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%20.16f ", mat[(j * N) + i]);
        }
        printf("\n");
    }
    printf("\n");
}

// matrix mulitplication
void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2)
{
    initZeroMatrix(res);

    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            for (int k = 0; k < N; k++)
            {
                res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k];
            }
        }
    }
}

// check matrix equality
bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError)
{
    DATA_TYPE err         = (DATA_TYPE) 0.0;
    DATA_TYPE refNorm     = (DATA_TYPE) 0.0;
    DATA_TYPE relError    = (DATA_TYPE) 0.0;
    DATA_TYPE relMaxError = (DATA_TYPE) 0.0;

    for (int i = 0; i < N * N; i++)
    {
        refNorm = abs(mat1[i]);
        err = abs(mat1[i] - mat2[i]);

        if (refNorm != 0.0 && err > 0.0)
        {
            relError = err / refNorm;
            relMaxError = MAX(relMaxError, relError);
        }

        if (relMaxError > maxError)
            return false;

    }
    return true;
}

// decode lower and upper matrix from single matrix 
// returned by getrfBatched()
void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U)
{
    // init L as identity matrix
    initIdentityMatrix(L);

    // copy lower triangular values from mat to L (skip diagonal)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < i; j++)
        {
            L[(j * N) + i] = mat[(j * N) + i];
        }
    }

    // init U as all zero
    initZeroMatrix(U);

    // copy upper triangular values from mat to U
    for (int i = 0; i < N; i++)
    {
        for (int j = i; j < N; j++)
        {
            U[(j * N) + i] = mat[(j * N) + i];
        }
    }
}

// generate permutation matrix from pivot vector
void getPmatFromPivot(DATA_TYPE* Pmat, int* P)
{
    int pivot[N];

    // pivot vector in base-1
    // convert it to base-0
    for (int i = 0; i < N; i++)
    {
        P[i]--;
    }

    // generate permutation vector from pivot
    // initialize pivot with identity sequence
    for (int k = 0; k < N; k++)
    {
        pivot[k] = k;
    }

    // swap the indices according to pivot vector
    for (int k = 0; k < N; k++)
    {
        int q = P[k];

        // swap pivot(k) and pivot(q)
        int s = pivot[k];
        int t = pivot[q];
        pivot[k] = t;
        pivot[q] = s;
    }

    // generate permutation matrix from pivot vector
    initZeroMatrix(Pmat);
    for (int i = 0; i < N; i++)
    {
        int j = pivot[i];
        Pmat[(j * N) + i] = (DATA_TYPE)1.0;
    }
}

void printFloatMatrix(float* A, int n, int lda){
	for(int i=0; i<n; i++){
		for(int j=0; j<n; j++){
			printf(" %7.4f", A[i + j*lda]);
		}
		printf("\n");
	}
	printf("\n\n");
}

int main(int argc, char **argv) {
    // cuBLAS variables
    cublasStatus_t status;
    cublasHandle_t handle;

    // host variables 
    size_t matSize = N * N * sizeof(DATA_TYPE);

    DATA_TYPE* h_AarrayInput;
    DATA_TYPE* h_AarrayOutput;
    DATA_TYPE* h_ptr_array[BATCH_SIZE];

    int* h_pivotArray;
    int* h_infoArray;

    // device variables
    DATA_TYPE*  d_Aarray;
    DATA_TYPE** d_ptr_array;

    int* d_pivotArray;
    int* d_infoArray;

    int err_count = 0;

    // seed the rand() function with time
    srand(12345);

    // find cuda device
    printf("> initializing..\n");
    int dev = findCudaDevice(argc, (const char **)argv);
    if (dev == -1)
    {
        return(EXIT_FAILURE);
    }

    // initialize cuBLAS
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cuBLAS initialization failed..\n");
        return(EXIT_FAILURE);
    }

#ifdef DOUBLE_PRECISION
    printf("> using DOUBLE precision..\n");
#else
    printf("> using SINGLE precision..\n");
#endif

#ifdef PIVOT
    printf("> pivot ENABLED..\n");
#else
    printf("> pivot DISABLED..\n");
#endif

    // allocate memory for host variables
    h_AarrayInput  = (DATA_TYPE*) xmalloc(BATCH_SIZE * matSize);
    h_AarrayOutput = (DATA_TYPE*) xmalloc(BATCH_SIZE * matSize);

    h_pivotArray   = (int*) xmalloc(N * BATCH_SIZE * sizeof(int));
    h_infoArray    = (int*) xmalloc(BATCH_SIZE * sizeof(int));

    // allocate memory for device variables
    checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize));
    checkCudaErrors(cudaMalloc((void**)&d_pivotArray, N * BATCH_SIZE * sizeof(int)));
    checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int)));
    checkCudaErrors(cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*)));

    // fill matrix with random data
    printf("> generating random matrices..\n");
    for (int i = 0; i < BATCH_SIZE; i++)
    {
        initRandomMatrix(h_AarrayInput + (i * N * N));
    }

    // copy data to device from host
    printf("> copying data from host memory to GPU memory..\n");
    checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, cudaMemcpyHostToDevice));

    // create pointer array for matrices
    for (int i = 0; i < BATCH_SIZE; i++)
        h_ptr_array[i] = d_Aarray + (i * N * N);

    // copy pointer array to device memory
    checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*), cudaMemcpyHostToDevice));

    // perform LU decomposition
    printf("> performing LU decomposition..\n");
#ifdef PIVOT
    status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, d_infoArray, BATCH_SIZE);
#else
    status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL, d_infoArray, BATCH_SIZE);
#endif /* PIVOT */
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cublasDgetrfBatched() failed with error %s..\n", _cudaGetErrorEnum(status));
        return(EXIT_FAILURE);
    }

    // copy data to host from device
    printf("> copying data from GPU memory to host memory..\n");
    checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray,    BATCH_SIZE * matSize,     cudaMemcpyDeviceToHost));
    //
    printf("A_LU after getrf =\n");
    printFloatMatrix(h_AarrayOutput, N, N);
    //
    checkCudaErrors(cudaMemcpy(h_infoArray,    d_infoArray, BATCH_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
#ifdef PIVOT
    checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray, N * BATCH_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
#endif /* PIVOT */

    // verify the result
    //LL:: P*A = L*U
    printf("> verifying the result..\n");
    for (int i = 0; i < BATCH_SIZE; i++)
    {
        if (h_infoArray[i] == 0)
        {
            DATA_TYPE* A  = h_AarrayInput  + (i * N * N);
            DATA_TYPE* LU = h_AarrayOutput + (i * N * N);
            DATA_TYPE L[N * N];
            DATA_TYPE U[N * N];
            getLUdecoded(LU, L, U);

            // test P * A = L * U
            int* P = h_pivotArray + (i * N);
            DATA_TYPE Pmat[N * N];
#ifdef PIVOT
            getPmatFromPivot(Pmat, P);
#else
            initIdentityMatrix(Pmat);
#endif /* PIVOT */

            // perform matrix multiplication
            DATA_TYPE PxA[N * N];
            DATA_TYPE LxU[N * N];
            matrixMultiply(PxA, Pmat, A);
            matrixMultiply(LxU, L, U);

            // check for equality of matrices
            if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR))
            {
                printf("> ERROR: accuracy check failed for matrix number %05d..\n", i+1);
                err_count++;
            }

        }
        else if (h_infoArray[i] > 0)
        {
            printf("> execution for matrix %05d is successful, but U is singular and U(%d,%d) = 0..\n", i + 1,
                h_infoArray[i] - 1, h_infoArray[i] - 1);
        }
        else // (h_infoArray[i] < 0)
        {
            printf("> ERROR: matrix %05d have an illegal value at index %d = %lf..\n", i + 1,
                -h_infoArray[i], *(h_AarrayInput + (i * N * N) + (-h_infoArray[i])));
        }
    }

/
//status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, d_infoArray, BATCH_SIZE);
    //cublasDgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
    DATA_TYPE*  d_Carray;
    DATA_TYPE** d_ptr_C_array;
    checkCudaErrors(cudaMalloc((void**)&d_Carray, BATCH_SIZE * matSize));
    checkCudaErrors(cudaMalloc((void**)&d_ptr_C_array, BATCH_SIZE * sizeof(DATA_TYPE*)));
    for (int i = 0; i < BATCH_SIZE; i++)
        h_ptr_array[i] = d_Carray + (i * N * N);

    // copy pointer array to device memory
    checkCudaErrors(cudaMemcpy(d_ptr_C_array, h_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*), cudaMemcpyHostToDevice));
    cublasSgetriBatched(handle, N, d_ptr_array, N, d_pivotArray, d_ptr_C_array, N, d_infoArray, BATCH_SIZE);


    
    DATA_TYPE*  h_getri_Aarray;

    h_getri_Aarray=(DATA_TYPE*)malloc(BATCH_SIZE * matSize);
    printf("LL: h_getri_Aarray= %llu\n", h_getri_Aarray);
    checkCudaErrors(cudaMemcpy(h_getri_Aarray, d_Aarray, matSize,     cudaMemcpyDeviceToHost));
    
//void printFloatMatrix(float* A, int N, int lda);
    printf("A_LU after getri =\n");
    printFloatMatrix(h_getri_Aarray, N, N);







/
    // free device variables
    checkCudaErrors(cudaFree(d_ptr_array));
    checkCudaErrors(cudaFree(d_infoArray));
    checkCudaErrors(cudaFree(d_pivotArray));
    checkCudaErrors(cudaFree(d_Aarray));

    // free host variables
    if (h_infoArray)    free(h_infoArray);
    if (h_pivotArray)   free(h_pivotArray);
    if (h_AarrayOutput) free(h_AarrayOutput);
    if (h_AarrayInput)  free(h_AarrayInput);

    // destroy cuBLAS handle
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("> ERROR: cuBLAS uninitialization failed..\n");
        return(EXIT_FAILURE);
    }

    if (err_count > 0)
    {
        printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count, MAX_ERROR);
        return(EXIT_FAILURE);
    }

    printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR);
    return(EXIT_SUCCESS);
}