一个使用cublasSgetrfBatched计算逆矩阵的示例

最新推荐文章于 2023-01-27 15:32:45 发布

Eloudy

最新推荐文章于 2023-01-27 15:32:45 发布

阅读量565

点赞数

分类专栏： blas cuda 文章标签： cublas

本文链接：https://blog.csdn.net/eloudy/article/details/122032982

版权

blas 同时被 2 个专栏收录

65 篇文章 2 订阅

订阅专栏

cuda

34 篇文章 2 订阅

订阅专栏

//mat_inv.cu
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include<math.h>


#define cudacall(call)                                                                                                          \
    do                                                                                                                          \
    {                                                                                                                           \
        cudaError_t err = (call);                                                                                               \
        if(cudaSuccess != err)                                                                                                  \
        {                                                                                                                       \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
            cudaDeviceReset();                                                                                                  \
            exit(EXIT_FAILURE);                                                                                                 \
        }                                                                                                                       \
    }                                                                                                                           \
    while (0)

#define cublascall(call)                                                                                        \
    do                                                                                                          \
    {                                                                                                           \
        cublasStatus_t status = (call);                                                                         \
        if(CUBLAS_STATUS_SUCCESS != status)                                                                     \
        {                                                                                                       \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset();                                                                                  \
            exit(EXIT_FAILURE);                                                                                 \
        }                                                                                                       \
                                                                                                                \
    }                                                                                                           \
    while(0)


//
//

void invert(float** src, float** dst, int n, int batchSize)
{
    	cublasHandle_t handle;
    	cublascall(cublasCreate_v2(&handle));

    	int *P, *INFO;

    	cudacall(cudaMalloc(&P, n * batchSize * sizeof(int)));
    	cudacall(cudaMalloc(&INFO,  batchSize * sizeof(int)));

    	int lda = n;

    	float **A = (float **)malloc(batchSize*sizeof(float *));
    	float **A_d, *A_dflat;

    	cudacall(cudaMalloc(&A_d,batchSize*sizeof(float *)));
    	cudacall(cudaMalloc(&A_dflat, n*n*batchSize*sizeof(float)));

	A[0] = A_dflat;
    	for (int i = 1; i < batchSize; i++)
      		A[i] = A[i-1]+(n*n);

    	cudacall(cudaMemcpy(A_d,A,batchSize*sizeof(float *),cudaMemcpyHostToDevice));
   
 	for (int i = 0; i < batchSize; i++)
      		cudacall(cudaMemcpy(A_dflat+(i*n*n), src[i], n*n*sizeof(float), cudaMemcpyHostToDevice));


    	cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));


    	int INFOh[batchSize];
    	cudacall(cudaMemcpy(INFOh,INFO,batchSize*sizeof(int),cudaMemcpyDeviceToHost));

    	for (int i = 0; i < batchSize; i++)
      		if(INFOh[i]  != 0)
      		{
        		fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
        		cudaDeviceReset();
        		exit(EXIT_FAILURE);
      		}

    	float **C = (float **)malloc(batchSize*sizeof(float *));
    	float **C_d, *C_dflat;

    	cudacall(cudaMalloc(&C_d,batchSize*sizeof(float *)));
    	cudacall(cudaMalloc(&C_dflat, n*n*batchSize*sizeof(float)));
    	C[0] = C_dflat;
    	for (int i = 1; i < batchSize; i++)
      		C[i] = C[i-1] + (n*n);
    	cudacall(cudaMemcpy(C_d,C,batchSize*sizeof(float *),cudaMemcpyHostToDevice));
    	cublascall(cublasSgetriBatched(handle,n,(const float **)A_d,lda,P,C_d,lda,INFO,batchSize));

    	cudacall(cudaMemcpy(INFOh,INFO,batchSize*sizeof(int),cudaMemcpyDeviceToHost));

    	for (int i = 0; i < batchSize; i++)
	      	if(INFOh[i] != 0)
      		{
        		fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
        		cudaDeviceReset();
      			exit(EXIT_FAILURE);
      		}
    	for (int i = 0; i < batchSize; i++)
      	cudacall(cudaMemcpy(dst[i], C_dflat + (i*n*n), n*n*sizeof(float), cudaMemcpyDeviceToHost));
    	
	cudaFree(A_d); cudaFree(A_dflat); free(A);
	cudaFree(C_d); cudaFree(C_dflat); free(C);
    	cudaFree(P); cudaFree(INFO); cublasDestroy_v2(handle);
}



//
//



__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width) 
{
    //2D Thread ID
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    //Pvalue stores the Pd element that is computed by the thread
    float Pvalue = 0;
	if(col<Width && row < Width)
	{
		    for(int k = 0; k < Width ; ++k) 
		    {
		        float Mdelement = Md[row*Width + k];
		        float Ndelement = Nd[k*Width + col];
		        Pvalue += (Mdelement*Ndelement);
		
		    }
		    Pd[row*Width + col] = Pvalue;
	}
}



void mul(float* M,float* N,int Width)
{
		
	float * P = (float *) malloc(Width*Width*sizeof(float));
	float *Md, *Nd, *Pd;



	unsigned long int size = Width*Width*sizeof(float);
  

    //Transfer M and N to device memory
    	cudaMalloc((void**)&Md, size);
    	cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);

    	cudaMalloc((void**)&Nd, size);
    	cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);

    	//Allocate P on the device
    	cudaMalloc((void**)&Pd,size);

    	//Setup the execution configuration
    	dim3 dimBlock(Width,Width);
    	dim3 dimGrid(1,1);


	if (Width*Width > 1024)
	{
		//printf("\n\n enter inside if condi\n\n");
		
		dimGrid.x = (Width-1)/32+1;
        	dimGrid.y = (Width-1)/32+1;
	
		dimBlock.x = 32;
	        dimBlock.y = 32;



	}

     
    //Launch the device computation threads!
	MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

    //Transfer P from device to host
    	cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);

    //Free device matrices
    	cudaFree(Md);
    	cudaFree(Nd);
    	cudaFree(Pd);

	int i;

	fprintf(stdout,"\n\n");

	if(Width<11)
	{


		fprintf(stdout,"\n\nMatrix Multiplication, M x Inv(M) :\n\n");
		for(i = 0; i < Width*Width; i++)
   		{
			if(P[i])
				fprintf(stdout,"%10f ",P[i]) ;
			else
				fprintf(stdout,"%9f ",P[i]) ;
	        	



			if((i+1)%Width==0)
				fprintf(stdout,"\n");
		}
  

	}
	else
	{
		FILE *fp;	
	
		fp = fopen("Mat_Inv_out", "a");

 		if (!fp) 
		{
	    		fprintf(stderr, "Failed to open matAdata.\n");
	    		exit(1);
	  	}
		fprintf(fp,"\n\nMatrix Multiplication, M x Inv(M) :\n\n");
	 	for(i = 0; i < Width*Width; i++)
   		{	if(P[i])
				fprintf(fp,"%10f ",P[i]) ;
			else
				fprintf(fp,"%9f ",P[i]) ;
			
		        if((i+1)%Width==0)
				fprintf(fp,"\n");
		}
    		fclose(fp);
	}

	
	//printf("\n Matrix multiplication completed !!\n\n"); 
	free(M);
	free(N);
	free(P);

}



//
//



void fill(float* h, int w)
{ 
	
	unsigned int i, num;
	int divide;
	FILE *f;

	f=fopen("/dev/urandom", "r");
	if (!f) {
        	fprintf(stderr, "Failed open file\n");
        	exit(1);
    	}
	for(i=0; i< w*w; i++)
	{
		fread(&num, sizeof(unsigned int), 1, f);
		fread(&divide, sizeof(int), 1, f);
		h[i] = ((float)num)/((float)divide);
		//scanf("%f",&h[i]);
	}
	fclose(f);
/*
	unsigned int i;
	srand((unsigned int)time(NULL));
	for(i=0; i< w*w; i++)
	{
		h[i] = ((float)rand()/(float)(RAND_MAX)) * 99;
		//scanf("%f",&h[i]);
	}
	
*/

} 


//
//


void test_invert(int n )
{
    	
	//printf("Enter the order of the square matrix :");
	//scanf("%d",&n);
    	const int mybatch = 1;


	//float* mat1[n * n];
	float mat1_size = sizeof(float) * n * n;
    	float* mat1 = (float*) malloc(mat1_size);

	fill(mat1, n);

    	float *result_flat = (float *)malloc(mybatch*n*n*sizeof(float));
    	float **results = (float **)malloc(mybatch*sizeof(float *));

    	for (int i = 0; i < mybatch; i++)
      		results[i] = result_flat + (i*n*n);

    	float **inputs = (float **)malloc(mybatch*sizeof(float *));
    	
	//inputs[0]  = zero_pivot;

	inputs[0]  = mat1;


	invert(inputs, results, n, mybatch);

	if(n<11)
	{

		for (int qq = 0; qq < mybatch; qq++)
		{
	      		if(mybatch==1)
				fprintf(stdout, "Input Matrix, M :\n\n");
			else
				fprintf(stdout, "Input Matrix %d:\n\n", qq);
			
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)
				{	
					if(inputs[qq][i*n+j])
		            			fprintf(stdout,"%12f ",inputs[qq][i*n+j]);
					else
						fprintf(stdout,"%11f ",inputs[qq][i*n+j]);
				}
	        			fprintf(stdout,"\n");
	      		}
	    	}
	    	fprintf(stdout,"\n\n");




	    	for (int qq = 0; qq < mybatch; qq++)
		{

			if(mybatch==1)
				fprintf(stdout, "Inverse of the Input Matrix, Inv(M):\n\n");
			else
				fprintf(stdout, "Inverse Matrix %d:\n\n", qq);
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)
				{
					if(results[qq][i*n+j])
		            			fprintf(stdout,"%10f ",results[qq][i*n+j]);
					else
		            			fprintf(stdout,"%9f ",results[qq][i*n+j]);
	        		
				}
				fprintf(stdout,"\n");
	      		}
	    	}
	}


	else // order of the matrix is more than 10 x 10 then output the results in the file
	{
		printf("\nThe order of matrix is too large to display in terminal\n, Please open the file : Mat_Inv_out.txt located in the current folder. To see the output.\n\n");
		
		FILE *fp;


 		fp = fopen("Mat_Inv_out", "w");

 		if (!fp) 
		{
    			fprintf(stderr, "Failed to open Mat_Inv_out.\n");
		    	exit(1);
  		}



		for (int qq = 0; qq < mybatch; qq++)
		{

			if(mybatch==1)
				fprintf(fp, "Input Matrix , M:\n\n");
			else
				fprintf(fp, "Input Matrix %d:\n\n", qq);


	      	
			
	      		for(int i=0; i<n; i++)
      			{
        			for(int j=0; j<n; j++)
				{
					if(inputs[qq][i*n+j])
		            			fprintf(fp,"%12f ",inputs[qq][i*n+j]);
					else
						fprintf(fp,"%11f ",inputs[qq][i*n+j]);
				}
		            		
        			fprintf(fp,"\n");
	      		}
    		}
	    	fprintf(fp,"\n\n");

		for (int qq = 0; qq < mybatch; qq++)
		{
			if(mybatch==1)
				fprintf(fp, "Inverse of the Input Matrix, Inv(M):\n\n");
	      		else
				fprintf(fp, "Inverse %d:\n\n", qq);
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)	
				{
					if(results[qq][i*n+j])
		            			fprintf(fp,"%10f ",results[qq][i*n+j]);
					else
		            			fprintf(fp,"%9f ",results[qq][i*n+j]);
	        		
				}

	        		fprintf(fp,"\n");
	      		}
	    	}

		fclose(fp);
			
	}// end of if else condition for output

	float *A, *B;

	A=inputs[0];
	B=results[0];
	mul(A, B, n );

	//mul(inputs[0][], results[0][], n );

}


//
//


int main(int argc, char *argv[])
{
	if(argc!=2)
	{
		printf("Usage: %s <matrix_width>\n", argv[0]);
		return 0;
	}

	int w;
	w = atoi( argv[1] );
	
   	test_invert(w);
    	return 0;
}

/*
$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540
Input 0:
0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000
Input 1:
0.500000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000
Input 2:
0.000000        3.000000        4.000000
1.000000        5.000000        6.000000
9.000000        8.000000        2.000000
Input 3:
22.000000       3.000000        4.000000
1.000000        5.000000        6.000000
9.000000        8.000000        2.000000
Inverse 0:
-0.700000       -0.200000       0.300000
0.400000        -0.266667       0.066667
-0.050000       0.200000        -0.050000
Inverse 1:
-1.076923       -0.307692       0.461538
0.615385        -0.205128       -0.025641
-0.076923       0.192308        -0.038462
Inverse 2:
-4.750000       3.250000        -0.250000
6.500000        -4.500000       0.500000
-4.625000       3.375000        -0.375000
Inverse 3:
0.045894        -0.031401       0.002415
-0.062802       -0.009662       0.154589
0.044686        0.179952        -0.129227
$
$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540 
Enter the order of the aquare matrix :4
Input 0:
-0.100222 -2.553872 -69.072723 0.016120 
-2.752346 -1.230871 1.997387 0.606710 
-0.029929 -0.583444 2.733107 0.254404 
-1.844285 -0.070541 1.906255 10.758991 
Inverse 0:
0.017501	-0.374555	0.713068	0.004234	
-0.056876	-0.005437	-1.457745	0.034861	
-0.012399	0.000729	0.052888	-0.001273	
0.004824	-0.064370	0.103305	0.094125	
*/

//mat_inv_userInput.cu
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include<math.h>


#define cudacall(call)                                                                                                          \
    do                                                                                                                          \
    {                                                                                                                           \
        cudaError_t err = (call);                                                                                               \
        if(cudaSuccess != err)                                                                                                  \
        {                                                                                                                       \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
            cudaDeviceReset();                                                                                                  \
            exit(EXIT_FAILURE);                                                                                                 \
        }                                                                                                                       \
    }                                                                                                                           \
    while (0)

#define cublascall(call)                                                                                        \
    do                                                                                                          \
    {                                                                                                           \
        cublasStatus_t status = (call);                                                                         \
        if(CUBLAS_STATUS_SUCCESS != status)                                                                     \
        {                                                                                                       \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset();                                                                                  \
            exit(EXIT_FAILURE);                                                                                 \
        }                                                                                                       \
                                                                                                                \
    }                                                                                                           \
    while(0)


//
//

void invert(float** src, float** dst, int n, int batchSize)
{
    	cublasHandle_t handle;
    	cublascall(cublasCreate_v2(&handle));

    	int *P, *INFO;

    	cudacall(cudaMalloc(&P, n * batchSize * sizeof(int)));
    	cudacall(cudaMalloc(&INFO,  batchSize * sizeof(int)));

    	int lda = n;

    	float **A = (float **)malloc(batchSize*sizeof(float *));
    	float **A_d, *A_dflat;

    	cudacall(cudaMalloc(&A_d,batchSize*sizeof(float *)));
    	cudacall(cudaMalloc(&A_dflat, n*n*batchSize*sizeof(float)));

	A[0] = A_dflat;
    	for (int i = 1; i < batchSize; i++)
      		A[i] = A[i-1]+(n*n);

    	cudacall(cudaMemcpy(A_d,A,batchSize*sizeof(float *),cudaMemcpyHostToDevice));
   
 	for (int i = 0; i < batchSize; i++)
      		cudacall(cudaMemcpy(A_dflat+(i*n*n), src[i], n*n*sizeof(float), cudaMemcpyHostToDevice));


    	cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));


    	int INFOh[batchSize];
    	cudacall(cudaMemcpy(INFOh,INFO,batchSize*sizeof(int),cudaMemcpyDeviceToHost));

    	for (int i = 0; i < batchSize; i++)
      		if(INFOh[i]  != 0)
      		{
        		fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
        		cudaDeviceReset();
        		exit(EXIT_FAILURE);
      		}

    	float **C = (float **)malloc(batchSize*sizeof(float *));
    	float **C_d, *C_dflat;

    	cudacall(cudaMalloc(&C_d,batchSize*sizeof(float *)));
    	cudacall(cudaMalloc(&C_dflat, n*n*batchSize*sizeof(float)));
    	C[0] = C_dflat;
    	for (int i = 1; i < batchSize; i++)
      		C[i] = C[i-1] + (n*n);
    	cudacall(cudaMemcpy(C_d,C,batchSize*sizeof(float *),cudaMemcpyHostToDevice));
    	cublascall(cublasSgetriBatched(handle,n,(const float **)A_d,lda,P,C_d,lda,INFO,batchSize));

    	cudacall(cudaMemcpy(INFOh,INFO,batchSize*sizeof(int),cudaMemcpyDeviceToHost));

    	for (int i = 0; i < batchSize; i++)
	      	if(INFOh[i] != 0)
      		{
        		fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
        		cudaDeviceReset();
      			exit(EXIT_FAILURE);
      		}
    	for (int i = 0; i < batchSize; i++)
      	cudacall(cudaMemcpy(dst[i], C_dflat + (i*n*n), n*n*sizeof(float), cudaMemcpyDeviceToHost));
    	
	cudaFree(A_d); cudaFree(A_dflat); free(A);
	cudaFree(C_d); cudaFree(C_dflat); free(C);
    	cudaFree(P); cudaFree(INFO); cublasDestroy_v2(handle);
}



//
//



__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width) 
{
    //2D Thread ID
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    int row = blockIdx.y*blockDim.y+threadIdx.y;

    //Pvalue stores the Pd element that is computed by the thread
    float Pvalue = 0;
	if(col<Width && row < Width)
	{
		    for(int k = 0; k < Width ; ++k) 
		    {
		        float Mdelement = Md[row*Width + k];
		        float Ndelement = Nd[k*Width + col];
		        Pvalue += (Mdelement*Ndelement);
		
		    }
		    Pd[row*Width + col] = Pvalue;
	}
}



void mul(float* M,float* N,int Width)
{
		
	float * P = (float *) malloc(Width*Width*sizeof(float));
	float *Md, *Nd, *Pd;



	unsigned long int size = Width*Width*sizeof(float);
  

    //Transfer M and N to device memory
    	cudaMalloc((void**)&Md, size);
    	cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);

    	cudaMalloc((void**)&Nd, size);
    	cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);

    	//Allocate P on the device
    	cudaMalloc((void**)&Pd,size);

    	//Setup the execution configuration
    	dim3 dimBlock(Width,Width);
    	dim3 dimGrid(1,1);


	if (Width*Width > 1024)
	{
		//printf("\n\n enter inside if condi\n\n");
		
		dimGrid.x = (Width-1)/32+1;
        	dimGrid.y = (Width-1)/32+1;
	
		dimBlock.x = 32;
	        dimBlock.y = 32;



	}

     
    //Launch the device computation threads!
	MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

    //Transfer P from device to host
    	cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);

    //Free device matrices
    	cudaFree(Md);
    	cudaFree(Nd);
    	cudaFree(Pd);

	int i;

	fprintf(stdout,"\n\n");

	if(Width<11)
	{


		fprintf(stdout,"\n\nMatrix Multiplication, M x Inv(M) :\n\n");
		for(i = 0; i < Width*Width; i++)
   		{
			if(P[i])
				fprintf(stdout,"%10f ",P[i]) ;
			else
				fprintf(stdout,"%9f ",P[i]) ;
	        	



			if((i+1)%Width==0)
				fprintf(stdout,"\n");
		}
  

	}
	else
	{
		FILE *fp;	
	
		fp = fopen("Mat_Inv_out", "a");

 		if (!fp) 
		{
	    		fprintf(stderr, "Failed to open matAdata.\n");
	    		exit(1);
	  	}
		fprintf(fp,"\n\nMatrix Multiplication, M x Inv(M) :\n\n");
	 	for(i = 0; i < Width*Width; i++)
   		{	if(P[i])
				fprintf(fp,"%10f ",P[i]) ;
			else
				fprintf(fp,"%9f ",P[i]) ;
			
		        if((i+1)%Width==0)
				fprintf(fp,"\n");
		}
    		fclose(fp);
	}

	
	//printf("\n Matrix multiplication completed !!\n\n"); 
	free(M);
	free(N);
	free(P);

}



//
//



void fill(float* h, int w)
{ 
	unsigned int i;
	for(i=0; i< w*w; i++)
	{
		scanf("%f",&h[i]);
	}
	


} 


//
//


void test_invert( )
{
	int n;
    	
	printf("Enter the order of the square matrix :");
	scanf("%d",&n);
    	const int mybatch = 1;


	float mat1_size = sizeof(float) * n * n;
    	float* mat1 = (float*) malloc(mat1_size);

	fill(mat1, n);

    	float *result_flat = (float *)malloc(mybatch*n*n*sizeof(float));
    	float **results = (float **)malloc(mybatch*sizeof(float *));

    	for (int i = 0; i < mybatch; i++)
      		results[i] = result_flat + (i*n*n);

    	float **inputs = (float **)malloc(mybatch*sizeof(float *));
    	
	//inputs[0]  = zero_pivot;

	inputs[0]  = mat1;


	invert(inputs, results, n, mybatch);

	if(n<11)
	{

		for (int qq = 0; qq < mybatch; qq++)
		{
	      		if(mybatch==1)
				fprintf(stdout, "Input Matrix, M :\n\n");
			else
				fprintf(stdout, "Input Matrix %d:\n\n", qq);
			
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)
				{	
					if(inputs[qq][i*n+j])
		            			fprintf(stdout,"%12f ",inputs[qq][i*n+j]);
					else
						fprintf(stdout,"%11f ",inputs[qq][i*n+j]);
				}
	        			fprintf(stdout,"\n");
	      		}
	    	}
	    	fprintf(stdout,"\n\n");




	    	for (int qq = 0; qq < mybatch; qq++)
		{

			if(mybatch==1)
				fprintf(stdout, "Inverse of the Input Matrix, Inv(M):\n\n");
			else
				fprintf(stdout, "Inverse Matrix %d:\n\n", qq);
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)
				{
					if(results[qq][i*n+j])
		            			fprintf(stdout,"%10f ",results[qq][i*n+j]);
					else
		            			fprintf(stdout,"%9f ",results[qq][i*n+j]);
	        		
				}
				fprintf(stdout,"\n");
	      		}
	    	}
	}


	else // order of the matrix is more than 10 x 10 then output the results in the file
	{
		printf("\nThe order of matrix is too large to display in terminal\n, Please open the file : Mat_Inv_out.txt located in the current folder. To see the output.\n\n");
		
		FILE *fp;


 		fp = fopen("Mat_Inv_out", "w");

 		if (!fp) 
		{
    			fprintf(stderr, "Failed to open Mat_Inv_out.\n");
		    	exit(1);
  		}



		for (int qq = 0; qq < mybatch; qq++)
		{

			if(mybatch==1)
				fprintf(fp, "Input Matrix , M:\n\n");
			else
				fprintf(fp, "Input Matrix %d:\n\n", qq);


	      	
			
	      		for(int i=0; i<n; i++)
      			{
        			for(int j=0; j<n; j++)
				{
					if(inputs[qq][i*n+j])
		            			fprintf(fp,"%12f ",inputs[qq][i*n+j]);
					else
						fprintf(fp,"%11f ",inputs[qq][i*n+j]);
				}
		            		
        			fprintf(fp,"\n");
	      		}
    		}
	    	fprintf(fp,"\n\n");

		for (int qq = 0; qq < mybatch; qq++)
		{
			if(mybatch==1)
				fprintf(fp, "Inverse of the Input Matrix, Inv(M):\n\n");
	      		else
				fprintf(fp, "Inverse %d:\n\n", qq);
	      		for(int i=0; i<n; i++)
	      		{
	        		for(int j=0; j<n; j++)	
				{
					if(results[qq][i*n+j])
		            			fprintf(fp,"%10f ",results[qq][i*n+j]);
					else
		            			fprintf(fp,"%9f ",results[qq][i*n+j]);
	        		
				}

	        		fprintf(fp,"\n");
	      		}
	    	}

		fclose(fp);
			
	}// end of if else condition for output

float *A, *B;

A=inputs[0];
B=results[0];
mul(A, B, n );

//mul(inputs[0][], results[0][], n );

}

//
//


int main()
{
	
	
   	test_invert();
    	return 0;
}

/*
$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540
Input 0:
0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000
Input 1:
0.500000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000
Input 2:
0.000000        3.000000        4.000000
1.000000        5.000000        6.000000
9.000000        8.000000        2.000000
Input 3:
22.000000       3.000000        4.000000
1.000000        5.000000        6.000000
9.000000        8.000000        2.000000
Inverse 0:
-0.700000       -0.200000       0.300000
0.400000        -0.266667       0.066667
-0.050000       0.200000        -0.050000
Inverse 1:
-1.076923       -0.307692       0.461538
0.615385        -0.205128       -0.025641
-0.076923       0.192308        -0.038462
Inverse 2:
-4.750000       3.250000        -0.250000
6.500000        -4.500000       0.500000
-4.625000       3.375000        -0.375000
Inverse 3:
0.045894        -0.031401       0.002415
-0.062802       -0.009662       0.154589
0.044686        0.179952        -0.129227
$
$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540 
Enter the order of the aquare matrix :4
Input 0:
-0.100222 -2.553872 -69.072723 0.016120 
-2.752346 -1.230871 1.997387 0.606710 
-0.029929 -0.583444 2.733107 0.254404 
-1.844285 -0.070541 1.906255 10.758991 
Inverse 0:
0.017501	-0.374555	0.713068	0.004234	
-0.056876	-0.005437	-1.457745	0.034861	
-0.012399	0.000729	0.052888	-0.001273	
0.004824	-0.064370	0.103305	0.094125	
*/

#Makefile
default: all
.PHONY: default
	
all:
	nvcc -arch=sm_70 -o mat_inv mat_inv.cu -lcublas -lm
.PHONY: all

user: 
	nvcc -arch=sm_20 -o user_mat_inv mat_inv_userInput.cu -lcublas -lm

clean:
	rm mat_inv user_mat_inv
.PHONY: all

$make

$./mat_inv 3

std out:

Input Matrix, M :

   -3.662523    -1.507356     1.094427
   -0.853989     1.768155    45.548477
   -5.994538     6.792342  -323.404816


Inverse of the Input Matrix, Inv(M):

 -0.217016  -0.118222  -0.017385
 -0.135258   0.293317   0.040853
  0.001182   0.008352  -0.001912




Matrix Multiplication, M x Inv(M) :

  1.000000   0.000000  -0.000000
  0.000000   1.000000  -0.000000
  0.000000  -0.000000   1.000000

Eloudy

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
一个使用cublasSgetrfBatched计算逆矩阵的示例

//mat_inv.cu#include <stdio.h>#include <stdlib.h>#include <cublas_v2.h>#include<math.h>#define cudacall(call) \ ...
复制链接

扫一扫

专栏目录