在 cuSOLVER 中,cusolverStatus_t 有一个返回值为
CUSOLVER_STATUS_NOT_INITIALIZED。
这个值是在使用 cuSOLVER 的 math API 时,忘记先调用 cusolverDnCreate() 创建 handle 导致的;
但是,如果对一个空指针调用 cusolverDnDestroy(),程序会直接挂掉,报 segmentation fault,而不会返回 CUSOLVER_STATUS_NOT_INITIALIZED;
1. 返回 CUSOLVER_STATUS_NOT_INITIALIZED 的示例
测试代码:
des.cpp
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main()
{
cusolverStatus_t st = CUSOLVER_STATUS_SUCCESS;
cusolverDnHandle_t handle = nullptr;
cusolverDnCreate(&handle);
st= cusolverDnDestroy(handle);
printf("st1=%d\n", st);
st= cusolverDnDestroy(nullptr);
printf("st2=%d\n", st);
printf("handle=%X, CUSOLVER_STATUS_NOT_INITIALIZED=%d\n",
handle, CUSOLVER_STATUS_NOT_INITIALIZED);
return 0;
}
Makefile:
# Programs built by this Makefile; each target X is compiled from X.cpp.
TARGETS = des

# Default goal: build every program listed in TARGETS.
all: $(TARGETS)

# CUDA libraries needed at link time.
LD_FLAGS = -L/usr/local/cuda/lib64 \
           -lcudart -lcudadevrt \
           -lcusolver -lcublas \
           -lcublasLt -lpthread

# Pattern rule: build an executable from the .cpp file of the same name.
# Recipe lines must begin with a TAB character, otherwise make reports
# "missing separator".
%: %.cpp
	g++ -o $@ $< -I/usr/local/cuda/include $(LD_FLAGS) -fopenmp

# Neither goal produces a file of its own name.
.PHONY: all clean
clean:
	-rm -f $(TARGETS)
效果图:
2. 在 cusolver的math API 函数中使用nullptr handle
效果相似
#include <cblas.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
// Order of the square coefficient matrix (the system solved is N x N).
#define N 8192
// Nanoseconds per second, used to convert timespec.tv_nsec to seconds.
// No trailing semicolon: the macro must stay usable inside larger expressions.
#define BILLION 1000000000L
using namespace std;
int main (int argc , char * argv [])
{
struct timespec start , stop ; // variables for timing
double accum ; // elapsed time variable
cublasStatus_t stat ;
cudaError cudaStatus ;
cusolverStatus_t cusolverStatus ;
cusolverDnHandle_t handle = nullptr;
// declare arrays on the host
float *A, *B1 , *B; // A - NxN matrix , B1 - auxiliary N- vect .
// B=A*B1 - N- vector of rhs , all on the host
// declare arrays on the device
float *d_A , *d_B , * d_Work ; // coeff . matrix , rhs , workspace
int * d_pivot , *d_info , Lwork ; // pivots , info , worksp . size
int info_gpu = 0;
// prepare memory on the host
A = ( float *) malloc(N*N* sizeof( float ));
B = ( float *) malloc(N* sizeof( float ));
B1 = ( float *) malloc(N* sizeof( float ));
for(int i=0;i<N*N;i++) A[i]= rand()/( float ) RAND_MAX ;// A- rand
for(int i=0;i<N;i++) B[i] = 0.0; // initialize B
for(int i=0;i<N;i++) B1[i] = 1.0; // B1 - N- vector of ones
float al =1.0 , bet =0.0; // coefficients for sgemv
int incx =1, incy =1;
cblas_sgemv( CblasColMajor , CblasNoTrans ,N,N,al ,A,N,B1 ,incx ,bet ,B, incy ); // multiply B=A*B1
cudaStatus = cudaGetDevice(0);
cusolverStatus = cusolverDnCreate(& handle );
// prepare memory on the device
cudaStatus = cudaMalloc(( void **)& d_A ,N*N* sizeof( float ));
cudaStatus = cudaMalloc(( void **)& d_B , N* sizeof( float ));
cudaStatus = cudaMalloc(( void **)& d_pivot , N* sizeof(int ));
cudaStatus = cudaMalloc(( void **)& d_info , sizeof(int ));
cudaStatus = cudaMemcpy(d_A , A, N*N* sizeof( float ), cudaMemcpyHostToDevice ); // copy d_A <-A
cudaStatus = cudaMemcpy(d_B , B, N* sizeof( float ), cudaMemcpyHostToDevice ); // copy d_B <-B
cusolverStatus = cusolverDnSgetrf_bufferSize( handle , N, N, d_A , N, & Lwork ); // compute buffer size and prep . memory
printf("cu 1 status = %d\n", cusolverStatus);
cudaStatus = cudaMalloc(( void **)& d_Work , Lwork * sizeof( float ));
clock_gettime( CLOCK_REALTIME ,& start ); // timer start
// LU factorization of d_A , with partial pivoting and row
// interchanges ; row i is interchanged with row d_pivot (i);
cusolverStatus = cusolverDnSgetrf(handle,N,N,d_A,N,d_Work,d_pivot, d_info);
//LL::cusolverStatus = cusolverDnSgetrf((cusolverDnHandle_t)nullptr,N,N,d_A,N,d_Work,d_pivot, d_info);
printf("cu 2 status = %d\n", cusolverStatus);
// use the LU factorization to solve the system d_A *x= d_B ;
// the solution overwrites d_B
cusolverStatus = cusolverDnSgetrs(handle, CUBLAS_OP_N, N, 1,d_A, N, d_pivot, d_B,N, d_info);
cudaStatus = cudaDeviceSynchronize ();
clock_gettime( CLOCK_REALTIME ,&stop ); // timer stop
// elapsed time
accum = ( stop.tv_sec - start.tv_sec )+ ( stop.tv_nsec - start.tv_nsec )/( double )BILLION ;
printf(" getrf + getrs time : %lf sec .\n",accum ); // print el. time
cudaStatus = cudaMemcpy(&info_gpu , d_info , sizeof( int ), cudaMemcpyDeviceToHost ); // d_info -> info_gpu
printf(" after getrf + getrs : info_gpu = %d\n", info_gpu );
cudaStatus = cudaMemcpy(B1 , d_B , N* sizeof( float ), cudaMemcpyDeviceToHost ); // copy d_B ->B1
printf(" solution : ");
for(int i = 0; i < 5; i ++)
printf("%g, ", B1[i ]);
printf(" ... "); // print first components of the solution
printf("\n");
// free memory
cudaStatus = cudaFree( d_A );
cudaStatus = cudaFree( d_B );
cudaStatus = cudaFree( d_pivot );
cudaStatus = cudaFree( d_info );
cudaStatus = cudaFree( d_Work );
free(A); free(B); free(B1 );
cusolverStatus = cusolverDnDestroy( handle );
cudaStatus = cudaDeviceReset();
return 0;
}
// getrf + getrs time : 0.267574 sec .
// after getrf + getrs : info_gpu = 0
// solution : 1.04225 , 0.873826 , 1.05703 , 1.03822 , 0.883831 , ...
Makefile:
# Programs built by this Makefile; each target X is compiled from X.cpp.
TARGETS = des lu

# Default goal: build every program listed in TARGETS.
all: $(TARGETS)

# CUDA libraries needed at link time.
LD_FLAGS = -L/usr/local/cuda/lib64 \
           -lcudart -lcudadevrt \
           -lcusolver -lcublas \
           -lcublasLt -lpthread

# Pattern rule: build an executable from the .cpp file of the same name.
# The CBLAS / BLAS / gfortran flags are passed to every target.
# Recipe lines must begin with a TAB character, otherwise make reports
# "missing separator".
%: %.cpp
	g++ -o $@ $< -I/usr/local/cuda/include $(LD_FLAGS) -fopenmp -I./cblas_source -L./cblas_source/CBLAS/lib -lcblas_LINUX -L/usr/local/lib -lblas -lgfortran

# Neither goal produces a file of its own name.
.PHONY: all clean
clean:
	-rm -f $(TARGETS)
handle 为 nullptr时的效果: