#include <iostream>
#include<malloc.h>
#include "cuda_runtime.h"
#include <cusparse_v2.h>
#include "assert.h"
// Error-checking helpers for CUDA runtime and cuSPARSE status codes.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err_) << std::endl;               \
            assert(false);                                                    \
        }                                                                     \
    } while (0)
#define CHECK_CUSPARSE(call)                                                  \
    do {                                                                      \
        cusparseStatus_t st_ = (call);                                        \
        if (st_ != CUSPARSE_STATUS_SUCCESS) {                                 \
            std::cerr << "cuSPARSE error " << __FILE__ << ":" << __LINE__     \
                      << ": status " << static_cast<int>(st_) << std::endl;   \
            assert(false);                                                    \
        }                                                                     \
    } while (0)

// Computes the sparse-sparse product C = A * B with cuSPARSE SpGEMM.
//
// A is (rows_num_A x cols_num_A) with nonzero_num_A entries, B is
// (rows_num_B x cols_num_B) with nonzero_num_B entries; both are device-side
// CSR arrays with 32-bit, zero-based indices. On return, csrVal_C /
// csrColInd_C / csrRowPtr_C point to freshly cudaMalloc'd device arrays
// holding C in CSR form (caller owns and must cudaFree them).
//
// Only T = float (and T = double) are supported: the cuSPARSE data type is
// derived from sizeof(T) below.
//
// Returns the number of nonzeros in C (narrowed from int64_t; asserts that
// the count fits in an int).
template<typename T>
int csrmatrix_mul_csrmatrix(const int rows_num_A, const int cols_num_A,
const int rows_num_B, const int cols_num_B,
const int nonzero_num_A, const int nonzero_num_B,
T *csrVal_A, int *csrColInd_A, int * csrRowPtr_A,
T *csrVal_B, int *csrColInd_B, int * csrRowPtr_B,
T *&csrVal_C, int *&csrColInd_C, int *&csrRowPtr_C){
    // Inner dimensions must agree for A * B to be defined.
    assert(cols_num_A == rows_num_B);
    const T alpha = 1;
    const T beta = 0;
    // Pick the cuSPARSE value/compute type from the template parameter
    // instead of hard-coding CUDA_R_32F (which is wrong for T = double).
    const cudaDataType valueType =
        (sizeof(T) == sizeof(double)) ? CUDA_R_64F : CUDA_R_32F;
    const cudaDataType computeType = valueType;
    // The row-pointer array of C always has rows_num_A + 1 entries and can be
    // allocated up front; values/column indices are allocated after SpGEMM
    // reports the nonzero count.
    CHECK_CUDA(cudaMalloc((void**) &csrRowPtr_C, (rows_num_A + 1) * sizeof(int)));
    //--------------------------------------------------------------------------
    // CUSPARSE APIs
    cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseHandle_t handle = NULL;
    CHECK_CUSPARSE(cusparseCreate(&handle));
    cusparseSpMatDescr_t matA, matB, matC;
    void* dBuffer1 = NULL, *dBuffer2 = NULL;
    size_t bufferSize1 = 0, bufferSize2 = 0;
    // Create sparse matrices A and B in CSR format; C starts empty and is
    // filled in by cusparseSpGEMM_copy below.
    CHECK_CUSPARSE(cusparseCreateCsr(&matA, rows_num_A, cols_num_A, nonzero_num_A,
                                     csrRowPtr_A, csrColInd_A, csrVal_A,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    CHECK_CUSPARSE(cusparseCreateCsr(&matB, rows_num_B, cols_num_B, nonzero_num_B,
                                     csrRowPtr_B, csrColInd_B, csrVal_B,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    CHECK_CUSPARSE(cusparseCreateCsr(&matC, rows_num_A, cols_num_B, 0, NULL, NULL, NULL,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    //--------------------------------------------------------------------------
    // SpGEMM computation: each phase is called twice, first with a NULL buffer
    // to query the size, then with the allocated buffer to do the work.
    cusparseSpGEMMDescr_t spgemmDesc;
    CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDesc));
    // Ask how many bytes of workspace the estimation phase needs.
    CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                                 &alpha, matA, matB, &beta, matC,
                                                 computeType, CUSPARSE_SPGEMM_DEFAULT,
                                                 spgemmDesc, &bufferSize1, NULL));
    CHECK_CUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    // Inspect A and B to understand the memory requirement for the next step.
    CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                                 &alpha, matA, matB, &beta, matC,
                                                 computeType, CUSPARSE_SPGEMM_DEFAULT,
                                                 spgemmDesc, &bufferSize1, dBuffer1));
    // Ask how many bytes of workspace the compute phase needs.
    CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
                                          &alpha, matA, matB, &beta, matC,
                                          computeType, CUSPARSE_SPGEMM_DEFAULT,
                                          spgemmDesc, &bufferSize2, NULL));
    CHECK_CUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    // Compute the intermediate product of A * B.
    CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
                                          &alpha, matA, matB, &beta, matC,
                                          computeType, CUSPARSE_SPGEMM_DEFAULT,
                                          spgemmDesc, &bufferSize2, dBuffer2));
    // Query the nonzero count of C, then allocate its value/index arrays.
    int64_t C_num_rows1, C_num_cols1, nonzero_num_C;
    CHECK_CUSPARSE(cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1,
                                        &nonzero_num_C));
    CHECK_CUDA(cudaMalloc((void**) &csrColInd_C, nonzero_num_C * sizeof(int)));
    CHECK_CUDA(cudaMalloc((void**) &csrVal_C, nonzero_num_C * sizeof(T)));
    // Point matC at the caller-visible output arrays.
    CHECK_CUSPARSE(cusparseCsrSetPointers(matC, csrRowPtr_C, csrColInd_C, csrVal_C));
    // Copy the computed product into C (beta == 0, so values are overwritten).
    CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, opA, opB,
                                       &alpha, matA, matB, &beta, matC,
                                       computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc));
    // Release temporary workspaces, descriptors, and the library handle —
    // the original code leaked all of these.
    CHECK_CUDA(cudaFree(dBuffer1));
    CHECK_CUDA(cudaFree(dBuffer2));
    CHECK_CUSPARSE(cusparseSpGEMM_destroyDescr(spgemmDesc));
    CHECK_CUSPARSE(cusparseDestroySpMat(matA));
    CHECK_CUSPARSE(cusparseDestroySpMat(matB));
    CHECK_CUSPARSE(cusparseDestroySpMat(matC));
    CHECK_CUSPARSE(cusparseDestroy(handle));
    // Guard against silent narrowing of the 64-bit nonzero count.
    assert(nonzero_num_C <= 2147483647);
    return static_cast<int>(nonzero_num_C);
}
// Small smoke test: multiplies a 2x3 dense-as-CSR matrix A by a 3x3 diagonal
// CSR matrix B = diag(1, 2, 1) and prints C = A * B in CSR form.
int main() {
    int rows_num_A = 2;
    int cols_num_A = 3;          // A is 2x3 (column indices below run 0..2)
    int rows_num_B = 3;
    int cols_num_B = 3;          // B is 3x3
    int nonzero_num_A = 6;
    int nonzero_num_B = 3;
    // Host-side CSR data: A = [[1,2,3],[1,1,1]], B = diag(1,2,1).
    float *h_csrVal_A = new float[nonzero_num_A]{1, 2, 3, 1, 1, 1},
          *h_csrVal_B = new float[nonzero_num_B]{1, 2, 1};
    int *h_csrColInd_A = new int[nonzero_num_A]{0, 1, 2, 0, 1, 2},
        *h_csrColInd_B = new int[nonzero_num_B]{0, 1, 2};
    int *h_csrRowPtr_A = new int[rows_num_A + 1]{0, 3, 6},
        *h_csrRowPtr_B = new int[rows_num_B + 1]{0, 1, 2, 3};
    // Device-side copies of A and B; C's arrays are allocated by the callee.
    float *csrVal_A, *csrVal_B;
    int *csrColInd_A, *csrColInd_B;
    int *csrRowPtr_A, *csrRowPtr_B;
    cudaMalloc(&csrVal_A, nonzero_num_A * sizeof(float));
    cudaMalloc(&csrVal_B, nonzero_num_B * sizeof(float));
    cudaMalloc(&csrColInd_A, nonzero_num_A * sizeof(int));
    cudaMalloc(&csrColInd_B, nonzero_num_B * sizeof(int));
    cudaMalloc(&csrRowPtr_A, (rows_num_A + 1) * sizeof(int));
    cudaMalloc(&csrRowPtr_B, (rows_num_B + 1) * sizeof(int));
    cudaMemcpy(csrVal_A, h_csrVal_A, nonzero_num_A * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrVal_B, h_csrVal_B, nonzero_num_B * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColInd_A, h_csrColInd_A, nonzero_num_A * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColInd_B, h_csrColInd_B, nonzero_num_B * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtr_A, h_csrRowPtr_A, (rows_num_A + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtr_B, h_csrRowPtr_B, (rows_num_B + 1) * sizeof(int), cudaMemcpyHostToDevice);
    float *csrVal_C = nullptr;
    int *csrColInd_C = nullptr, *csrRowPtr_C = nullptr;
    // BUGFIX: the original call passed nonzero_num_A (6) for B's nonzero
    // count; B has nonzero_num_B (3) entries. Dimensions are now passed
    // explicitly as (rows_A, cols_A, rows_B, cols_B).
    int nonzero_num_C = csrmatrix_mul_csrmatrix(rows_num_A, cols_num_A,
                                                rows_num_B, cols_num_B,
                                                nonzero_num_A, nonzero_num_B,
                                                csrVal_A, csrColInd_A, csrRowPtr_A,
                                                csrVal_B, csrColInd_B, csrRowPtr_B,
                                                csrVal_C, csrColInd_C, csrRowPtr_C);
    std::cout << nonzero_num_C << std::endl;
    float *cpu_csrVal_C = new float[nonzero_num_C];
    cudaMemcpy(cpu_csrVal_C, csrVal_C, nonzero_num_C * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nonzero_num_C; i++)
        printf("%f ", cpu_csrVal_C[i]);
    std::cout << std::endl;
    int *cpu_csrColInd_C = new int[nonzero_num_C];
    cudaMemcpy(cpu_csrColInd_C, csrColInd_C, nonzero_num_C * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nonzero_num_C; i++)
        printf("%d ", cpu_csrColInd_C[i]);
    std::cout << std::endl;
    // BUGFIX: the row-pointer array has rows_num_A + 1 entries; the original
    // allocated only rows_num_A, so the memcpy and the final loop iteration
    // overflowed the heap buffer.
    int *cpu_csrRowPtr_C = new int[rows_num_A + 1];
    cudaMemcpy(cpu_csrRowPtr_C, csrRowPtr_C, (rows_num_A + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i <= rows_num_A; i++)
        printf("%d ", cpu_csrRowPtr_C[i]);
    std::cout << std::endl;
    // Release device and host memory (the original leaked everything).
    cudaFree(csrVal_A);   cudaFree(csrVal_B);   cudaFree(csrVal_C);
    cudaFree(csrColInd_A); cudaFree(csrColInd_B); cudaFree(csrColInd_C);
    cudaFree(csrRowPtr_A); cudaFree(csrRowPtr_B); cudaFree(csrRowPtr_C);
    delete[] h_csrVal_A;    delete[] h_csrVal_B;
    delete[] h_csrColInd_A; delete[] h_csrColInd_B;
    delete[] h_csrRowPtr_A; delete[] h_csrRowPtr_B;
    delete[] cpu_csrVal_C;  delete[] cpu_csrColInd_C; delete[] cpu_csrRowPtr_C;
    return 0;
}
// CUDA sparse CSR matrix-matrix multiplication via cuSPARSE SpGEMM.