#include <iostream>
#include<malloc.h>
#include "cuda_runtime.h"
#include <cusparse_v2.h>
#include "assert.h"
// Error-checking helpers for CUDA runtime and cuSPARSE status codes.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err_) << std::endl;               \
            assert(false);                                                    \
        }                                                                     \
    } while (0)
#define CHECK_CUSPARSE(call)                                                  \
    do {                                                                      \
        cusparseStatus_t st_ = (call);                                        \
        if (st_ != CUSPARSE_STATUS_SUCCESS) {                                 \
            std::cerr << "cuSPARSE error " << __FILE__ << ":" << __LINE__     \
                      << ": status " << static_cast<int>(st_) << std::endl;   \
            assert(false);                                                    \
        }                                                                     \
    } while (0)

// Computes the sparse-sparse product C = A * B with cuSPARSE SpGEMM.
//
// A is (rows_num_A x cols_num_A) with nonzero_num_A entries, B is
// (rows_num_B x cols_num_B) with nonzero_num_B entries; both are device-side
// CSR arrays with 32-bit, zero-based indices. On return, csrVal_C /
// csrColInd_C / csrRowPtr_C point to freshly cudaMalloc'd device arrays
// holding C in CSR form (caller owns and must cudaFree them).
//
// Only T = float (and T = double) are supported: the cuSPARSE data type is
// derived from sizeof(T) below.
//
// Returns the number of nonzeros in C (narrowed from int64_t; asserts that
// the count fits in an int).
template<typename T>
int csrmatrix_mul_csrmatrix(const int rows_num_A, const int cols_num_A,
const int rows_num_B, const int cols_num_B,
const int nonzero_num_A, const int nonzero_num_B,
T *csrVal_A, int *csrColInd_A, int * csrRowPtr_A,
T *csrVal_B, int *csrColInd_B, int * csrRowPtr_B,
T *&csrVal_C, int *&csrColInd_C, int *&csrRowPtr_C){
    // Inner dimensions must agree for A * B to be defined.
    assert(cols_num_A == rows_num_B);
    const T alpha = 1;
    const T beta = 0;
    // Pick the cuSPARSE value/compute type from the template parameter
    // instead of hard-coding CUDA_R_32F (which is wrong for T = double).
    const cudaDataType valueType =
        (sizeof(T) == sizeof(double)) ? CUDA_R_64F : CUDA_R_32F;
    const cudaDataType computeType = valueType;
    // The row-pointer array of C always has rows_num_A + 1 entries and can be
    // allocated up front; values/column indices are allocated after SpGEMM
    // reports the nonzero count.
    CHECK_CUDA(cudaMalloc((void**) &csrRowPtr_C, (rows_num_A + 1) * sizeof(int)));
    //--------------------------------------------------------------------------
    // CUSPARSE APIs
    cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseHandle_t handle = NULL;
    CHECK_CUSPARSE(cusparseCreate(&handle));
    cusparseSpMatDescr_t matA, matB, matC;
    void* dBuffer1 = NULL, *dBuffer2 = NULL;
    size_t bufferSize1 = 0, bufferSize2 = 0;
    // Create sparse matrices A and B in CSR format; C starts empty and is
    // filled in by cusparseSpGEMM_copy below.
    CHECK_CUSPARSE(cusparseCreateCsr(&matA, rows_num_A, cols_num_A, nonzero_num_A,
                                     csrRowPtr_A, csrColInd_A, csrVal_A,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    CHECK_CUSPARSE(cusparseCreateCsr(&matB, rows_num_B, cols_num_B, nonzero_num_B,
                                     csrRowPtr_B, csrColInd_B, csrVal_B,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    CHECK_CUSPARSE(cusparseCreateCsr(&matC, rows_num_A, cols_num_B, 0, NULL, NULL, NULL,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, valueType));
    //--------------------------------------------------------------------------
    // SpGEMM computation: each phase is called twice, first with a NULL buffer
    // to query the size, then with the allocated buffer to do the work.
    cusparseSpGEMMDescr_t spgemmDesc;
    CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDesc));
    // Ask how many bytes of workspace the estimation phase needs.
    CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                                 &alpha, matA, matB, &beta, matC,
                                                 computeType, CUSPARSE_SPGEMM_DEFAULT,
                                                 spgemmDesc, &bufferSize1, NULL));
    CHECK_CUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    // Inspect A and B to understand the memory requirement for the next step.
    CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                                 &alpha, matA, matB, &beta, matC,
                                                 computeType, CUSPARSE_SPGEMM_DEFAULT,
                                                 spgemmDesc, &bufferSize1, dBuffer1));
    // Ask how many bytes of workspace the compute phase needs.
    CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
                                          &alpha, matA, matB, &beta, matC,
                                          computeType, CUSPARSE_SPGEMM_DEFAULT,
                                          spgemmDesc, &bufferSize2, NULL));
    CHECK_CUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    // Compute the intermediate product of A * B.
    CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
                                          &alpha, matA, matB, &beta, matC,
                                          computeType, CUSPARSE_SPGEMM_DEFAULT,
                                          spgemmDesc, &bufferSize2, dBuffer2));
    // Query the nonzero count of C, then allocate its value/index arrays.
    int64_t C_num_rows1, C_num_cols1, nonzero_num_C;
    CHECK_CUSPARSE(cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1,
                                        &nonzero_num_C));
    CHECK_CUDA(cudaMalloc((void**) &csrColInd_C, nonzero_num_C * sizeof(int)));
    CHECK_CUDA(cudaMalloc((void**) &csrVal_C, nonzero_num_C * sizeof(T)));
    // Point matC at the caller-visible output arrays.
    CHECK_CUSPARSE(cusparseCsrSetPointers(matC, csrRowPtr_C, csrColInd_C, csrVal_C));
    // Copy the computed product into C (beta == 0, so values are overwritten).
    CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, opA, opB,
                                       &alpha, matA, matB, &beta, matC,
                                       computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc));
    // Release temporary workspaces, descriptors, and the library handle —
    // the original code leaked all of these.
    CHECK_CUDA(cudaFree(dBuffer1));
    CHECK_CUDA(cudaFree(dBuffer2));
    CHECK_CUSPARSE(cusparseSpGEMM_destroyDescr(spgemmDesc));
    CHECK_CUSPARSE(cusparseDestroySpMat(matA));
    CHECK_CUSPARSE(cusparseDestroySpMat(matB));
    CHECK_CUSPARSE(cusparseDestroySpMat(matC));
    CHECK_CUSPARSE(cusparseDestroy(handle));
    // Guard against silent narrowing of the 64-bit nonzero count.
    assert(nonzero_num_C <= 2147483647);
    return static_cast<int>(nonzero_num_C);
}
// Small smoke test: multiplies a 2x3 dense-as-CSR matrix A by a 3x3 diagonal
// CSR matrix B = diag(1, 2, 1) and prints C = A * B in CSR form.
int main() {
    int rows_num_A = 2;
    int cols_num_A = 3;          // A is 2x3 (column indices below run 0..2)
    int rows_num_B = 3;
    int cols_num_B = 3;          // B is 3x3
    int nonzero_num_A = 6;
    int nonzero_num_B = 3;
    // Host-side CSR data: A = [[1,2,3],[1,1,1]], B = diag(1,2,1).
    float *h_csrVal_A = new float[nonzero_num_A]{1, 2, 3, 1, 1, 1},
          *h_csrVal_B = new float[nonzero_num_B]{1, 2, 1};
    int *h_csrColInd_A = new int[nonzero_num_A]{0, 1, 2, 0, 1, 2},
        *h_csrColInd_B = new int[nonzero_num_B]{0, 1, 2};
    int *h_csrRowPtr_A = new int[rows_num_A + 1]{0, 3, 6},
        *h_csrRowPtr_B = new int[rows_num_B + 1]{0, 1, 2, 3};
    // Device-side copies of A and B; C's arrays are allocated by the callee.
    float *csrVal_A, *csrVal_B;
    int *csrColInd_A, *csrColInd_B;
    int *csrRowPtr_A, *csrRowPtr_B;
    cudaMalloc(&csrVal_A, nonzero_num_A * sizeof(float));
    cudaMalloc(&csrVal_B, nonzero_num_B * sizeof(float));
    cudaMalloc(&csrColInd_A, nonzero_num_A * sizeof(int));
    cudaMalloc(&csrColInd_B, nonzero_num_B * sizeof(int));
    cudaMalloc(&csrRowPtr_A, (rows_num_A + 1) * sizeof(int));
    cudaMalloc(&csrRowPtr_B, (rows_num_B + 1) * sizeof(int));
    cudaMemcpy(csrVal_A, h_csrVal_A, nonzero_num_A * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrVal_B, h_csrVal_B, nonzero_num_B * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColInd_A, h_csrColInd_A, nonzero_num_A * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColInd_B, h_csrColInd_B, nonzero_num_B * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtr_A, h_csrRowPtr_A, (rows_num_A + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtr_B, h_csrRowPtr_B, (rows_num_B + 1) * sizeof(int), cudaMemcpyHostToDevice);
    float *csrVal_C = nullptr;
    int *csrColInd_C = nullptr, *csrRowPtr_C = nullptr;
    // BUGFIX: the original call passed nonzero_num_A (6) for B's nonzero
    // count; B has nonzero_num_B (3) entries. Dimensions are now passed
    // explicitly as (rows_A, cols_A, rows_B, cols_B).
    int nonzero_num_C = csrmatrix_mul_csrmatrix(rows_num_A, cols_num_A,
                                                rows_num_B, cols_num_B,
                                                nonzero_num_A, nonzero_num_B,
                                                csrVal_A, csrColInd_A, csrRowPtr_A,
                                                csrVal_B, csrColInd_B, csrRowPtr_B,
                                                csrVal_C, csrColInd_C, csrRowPtr_C);
    std::cout << nonzero_num_C << std::endl;
    float *cpu_csrVal_C = new float[nonzero_num_C];
    cudaMemcpy(cpu_csrVal_C, csrVal_C, nonzero_num_C * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nonzero_num_C; i++)
        printf("%f ", cpu_csrVal_C[i]);
    std::cout << std::endl;
    int *cpu_csrColInd_C = new int[nonzero_num_C];
    cudaMemcpy(cpu_csrColInd_C, csrColInd_C, nonzero_num_C * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nonzero_num_C; i++)
        printf("%d ", cpu_csrColInd_C[i]);
    std::cout << std::endl;
    // BUGFIX: the row-pointer array has rows_num_A + 1 entries; the original
    // allocated only rows_num_A, so the memcpy and the final loop iteration
    // overflowed the heap buffer.
    int *cpu_csrRowPtr_C = new int[rows_num_A + 1];
    cudaMemcpy(cpu_csrRowPtr_C, csrRowPtr_C, (rows_num_A + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i <= rows_num_A; i++)
        printf("%d ", cpu_csrRowPtr_C[i]);
    std::cout << std::endl;
    // Release device and host memory (the original leaked everything).
    cudaFree(csrVal_A);   cudaFree(csrVal_B);   cudaFree(csrVal_C);
    cudaFree(csrColInd_A); cudaFree(csrColInd_B); cudaFree(csrColInd_C);
    cudaFree(csrRowPtr_A); cudaFree(csrRowPtr_B); cudaFree(csrRowPtr_C);
    delete[] h_csrVal_A;    delete[] h_csrVal_B;
    delete[] h_csrColInd_A; delete[] h_csrColInd_B;
    delete[] h_csrRowPtr_A; delete[] h_csrRowPtr_B;
    delete[] cpu_csrVal_C;  delete[] cpu_csrColInd_C; delete[] cpu_csrRowPtr_C;
    return 0;
}
// CUDA sparse CSR matrix-matrix multiplication via cuSPARSE SpGEMM.