// cuBLAS learning notes
// cuSPARSE learning notes @zjd
// Compiler: VS2017
// CUDA 14.2
// For personal study/backup only
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"//动态链接库添加cublas.lib
#include <cusparse.h>//动态链接库添加cusparse.lib
// Status-checking helpers: every CUDA / cuBLAS / cuSPARSE call returns a
// status code; unchecked failures otherwise surface silently much later.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                 \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t st_ = (call);                                          \
        if (st_ != CUBLAS_STATUS_SUCCESS) {                                   \
            fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__,      \
                    __LINE__, (int)st_);                                      \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

#define CUSPARSE_CHECK(call)                                                  \
    do {                                                                      \
        cusparseStatus_t st_ = (call);                                        \
        if (st_ != CUSPARSE_STATUS_SUCCESS) {                                 \
            fprintf(stderr, "cuSPARSE error %s:%d: status %d\n", __FILE__,    \
                    __LINE__, (int)st_);                                      \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Solves A*x = b for a 4x4 symmetric positive-definite matrix A stored in
 * CSR format, using the (unpreconditioned) conjugate-gradient method:
 * cuSPARSE performs the sparse matrix-vector product, cuBLAS the vector ops.
 *
 * b = (10, 8, 9, 4), initial guess x0 = 0, hence r0 = p0 = b.
 * Iterates until sum(|r_i|) < 1e-10 or 1000 iterations, then prints the
 * solution followed by the iteration count.
 */
int main()
{
	const int n = 4;    // system size
	const int nnz = 14; // stored non-zeros in A

	// A in CSR form: values, row offsets (n + 1 entries), column indices.
	double csr_val[14] = { 10,-2,2,-2,10,-1,1,2,-1,11,-3,1,-3,6 };
	int    csr_row[5] = { 0,3,7,11,14 };
	int    csr_col[14] = { 0,1,2,0,1,2,3,0,1,2,3,1,2,3 };
	double b[4] = { 10,8,9,4 };    // right-hand side
	double ans_CSR[4] = { 0,0,0,0 }; // host copy of the computed solution

	// SpMV coefficients: y = alpha * A * p + beta * y.  beta = 0 so y is
	// overwritten each product.  alpha is also reused as the constant 1.0
	// for the p-update axpy below.  ("beta" fixes the original "beat" typo.)
	double alpha = 1.0;
	double beta = 0.0;

	// Device storage: matrix, solution x, residual r, search direction p,
	// SpMV output y = A*p.
	double *val_d, *x_d, *r_d, *p_d, *y_d;
	int *row_d, *col_d;
	CUDA_CHECK(cudaMalloc((void**)&val_d, sizeof(double) * nnz));
	CUDA_CHECK(cudaMalloc((void**)&x_d, sizeof(double) * n));
	CUDA_CHECK(cudaMalloc((void**)&r_d, sizeof(double) * n));
	CUDA_CHECK(cudaMalloc((void**)&p_d, sizeof(double) * n));
	CUDA_CHECK(cudaMalloc((void**)&y_d, sizeof(double) * n));
	CUDA_CHECK(cudaMalloc((void**)&row_d, sizeof(int) * (n + 1)));
	CUDA_CHECK(cudaMalloc((void**)&col_d, sizeof(int) * nnz));
	CUDA_CHECK(cudaMemcpy(val_d, csr_val, sizeof(double) * nnz, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(row_d, csr_row, sizeof(int) * (n + 1), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(col_d, csr_col, sizeof(int) * nnz, cudaMemcpyHostToDevice));
	// x0 = 0 ==> r0 = b and p0 = b.
	CUDA_CHECK(cudaMemset(x_d, 0, sizeof(double) * n));
	CUDA_CHECK(cudaMemset(y_d, 0, sizeof(double) * n));
	CUDA_CHECK(cudaMemcpy(r_d, b, sizeof(double) * n, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(p_d, b, sizeof(double) * n, cudaMemcpyHostToDevice));

	cublasHandle_t handle;     // cuBLAS context
	CUBLAS_CHECK(cublasCreate(&handle));

	// cuSPARSE generic API: one CSR matrix descriptor plus dense-vector
	// descriptors created ONCE (the original recreated vecX every loop
	// iteration, leaking a descriptor per iteration).
	cusparseHandle_t handle1 = NULL;
	CUSPARSE_CHECK(cusparseCreate(&handle1));
	cusparseSpMatDescr_t matA;
	cusparseDnVecDescr_t vecP, vecY;
	CUSPARSE_CHECK(cusparseCreateCsr(&matA, n, n, nnz, row_d, col_d, val_d,
		CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
		CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
	CUSPARSE_CHECK(cusparseCreateDnVec(&vecP, n, p_d, CUDA_R_64F));
	CUSPARSE_CHECK(cusparseCreateDnVec(&vecY, n, y_d, CUDA_R_64F));

	// Workspace buffer for SpMV, sized once up front.
	void* dBuffer = NULL;
	size_t bufferSize = 0;
	CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle1, CUSPARSE_OPERATION_NON_TRANSPOSE,
		&alpha, matA, vecP, &beta, vecY,
		CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
	CUDA_CHECK(cudaMalloc(&dBuffer, bufferSize));

	// rr = r.r is carried across iterations, so the previous-residual vector
	// (r_1_d in the original) and its extra dot products are unnecessary.
	double rr;
	CUBLAS_CHECK(cublasDdot(handle, n, r_d, 1, r_d, 1, &rr));
	double break_num, pAp, rr_new, a_k, a_k_negative, b_k;
	int CG_num = 0;
	for (; CG_num < 1000; CG_num++)
	{
		// Convergence test on the CURRENT residual (the original tested the
		// residual from one iteration earlier).
		CUBLAS_CHECK(cublasDasum(handle, n, r_d, 1, &break_num));
		if (break_num < 1e-10)
			break; // converged
		// y = A * p (beta = 0 overwrites y); dot directly against y_d —
		// no device->host->device round trip through a temporary.
		CUSPARSE_CHECK(cusparseSpMV(handle1, CUSPARSE_OPERATION_NON_TRANSPOSE,
			&alpha, matA, vecP, &beta, vecY,
			CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, dBuffer));
		CUBLAS_CHECK(cublasDdot(handle, n, p_d, 1, y_d, 1, &pAp));
		a_k = rr / pAp;             // step length alpha_k = (r.r) / (p.Ap)
		a_k_negative = -a_k;
		CUBLAS_CHECK(cublasDaxpy(handle, n, &a_k, p_d, 1, x_d, 1));          // x += a_k * p
		CUBLAS_CHECK(cublasDaxpy(handle, n, &a_k_negative, y_d, 1, r_d, 1)); // r -= a_k * A*p
		CUBLAS_CHECK(cublasDdot(handle, n, r_d, 1, r_d, 1, &rr_new));
		b_k = rr_new / rr;          // beta_k = (r_new.r_new) / (r_old.r_old)
		rr = rr_new;
		CUBLAS_CHECK(cublasDscal(handle, n, &b_k, p_d, 1));           // p = b_k * p
		CUBLAS_CHECK(cublasDaxpy(handle, n, &alpha, r_d, 1, p_d, 1)); // p += r
	}
	// If the loop ran to completion, CG_num is 1000 (the original wrongly
	// reported 0 iterations in that case).

	CUDA_CHECK(cudaMemcpy(ans_CSR, x_d, sizeof(double) * n, cudaMemcpyDeviceToHost));
	for (int i = 0; i < 4; i++)
	{
		printf("%lf\t", ans_CSR[i]);
	}
	printf("\n迭代%d次", CG_num);

	// Release every resource (the original freed nothing).
	CUSPARSE_CHECK(cusparseDestroyDnVec(vecP));
	CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
	CUSPARSE_CHECK(cusparseDestroySpMat(matA));
	CUSPARSE_CHECK(cusparseDestroy(handle1));
	CUBLAS_CHECK(cublasDestroy(handle));
	CUDA_CHECK(cudaFree(dBuffer));
	CUDA_CHECK(cudaFree(val_d));
	CUDA_CHECK(cudaFree(row_d));
	CUDA_CHECK(cudaFree(col_d));
	CUDA_CHECK(cudaFree(x_d));
	CUDA_CHECK(cudaFree(r_d));
	CUDA_CHECK(cudaFree(p_d));
	CUDA_CHECK(cudaFree(y_d));
	return 0;
}