有关示例代码,请参见下面的两个示例:示例1“使用C和cuBLAS的应用程序:基于1的索引”与示例2“使用C和cuBLAS的应用程序:基于0的索引”。它们展示了以C编写、分别采用两种索引方式调用cuBLAS库API的应用程序。
//Example 1. Application Using C and cuBLAS: 1-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    /* Starting at element (p,q) (1-based, column-major), scale the rest of
       row p by alpha and the rest of column q by beta. */
    float *start = &m[IDX2F(p,q,ldm)];
    /* Row elements are ldm apart in column-major storage. */
    cublasSscal (handle, n-q+1, &alpha, start, ldm);
    /* Column elements are contiguous. */
    cublasSscal (handle, ldm-p+1, &beta, start, 1);
}
int main (void){
    /* Upload a column-major M x N matrix to the GPU, scale part of row 2 and
       part of column 3 via modify(), download and print the result.
       Indices are 1-based throughout via the IDX2F macro. */
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int i, j;
    float* devPtrA;
    float* a = 0;
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        return EXIT_FAILURE;
    }
    /* Element (i,j) is initialized to (i-1)*N + j. */
    for (j = 1; j <= N; j++) {
        for (i = 1; i <= M; i++) {
            a[IDX2F(i,j,M)] = (float)((i-1) * N + j);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        cudaFree (devPtrA);            /* fix: don't leak device memory */
        free (a);
        return EXIT_FAILURE;
    }
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    /* Scale row 2 (from column 3 on) by 16 and column 3 (from row 2 on) by 12. */
    modify (handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    cudaFree (devPtrA);
    cublasDestroy(handle);
    /* Print the modified matrix in row order. */
    for (j = 1; j <= N; j++) {
        for (i = 1; i <= M; i++) {
            printf ("%7.0f", a[IDX2F(i,j,M)]);
        }
        printf ("\n");
    }
    free(a);
    return EXIT_SUCCESS;
}
//Example 2. Application Using C and cuBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    /* Starting at element (p,q) (0-based, column-major), scale the rest of
       row p by alpha and the rest of column q by beta. */
    float *start = &m[IDX2C(p,q,ldm)];
    /* Row elements are ldm apart in column-major storage. */
    cublasSscal (handle, n-q, &alpha, start, ldm);
    /* Column elements are contiguous. */
    cublasSscal (handle, ldm-p, &beta, start, 1);
}
int main (void){
    /* Same demo as example 1, but using 0-based indexing via IDX2C:
       upload a column-major M x N matrix, scale part of row 1 and part of
       column 2 via modify(), download and print the result. */
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int i, j;
    float* devPtrA;
    float* a = 0;
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        return EXIT_FAILURE;
    }
    /* Element (i,j) is initialized to i*N + j + 1. */
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * N + j + 1);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        cudaFree (devPtrA);            /* fix: don't leak device memory */
        free (a);
        return EXIT_FAILURE;
    }
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    /* Scale row 1 (from column 2 on) by 16 and column 2 (from row 1 on) by 12. */
    modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        free (a);                      /* fix: don't leak the host buffer */
        return EXIT_FAILURE;
    }
    cudaFree (devPtrA);
    cublasDestroy(handle);
    /* Print the modified matrix in row order. */
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    free(a);
    return EXIT_SUCCESS;
}
cublasSetMatrix()
cublasStatus_t
cublasSetMatrix(int rows, int cols, int elemSize,
const void *A, int lda, void *B, int ldb)
此函数支持64位整数接口。
此函数将 rows x cols 个元素从主机内存中的矩阵A复制到GPU内存中的矩阵B。假设每个元素需要elemSize字节存储,并且两个矩阵都以列主序格式存储;源矩阵A和目标矩阵B的前导维度分别由lda和ldb给出。前导维度表示所分配矩阵的行数,即使只使用其子矩阵。
Return Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_INVALID_VALUE | the parameters rows, cols < 0 or elemSize, lda, ldb <= 0 |
CUBLAS_STATUS_MAPPING_ERROR | there was an error accessing GPU memory |
cublasGetMatrix()
cublasStatus_t
cublasGetMatrix(int rows, int cols, int elemSize,
const void *A, int lda, void *B, int ldb)
此函数支持64位整数接口。
此函数将 rows x cols 个元素的矩阵从GPU内存空间中的矩阵A复制到主机内存空间中的矩阵B。假设每个元素需要elemSize字节存储,并且两个矩阵都以列主序格式存储;源矩阵A和目标矩阵B的前导维度分别由lda和ldb给出。前导维度表示所分配矩阵的行数,即使只使用其子矩阵。
Return Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_INVALID_VALUE | the parameters rows, cols < 0 or elemSize, lda, ldb <= 0 |
CUBLAS_STATUS_MAPPING_ERROR | there was an error accessing GPU memory |
cublasSscal()
cublasSscal(cublasHandle_t handle, int n,const float *alpha,
float *x, int incx)
对向量x进行缩放:从x的首元素开始,每隔incx个存储位置取一个元素,共取n个,将每个取到的元素乘以alpha。
cublas&lt;t&gt;gemm()
https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <iostream>
#include <stdlib.h>
using namespace std;
int const A_ROW = 5;
int const A_COL = 6;
int const B_ROW = 6;
int const B_COL = 7;
int main(){
cublasStatus_t status;
float *h_A;
float *h_B;
float *h_C;
// 进行对应的内存分配
h_A = (float*)malloc(sizeof(float)*A_ROW*A_COL); //在内存中开辟空间
h_B = (float*)malloc(sizeof(float)*B_ROW*B_COL);
h_C = (float*)malloc(sizeof(float)*A_ROW*B_COL);
// 为待运算矩阵的元素赋予 0-10 范围内的随机数
for (int i=0; i<A_ROW*A_COL; i++) {
h_A[i] = (float)(rand()%10+1);
}
for(int i=0;i<B_ROW*B_COL; i++) {
h_B[i] = (float)(rand()%10+1);
}
// 打印待测试的矩阵
cout << "矩阵 A :" << endl;
for (int i=0; i<A_ROW*A_COL; i++){
cout << h_A[i] << " \t";
if ((i+1)%A_COL == 0) cout << endl;
}
cout << endl;
cout << "矩阵 B :" << endl;
for (int i=0; i<B_ROW*B_COL; i++){
cout << h_B[i] << "\t ";
if ((i+1)%B_COL == 0) cout << endl;
}
cout << endl;
// 存储于显卡中
float *d_A;
float *d_B;
float *d_C;
cudaMalloc((void**)&d_A,sizeof(float)*A_ROW*A_COL); //在显存中开辟空间
cudaMalloc((void**)&d_B,sizeof(float)*B_ROW*B_COL);
cudaMalloc((void**)&d_C,sizeof(float)*A_ROW*B_COL);
// 内存中数据进行转移
cublasHandle_t handle;
cublasCreate(&handle);
cudaMemcpy(d_A,h_A,sizeof(float)*A_ROW*A_COL,cudaMemcpyHostToDevice); //数据从内存拷贝到显存
cudaMemcpy(d_B,h_B,sizeof(float)*B_ROW*B_COL,cudaMemcpyHostToDevice);
float a = 1, b = 0;
cublasSgemm(
handle,
CUBLAS_OP_T, //矩阵A的属性参数,转置,按行优先
CUBLAS_OP_T, //矩阵B的属性参数,转置,按行优先
A_ROW, //矩阵A、C的行数
B_COL, //矩阵B、C的列数
A_COL, //A的列数,B的行数,此处也可为B_ROW,一样的
&a, //alpha的值
d_A, //左矩阵,为A
A_COL, //A的leading dimension,此时选择转置,按行优先,则leading dimension为A的列数
d_B, //右矩阵,为B
B_COL, //B的leading dimension,此时选择转置,按行优先,则leading dimension为B的列数
&b, //beta的值
d_C, //结果矩阵C
A_ROW //C的leading dimension,C矩阵一定按列优先,则leading dimension为C的行数
);
//此时得到的结果便是C=AB,但由于C是按列优先,故此时得到的C应该是正确结果的转置
std::cout << "计算结果的转置 ( (A*B)的转置 ):" << std::endl;
cudaMemcpy(h_C,d_C,sizeof(float)*A_ROW*B_COL,cudaMemcpyDeviceToHost);
for(int i=0;i<A_ROW*B_COL;++i) {
std::cout<<h_C[i]<<" \t";
if((i+1)%B_COL==0) std::cout<<std::endl;
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
free(h_A);
free(h_B);
free(h_C);
return 0;
}