环境变量配置
vim ~/.bashrc
在文件最后一行添加头文件cublas_v2.h所在的目录:
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/cuda-11.0/targets/x86_64-linux/include/
因为在编译过程中还需要用到nvcc,所以在bashrc中还需添加:
export PATH=/usr/local/cuda-11.0/bin:$PATH
(临时配置)
最后
source ~/.bashrc
代码
cuBLAS程序的编译过程与常规C文件不一样。本文给出的是用cuBLAS的cublasDgemm()实现double类型矩阵相乘的完整Demo。
main.c:
#include <stdio.h>
extern int func(void);
/*
 * Entry point of the demo: prints a banner, then runs the cuBLAS DGEMM
 * demo implemented by func() (defined in kernel.cu with C linkage).
 *
 * Returns the value returned by func() as the process exit status, so a
 * failure inside the demo is visible to the shell.
 */
int main(void) {
printf("Running Demo \n");
return func();
}
kernel.cu:
#include <stdio.h>
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <ctime>
#include<sys/time.h>
using namespace std;
// Fills data[0..count) with pseudo-random values in [0.001, 10.000]
// (multiples of 0.001), drawn from rand().
static void fillRandom(double *data, size_t count)
{
    for (size_t i = 0; i < count; i++)
        data[i] = (rand() % 10000 + 1) / 1000.0;
}

// Prints a row-major rows x cols matrix to stdout, one matrix row per line.
static void printMatrix(const double *data, int rows, int cols)
{
    const size_t total = (size_t)rows * cols;
    for (size_t i = 0; i < total; i++)
    {
        std::cout << data[i] << " ";
        if ((i + 1) % cols == 0)
            std::cout << std::endl;
    }
}

// Returns true when err is cudaSuccess; otherwise reports the failure on stderr.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "func: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Returns true when st is CUBLAS_STATUS_SUCCESS; otherwise reports it on stderr.
static bool cublasOk(cublasStatus_t st, const char *what)
{
    if (st == CUBLAS_STATUS_SUCCESS)
        return true;
    fprintf(stderr, "func: %s failed with cuBLAS status %d\n", what, (int)st);
    return false;
}

/*
 * Demo: multiplies two random double-precision matrices on the GPU with
 * cublasDgemm and prints the inputs, the product and the elapsed time.
 *
 *   A is M x N, B is N x K, C = A * B is M x K, all stored row-major on the host.
 *
 * Returns 0 on success and 1 if any allocation, copy or cuBLAS call fails
 * (a diagnostic is written to stderr). All host/device buffers and the
 * cuBLAS handle are released on every path.
 */
extern "C" int func()
{
    srand(time(0));

    // Matrix dimensions. For a large benchmark, e.g. M = 20480, N = 64000,
    // K = 20480 can be used instead.
    const int M = 2; // rows of A, rows of C
    const int N = 3; // columns of A, rows of B
    const int K = 4; // columns of B, columns of C

    // Element and byte counts are computed in size_t so large dimensions
    // do not overflow int arithmetic.
    const size_t countA = (size_t)M * N;
    const size_t countB = (size_t)N * K;
    const size_t countC = (size_t)M * K;
    const size_t bytesA = countA * sizeof(double);
    const size_t bytesB = countB * sizeof(double);
    const size_t bytesC = countC * sizeof(double);

    double *h_A = NULL, *h_B = NULL, *h_C = NULL;
    double *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cublasHandle_t handle;
    bool haveHandle = false;
    struct timeval start, end;
    const double alpha = 1;
    const double beta = 0;
    int status = 1; // set to 0 only after everything succeeded

    // Single-pass loop: `break` jumps to the common cleanup below.
    do
    {
        h_A = (double*)malloc(bytesA);
        h_B = (double*)malloc(bytesB);
        h_C = (double*)malloc(bytesC);
        if (h_A == NULL || h_B == NULL || h_C == NULL)
        {
            fprintf(stderr, "func: host memory allocation failed\n");
            break;
        }

        std::cout.precision(10);

        printf("矩阵A为:\n");
        fillRandom(h_A, countA);
        printMatrix(h_A, M, N);
        std::cout << std::endl;

        printf("矩阵B为:\n");
        fillRandom(h_B, countB);
        printMatrix(h_B, N, K);
        std::cout << std::endl;

        if (!cudaOk(cudaMalloc((void**)&d_A, bytesA), "cudaMalloc(d_A)")) break;
        if (!cudaOk(cudaMalloc((void**)&d_B, bytesB), "cudaMalloc(d_B)")) break;
        if (!cudaOk(cudaMalloc((void**)&d_C, bytesC), "cudaMalloc(d_C)")) break;

        if (!cudaOk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A, H2D)")) break;
        if (!cudaOk(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B, H2D)")) break;

        // Create the handle BEFORE starting the clock: handle creation is
        // one-time setup, not part of the matrix multiplication.
        if (!cublasOk(cublasCreate(&handle), "cublasCreate")) break;
        haveHandle = true;

        gettimeofday(&start, NULL);

        // cuBLAS is column-major while our buffers are row-major. A row-major
        // X is the column-major X^T, so computing C^T = B^T * A^T in
        // column-major terms yields the row-major C = A * B. Hence B is
        // passed first and the dimensions are given as (K, M, N).
        const cublasStatus_t gemmStatus = cublasDgemm(handle,
                                                      CUBLAS_OP_N,
                                                      CUBLAS_OP_N,
                                                      K,      // columns of B
                                                      M,      // rows of A
                                                      N,      // columns of A
                                                      &alpha,
                                                      d_B,
                                                      K,
                                                      d_A,
                                                      N,
                                                      &beta,
                                                      d_C,
                                                      K);

        // The GEMM is launched asynchronously; wait for it to finish so the
        // end timestamp reflects the computation, not just the launch.
        const cudaError_t syncStatus = cudaDeviceSynchronize();
        gettimeofday(&end, NULL);

        if (!cublasOk(gemmStatus, "cublasDgemm")) break;
        if (!cudaOk(syncStatus, "cudaDeviceSynchronize")) break;

        if (!cudaOk(cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(C, D2H)")) break;

        printf("结果矩阵C为:\n");
        printMatrix(h_C, M, K);
        printf("\n");

        printf("dgemm计算时间为:");
        long timeuse = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec;
        printf("time=%fsec\n", timeuse / 1000000.0);
        printf("\n");

        status = 0;
    } while (0);

    // Common cleanup; cudaFree(NULL) and free(NULL) are no-ops.
    if (haveHandle)
        cublasDestroy(handle);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return status;
} /* end func */
编译过程
1 nvcc -c kernel.cu
2 ar cr libkernel.a kernel.o
3 gcc main.c -o main -L/usr/local/cuda-11.0/targets/x86_64-linux/lib libkernel.a -lcublas -lcudart -lstdc++
4 ./main
结果
Running Demo
矩阵A为:
7.592 4.1 9.99
6.567 1.327 9.611
矩阵B为:
6.232 0.844 7.125 4.542
2.324 9.89 2.77 2.718
8.69 2.795 2.448 7.86
结果矩阵C为:
143.654844 74.878698 89.90552 124.148064
127.529082 45.529323 73.993393 108.97656
dgemm计算时间为:time=0.406958sec