I searched for a long time but could not find a worked example of complex matrix multiplication using cublasCgemmBatched from the CUDA cuBLAS library, so I wrote one myself.
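For reference, the batched complex GEMM entry point looks roughly like this (the exact const qualifiers on the pointer-array parameters vary slightly between CUDA versions). For each i from 0 to batchCount-1 it computes C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], with all matrices stored column-major:

cublasStatus_t cublasCgemmBatched(cublasHandle_t handle,
                                  cublasOperation_t transa, cublasOperation_t transb,
                                  int m, int n, int k,
                                  const cuComplex *alpha,
                                  const cuComplex *Aarray[], int lda,
                                  const cuComplex *Barray[], int ldb,
                                  const cuComplex *beta,
                                  cuComplex *Carray[], int ldc,
                                  int batchCount);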
1. Create a new CUDA project.
Configure the include paths, library paths, and linker dependencies:
Include directories:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\inc
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\include
Library directories:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib
Additional linker dependencies:
cublas.lib
cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
curand.lib
cusolver.lib
cusparse.lib
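Strictly speaking, this sample only needs cublas.lib and the CUDA runtime; the other libraries are listed for convenience. Outside Visual Studio, a roughly equivalent command-line build (the source filename batchCgemm.cu is just a placeholder) would be:

nvcc -I"C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\inc" batchCgemm.cu -lcublas -o batchCgemm.exe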
2. Paste in the source code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
//#include "batchCUBLAS.h"
#define T_ELEM cuComplex
#define CUBLASTEST_FAILED 1
const char *sSDKname = "batchCUBLAS";
static __inline__ int imax(int x, int y)
{
return (x > y) ? x : y;
}
static __inline__ unsigned cuRand(void)
{
/* George Marsaglia's fast inline random number generator */
#define CUDA_ZNEW (cuda_z=36969*(cuda_z&65535)+(cuda_z>>16))
#define CUDA_WNEW (cuda_w=18000*(cuda_w&65535)+(cuda_w>>16))
#define CUDA_MWC ((CUDA_ZNEW<<16)+CUDA_WNEW)
#define CUDA_SHR3 (cuda_jsr=cuda_jsr^(cuda_jsr<<17), \
cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \
cuda_jsr = cuda_jsr ^ (cuda_jsr << 5))
#define CUDA_CONG (cuda_jcong=69069*cuda_jcong+1234567)
#define KISS ((CUDA_MWC^CUDA_CONG)+CUDA_SHR3)
static unsigned int cuda_z = 362436069, cuda_w = 521288629;
static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160;
return KISS;
}
#include <windows.h> // Windows-only: provides the QueryPerformance* timer used by second()
static __inline__ double second(void)
{
LARGE_INTEGER t;
static double oofreq;
static int checkedForHighResTimer;
static BOOL hasHighResTimer;
if (!checkedForHighResTimer)
{
hasHighResTimer = QueryPerformanceFrequency(&t);
oofreq = 1.0 / (double)t.QuadPart;
checkedForHighResTimer = 1;
}
if (hasHighResTimer)
{
QueryPerformanceCounter(&t);
return (double)t.QuadPart * oofreq;
}
else
{
return (double)GetTickCount() / 1000.0;
}
}
struct gemmOpts
{
int m;
int n;
int k;
//testMethod test_method;
char *elem_type;
int N; // number of multiplications
};
struct gemmTestParams
{
cublasOperation_t transa;
cublasOperation_t transb;
int m;
int n;
int k;
T_ELEM alpha;
T_ELEM beta;
};
#define CLEANUP() \
do { \
if (A) free (A); \
if (B) free (B); \
if (C) free (C); \
for(int i = 0; i < opts.N; ++i) { \
if(devPtrA[i]) cudaFree(devPtrA[i]);\
if(devPtrB[i]) cudaFree(devPtrB[i]);\
if(devPtrC[i]) cudaFree(devPtrC[i]);\
} \
if (devPtrA) free(devPtrA); \
if (devPtrB) free(devPtrB); \
if (devPtrC) free(devPtrC); \
if (devPtrA_dev) cudaFree(devPtrA_dev); \
if (devPtrB_dev) cudaFree(devPtrB_dev); \
if (devPtrC_dev) cudaFree(devPtrC_dev); \
fflush (stdout); \
} while (0)
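/* CLEANUP releases everything in reverse order of allocation: the host
   copies A/B/C, each of the opts.N per-batch device matrices, the host
   pointer arrays, and finally the device-side pointer arrays. */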
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
for (int j = 0; j < cols; j++)
{
for (int i = 0; i < rows; i++)
{
A[i + lda*j].x = (i + 0);
A[i + lda*j].y = (j + 0);
}
}
}
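/* Note the A[i + lda*j] indexing above: cuBLAS matrices are column-major,
   so element (row i, column j) sits at offset i + lda*j, where lda is the
   leading dimension (the allocated number of rows). */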
void PrintMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
printf("========The matrix is \n");
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
printf("%+.02f%+.02fi\t", A[i + lda*j].x, A[i + lda*j].y);
}
printf(" \n");
}
}
int main(int argc, char *argv[])
{
printf("%s Starting...\n\n", sSDKname);
int dev = findCudaDevice(argc, (const char **)argv);
if (dev == -1)
{
return CUBLASTEST_FAILED;
}
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
cublasStatus_t status1, status2, status3;
T_ELEM *A = NULL;
T_ELEM *B = NULL;
T_ELEM *C = NULL;
T_ELEM **devPtrA = 0;
T_ELEM **devPtrB = 0;
T_ELEM **devPtrC = 0;
T_ELEM **devPtrA_dev = NULL;
T_ELEM **devPtrB_dev = NULL;
T_ELEM **devPtrC_dev = NULL;
int matrixM, matrixN, matrixK;
int rowsA, rowsB, rowsC;
int colsA, colsB, colsC;
int m, n, k;
int matrixSizeA, matrixSizeB, matrixSizeC;
int errors;
double start, stop;
gemmOpts opts;
opts.N = 20;
gemmTestParams params;
printf("Testing Batch INV Cublas\n");
matrixM = 5;
matrixN = 5;
matrixK = 5;
rowsA = imax(1, matrixM);
colsA = imax(1, matrixK);
rowsB = imax(1, matrixK);
colsB = imax(1, matrixN);
rowsC = imax(1, matrixM);
colsC = imax(1, matrixN);
matrixSizeA = rowsA * colsA;
matrixSizeB = rowsB * colsB;
matrixSizeC = rowsC * colsC;
params.transa = CUBLAS_OP_N;
params.transb = CUBLAS_OP_N;
m = matrixM;
n = matrixN;
k = matrixK;
params.m = m;
params.n = n;
params.k = k;
params.alpha.x = 1.0; params.alpha.y = 0.0;
params.beta.x = 0.0; params.beta.y = 0.0;
devPtrA = (T_ELEM **)malloc(opts.N * sizeof(T_ELEM *));
devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB));
devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC));
for (int i = 0; i < opts.N; i++)
{
checkCudaErrors(cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(T_ELEM)));
checkCudaErrors(cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(T_ELEM)));
checkCudaErrors(cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(T_ELEM)));
}
// For batched processing we need those arrays on the device
checkCudaErrors(cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA)));
checkCudaErrors(cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB)));
checkCudaErrors(cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC)));
checkCudaErrors(cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice));
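/* cublasCgemmBatched requires its Aarray/Barray/Carray arguments to be
   device-resident arrays of device pointers, hence the two-step setup:
   the per-matrix pointers are collected on the host and then copied into
   devPtrA_dev/devPtrB_dev/devPtrC_dev on the device. */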
A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));
memset(C, 0, matrixSizeC * sizeof(C[0])); /* C is only read when beta != 0, but avoid uploading uninitialized memory */
printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);
memset(A, 0xFF, matrixSizeA * sizeof(A[0]));
fillupMatrixDebug(A, rowsA, rowsA, colsA);
memset(B, 0xFF, matrixSizeB * sizeof(B[0]));
fillupMatrixDebug(B, rowsB, rowsB, colsB);
for (int i = 0; i < opts.N; i++)
{
status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC);
}
start = second();
status1 = cublasCgemmBatched(handle, params.transa, params.transb, params.m, params.n,
params.k, &params.alpha, (const T_ELEM **)devPtrA_dev, rowsA,
(const T_ELEM **)devPtrB_dev, rowsB, &params.beta, devPtrC_dev, rowsC, opts.N);
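/* With alpha = 1+0i and beta = 0+0i, this computes C[i] = A[i] * B[i] for
   all opts.N batch entries in a single call. */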
//cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice);
for (int i = 0; i < opts.N; i++)
{
//status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
//status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
status3 = cublasGetMatrix(rowsC, colsC, sizeof(C[0]), devPtrC[i], rowsC, C, rowsC);
}
PrintMatrixDebug(A, rowsA, rowsA, colsA);
PrintMatrixDebug(B, rowsB, rowsB, colsB);
PrintMatrixDebug(C, rowsC, rowsC, colsC);
stop = second();
/* Note: this interval includes the cublasGetMatrix copies and the printing,
   not just the GEMM; cuBLAS launches asynchronously, so timing the kernel
   alone would require a cudaDeviceSynchronize() right after the call. */
fprintf(stdout, "^^^^ elapsed = %10.8f sec \n", (stop - start));
CLEANUP();
cublasDestroy(handle);
printf("\nTest Summary\n");
system("pause");
return 0;
}
Run the program and inspect the printed matrices: the product of A and B matches C, confirming that the complex multiplication is correct.
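For a programmatic check instead of reading the printout by eye, a small host-side reference GEMM can be compared against the C array copied back from the device. A minimal sketch (checkAgainstCPU is a hypothetical helper, not part of the program above; it assumes column-major storage with alpha = 1 and beta = 0):

/* Hypothetical host-side check: recompute C = A * B in plain C and compare
   element-wise against the result returned by the GPU. */
static int checkAgainstCPU(const cuComplex *A, const cuComplex *B, const cuComplex *C,
                           int m, int n, int k, int lda, int ldb, int ldc)
{
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            float re = 0.0f, im = 0.0f;
            for (int p = 0; p < k; p++) {
                cuComplex a = A[i + lda * p]; /* element (i, p) of A */
                cuComplex b = B[p + ldb * j]; /* element (p, j) of B */
                re += a.x * b.x - a.y * b.y;  /* complex multiply-accumulate */
                im += a.x * b.y + a.y * b.x;
            }
            if (fabsf(re - C[i + ldc * j].x) > 1e-3f ||
                fabsf(im - C[i + ldc * j].y) > 1e-3f)
                return 0; /* mismatch found */
        }
    }
    return 1; /* all elements agree within tolerance */
}

Called as checkAgainstCPU(A, B, C, matrixM, matrixN, matrixK, rowsA, rowsB, rowsC) right after the cublasGetMatrix loop, it should return 1.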