VS中Cuda使用cublas库进行复数矩阵的乘法

找了很久都没有找到调用 CUDA 库中 cublasCgemmBatched 进行复数矩阵乘法的示例,于是自己尝试写了一个。

1,新建一个cuda项目 

配置相关路径和依赖项(包含目录、库目录以及附加依赖项如下):

 

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\inc
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\include

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib

 

cublas.lib

cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
curand.lib
cusolver.lib
cusparse.lib

粘贴源代码。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <helper_cuda.h>
//#include "batchCUBLAS.h"


#define T_ELEM cuComplex
#define CUBLASTEST_FAILED  1
const char *sSDKname = "batchCUBLAS";



/* Return the larger of the two integers x and y. */
static __inline__ int imax(int x, int y)
{
	if (x < y)
	{
		return y;
	}
	return x;
}
/* Pseudo-random 32-bit generator (George Marsaglia's KISS combination).
 * Deterministic: the four state words are static with fixed seeds, so
 * every run produces the same sequence.  Not thread-safe (shared static
 * state, and the macros below mutate it via side effects). */
static __inline__ unsigned cuRand(void)
{
	/* George Marsaglia's fast inline random number generator */
/* Multiply-with-carry pair (z, w). */
#define CUDA_ZNEW  (cuda_z=36969*(cuda_z&65535)+(cuda_z>>16))
#define CUDA_WNEW  (cuda_w=18000*(cuda_w&65535)+(cuda_w>>16))
#define CUDA_MWC   ((CUDA_ZNEW<<16)+CUDA_WNEW)
/* 3-shift xorshift register. */
#define CUDA_SHR3  (cuda_jsr=cuda_jsr^(cuda_jsr<<17), \
	cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \
	cuda_jsr = cuda_jsr ^ (cuda_jsr << 5))
/* Linear congruential generator. */
#define CUDA_CONG  (cuda_jcong=69069*cuda_jcong+1234567)
/* Combine all three sub-generators into one 32-bit output. */
#define KISS  ((CUDA_MWC^CUDA_CONG)+CUDA_SHR3)
	static unsigned int cuda_z = 362436069, cuda_w = 521288629;
	static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160;
	return KISS;
}
#include <windows.h>
/* Wall-clock time in seconds, Windows-only implementation.
 * Uses the high-resolution performance counter when available, falling
 * back to the millisecond GetTickCount() otherwise.  Intended for
 * elapsed-time measurement (subtract two calls); the absolute value has
 * no defined epoch. */
static __inline__ double second(void)
{
	LARGE_INTEGER t;
	static double oofreq;               /* seconds per counter tick (1/frequency) */
	static int checkedForHighResTimer;  /* zero-initialized, so the probe runs exactly once */
	static BOOL hasHighResTimer;

	/* First call: probe for a high-resolution counter and cache its period.
	 * Not thread-safe; fine for this single-threaded test program. */
	if (!checkedForHighResTimer)
	{
		hasHighResTimer = QueryPerformanceFrequency(&t);
		oofreq = 1.0 / (double)t.QuadPart;
		checkedForHighResTimer = 1;
	}

	if (hasHighResTimer)
	{
		QueryPerformanceCounter(&t);
		return (double)t.QuadPart * oofreq;
	}
	else
	{
		/* Low-resolution fallback: uptime in milliseconds. */
		return (double)GetTickCount() / 1000.0;
	}
}
/* Test options for the batched GEMM run.  In this sample only N is ever
 * assigned/read (in main); m, n, k are carried over from the original SDK
 * sample and left unused here. */
struct gemmOpts
{
	int m;  /* unused in this sample */
	int n;  /* unused in this sample */
	int k;  /* unused in this sample */
	//testMethod test_method;
	char *elem_type;  /* unused in this sample */
	int N;    // number of multiplications
};
/* Parameters passed straight through to cublasCgemmBatched:
 * C = alpha * op(A) * op(B) + beta * C, with op() selected by transa/transb.
 * cuBLAS uses column-major storage, so m/n/k follow the BLAS convention:
 * op(A) is m x k, op(B) is k x n, C is m x n. */
struct gemmTestParams
{
	cublasOperation_t transa;  /* CUBLAS_OP_N / _T / _C applied to A */
	cublasOperation_t transb;  /* CUBLAS_OP_N / _T / _C applied to B */
	int   m;
	int   n;
	int   k;
	T_ELEM alpha;  /* complex scalar multiplying A*B */
	T_ELEM beta;   /* complex scalar multiplying the prior C */
};
/* Release every host and device allocation made in main().
 * Safe to invoke at any failure point ONLY if A/B/C, the three host
 * pointer arrays and the three device pointer arrays were initialized to
 * NULL/zero before the first allocation: each slot is NULL-tested before
 * being freed.  Relies on main's local names (A, B, C, devPtr*, opts). */
#define CLEANUP()                           \
    do {                                        \
        if (A) free (A);                        \
        if (B) free (B);                        \
        if (C) free (C);                        \
        for(int i = 0; i < opts.N; ++i) {       \
            if(devPtrA[i]) cudaFree(devPtrA[i]);\
            if(devPtrB[i]) cudaFree(devPtrB[i]);\
            if(devPtrC[i]) cudaFree(devPtrC[i]);\
        }                                       \
        if (devPtrA) free(devPtrA);             \
        if (devPtrB) free(devPtrB);             \
        if (devPtrC) free(devPtrC);             \
        if (devPtrA_dev) cudaFree(devPtrA_dev); \
        if (devPtrB_dev) cudaFree(devPtrB_dev); \
        if (devPtrC_dev) cudaFree(devPtrC_dev); \
        fflush (stdout);                        \
    } while (0)

/* Fill the rows x cols matrix A (column-major, leading dimension lda)
 * with an easily recognizable debug pattern:
 * real part = row index, imaginary part = column index. */
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
	int row, col;
	for (col = 0; col < cols; ++col)
	{
		for (row = 0; row < rows; ++row)
		{
			T_ELEM *elem = &A[col * lda + row];
			elem->x = (float)row;
			elem->y = (float)col;
		}
	}
}
/* Print the rows x cols matrix A (column-major, leading dimension lda)
 * to stdout as "re+imi" entries, tab-separated.  Note the outer loop
 * walks columns and the inner loop walks rows, so each printed text row
 * is one storage column (i.e. the transpose of the mathematical layout). */
void PrintMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
	printf("========The matrix is  \n");
	int col = 0;
	while (col < cols)
	{
		int row = 0;
		while (row < rows)
		{
			const T_ELEM elem = A[col * lda + row];
			printf("%+.02f%+.02fi\t", elem.x, elem.y);
			++row;
		}
		printf("  \n");
		++col;
	}
}

/* Batched complex GEMM demo: computes C = A * B for opts.N independent
 * 5x5 cuComplex matrix products via cublasCgemmBatched, then downloads
 * and prints the result of the last batch entry.
 *
 * Fixes over the original:
 *  - every cudaMalloc/cudaMemcpy/cuBLAS status is checked, with cleanup on failure;
 *  - host pointer arrays are calloc'ed so CLEANUP() never frees garbage slots;
 *  - C is zeroed before upload (was uploaded uninitialized);
 *  - fillupMatrixDebug for B now receives B's dimensions (was A's);
 *  - PrintMatrixDebug receives (lda, rows, cols) in the right order;
 *  - timing brackets only the GEMM, with a device sync (the launch is async);
 *  - dead cuRand() overwrites of m/n/k removed (results were never used);
 *  - float literals for the cuComplex alpha/beta fields. */
int main(int argc, char *argv[])
{
	printf("%s Starting...\n\n", sSDKname);

	/* Select a CUDA device (helper from helper_cuda.h). */
	int dev = findCudaDevice(argc, (const char **)argv);

	if (dev == -1)
	{
		return CUBLASTEST_FAILED;
	}

	cublasHandle_t handle;

	if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
	{
		fprintf(stdout, "CUBLAS initialization failed!\n");
		exit(EXIT_FAILURE);
	}

	cublasStatus_t status1, status2, status3;
	T_ELEM *A = NULL;
	T_ELEM *B = NULL;
	T_ELEM *C = NULL;
	T_ELEM **devPtrA = 0;
	T_ELEM **devPtrB = 0;
	T_ELEM **devPtrC = 0;
	T_ELEM **devPtrA_dev = NULL;
	T_ELEM **devPtrB_dev = NULL;
	T_ELEM **devPtrC_dev = NULL;

	int matrixM, matrixN, matrixK;
	int rowsA, rowsB, rowsC;
	int colsA, colsB, colsC;
	int matrixSizeA, matrixSizeB, matrixSizeC;
	double start, stop;
	gemmOpts opts;
	opts.N = 20;    /* number of independent matrix products in the batch */
	gemmTestParams params;

	printf("Testing Batch INV Cublas\n");

	/* Problem size: C(m x n) = A(m x k) * B(k x n); all 5 x 5 here. */
	matrixM = 5;
	matrixN = 5;
	matrixK = 5;

	rowsA = imax(1, matrixM);
	colsA = imax(1, matrixK);
	rowsB = imax(1, matrixK);
	colsB = imax(1, matrixN);
	rowsC = imax(1, matrixM);
	colsC = imax(1, matrixN);

	matrixSizeA = rowsA * colsA;
	matrixSizeB = rowsB * colsB;
	matrixSizeC = rowsC * colsC;

	params.transa = CUBLAS_OP_N;
	params.transb = CUBLAS_OP_N;
	params.m = matrixM;
	params.n = matrixN;
	params.k = matrixK;
	params.alpha.x = 1.0f; params.alpha.y = 0.0f;  /* C = 1*A*B + 0*C */
	params.beta.x = 0.0f; params.beta.y = 0.0f;

	/* calloc (not malloc) so CLEANUP() can safely NULL-test slots whose
	 * cudaMalloc never ran if we bail out part-way through. */
	devPtrA = (T_ELEM **)calloc(opts.N, sizeof(T_ELEM *));
	devPtrB = (T_ELEM **)calloc(opts.N, sizeof(*devPtrB));
	devPtrC = (T_ELEM **)calloc(opts.N, sizeof(*devPtrC));

	if (!devPtrA || !devPtrB || !devPtrC)
	{
		fprintf(stderr, "!!!! host pointer-array allocation failed\n");
		CLEANUP();
		cublasDestroy(handle);
		return CUBLASTEST_FAILED;
	}

	/* One device buffer per batch entry for A, B and C. */
	for (int i = 0; i < opts.N; i++)
	{
		cudaError_t err1 = cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(T_ELEM));
		cudaError_t err2 = cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0]));
		cudaError_t err3 = cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0]));

		if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
		{
			fprintf(stderr, "!!!! GPU matrix allocation failed (batch %d)\n", i);
			CLEANUP();
			cublasDestroy(handle);
			return CUBLASTEST_FAILED;
		}
	}

	/* For batched processing the arrays of device pointers themselves
	 * must live in device memory. */
	cudaError_t err1 = cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA));
	cudaError_t err2 = cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB));
	cudaError_t err3 = cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC));

	if (err1 == cudaSuccess && err2 == cudaSuccess && err3 == cudaSuccess)
	{
		err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice);
		err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice);
		err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice);
	}

	if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
	{
		fprintf(stderr, "!!!! device pointer-array setup failed\n");
		CLEANUP();
		cublasDestroy(handle);
		return CUBLASTEST_FAILED;
	}

	A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
	B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
	C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));

	if (!A || !B || !C)
	{
		fprintf(stderr, "!!!! host matrix allocation failed\n");
		CLEANUP();
		cublasDestroy(handle);
		return CUBLASTEST_FAILED;
	}

	printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);

	/* Debug pattern: real = row index, imag = column index.  The 0xFF
	 * pre-fill makes any element the fill loop misses stand out. */
	memset(A, 0xFF, matrixSizeA * sizeof(A[0]));
	fillupMatrixDebug(A, rowsA, rowsA, colsA);
	memset(B, 0xFF, matrixSizeB * sizeof(B[0]));
	fillupMatrixDebug(B, rowsB, rowsB, colsB);   /* fix: use B's dimensions */
	memset(C, 0, matrixSizeC * sizeof(C[0]));    /* fix: C was uploaded uninitialized */

	/* Every batch entry gets the same A, B and (zeroed) C. */
	for (int i = 0; i < opts.N; i++)
	{
		status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
		status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
		status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC);

		if (status1 != CUBLAS_STATUS_SUCCESS ||
			status2 != CUBLAS_STATUS_SUCCESS ||
			status3 != CUBLAS_STATUS_SUCCESS)
		{
			fprintf(stderr, "!!!! cublasSetMatrix failed (batch %d)\n", i);
			CLEANUP();
			cublasDestroy(handle);
			return CUBLASTEST_FAILED;
		}
	}

	start = second();

	status1 = cublasCgemmBatched(handle, params.transa, params.transb, params.m, params.n,
		params.k, &params.alpha, (const T_ELEM **)devPtrA_dev, rowsA,
		(const T_ELEM **)devPtrB_dev, rowsB, &params.beta, devPtrC_dev, rowsC, opts.N);

	/* The launch is asynchronous; wait for it so the timing is meaningful. */
	cudaDeviceSynchronize();
	stop = second();

	if (status1 != CUBLAS_STATUS_SUCCESS)
	{
		fprintf(stderr, "!!!! cublasCgemmBatched failed (status %d)\n", (int)status1);
		CLEANUP();
		cublasDestroy(handle);
		return CUBLASTEST_FAILED;
	}

	/* Download the results.  Every entry computed the same product, so C
	 * ends up holding the result of the last batch entry. */
	for (int i = 0; i < opts.N; i++)
	{
		status3 = cublasGetMatrix(rowsC, colsC, sizeof(C[0]), devPtrC[i], rowsC, C, rowsC);

		if (status3 != CUBLAS_STATUS_SUCCESS)
		{
			fprintf(stderr, "!!!! cublasGetMatrix failed (batch %d)\n", i);
			CLEANUP();
			cublasDestroy(handle);
			return CUBLASTEST_FAILED;
		}
	}

	/* fix: pass (lda, rows, cols) in the order PrintMatrixDebug expects. */
	PrintMatrixDebug(A, rowsA, rowsA, colsA);
	PrintMatrixDebug(B, rowsB, rowsB, colsB);
	PrintMatrixDebug(C, rowsC, rowsC, colsC);

	fprintf(stdout, "^^^^ elapsed = %10.8f sec  \n", (stop - start));

	CLEANUP();
	cublasDestroy(handle);
	printf("\nTest Summary\n");
	system("pause");
	return 0;

}

输出结果,验证乘法结果是正确的。

 

  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
以下是使用CUBLAS进行批处理矩阵乘法的示例代码: ```c #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include <cublas_v2.h> #define N 3 // 矩阵大小 #define BATCH_SIZE 2 // 批处理大小 void printMatrix(float* matrix, int rows, int cols) { for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { printf("%f ", matrix[i * cols + j]); } printf("\n"); } printf("\n"); } int main() { cublasHandle_t handle; cublasCreate(&handle); float* h_A[BATCH_SIZE]; // 批处理输入矩阵A float* h_B[BATCH_SIZE]; // 批处理输入矩阵B float* h_C[BATCH_SIZE]; // 批处理输出矩阵C float* d_A[BATCH_SIZE]; // GPU上的输入矩阵A float* d_B[BATCH_SIZE]; // GPU上的输入矩阵B float* d_C[BATCH_SIZE]; // GPU上的输出矩阵C // 为批处理矩阵分配内存 for (int i = 0; i < BATCH_SIZE; i++) { h_A[i] = (float*)malloc(N * N * sizeof(float)); h_B[i] = (float*)malloc(N * N * sizeof(float)); h_C[i] = (float*)malloc(N * N * sizeof(float)); cudaMalloc((void**)&d_A[i], N * N * sizeof(float)); cudaMalloc((void**)&d_B[i], N * N * sizeof(float)); cudaMalloc((void**)&d_C[i], N * N * sizeof(float)); } // 初始化输入矩阵A和B for (int i = 0; i < BATCH_SIZE; i++) { for (int j = 0; j < N * N; j++) { h_A[i][j] = i + j; h_B[i][j] = i - j; } } // 将输入矩阵A和B从主机内存复制到GPU内存 for (int i = 0; i < BATCH_SIZE; i++) { cudaMemcpy(d_A[i], h_A[i], N * N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_B[i], h_B[i], N * N * sizeof(float), cudaMemcpyHostToDevice); } const float alpha = 1.0f; const float beta = 0.0f; // 执行批处理矩阵乘法 cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, (const float**)d_A, N, (const float**)d_B, N, &beta, d_C, N, BATCH_SIZE); // 将输出矩阵C从GPU内存复制到主机内存 for (int i = 0; i < BATCH_SIZE; i++) { cudaMemcpy(h_C[i], d_C[i], N * N * sizeof(float), cudaMemcpyDeviceToHost); } // 打印输入矩阵A和B printf("Matrix A:\n"); for (int i = 0; i < BATCH_SIZE; i++) { printf("Batch %d:\n", i); printMatrix(h_A[i], N, N); } printf("Matrix B:\n"); for (int i = 0; i < BATCH_SIZE; i++) { printf("Batch %d:\n", i); printMatrix(h_B[i], N, N); } // 打印输出矩阵C printf("Matrix C:\n"); for (int i = 0; i < 
BATCH_SIZE; i++) { printf("Batch %d:\n", i); printMatrix(h_C[i], N, N); } // 释放内存 for (int i = 0; i < BATCH_SIZE; i++) { free(h_A[i]); free(h_B[i]); free(h_C[i]); cudaFree(d_A[i]); cudaFree(d_B[i]); cudaFree(d_C[i]); } cublasDestroy(handle); return 0; } ``` 这个示例代码演示了如何使用CUBLAS进行批处理矩阵乘法。它创建了两个批处理的输入矩阵A和B,并分配了相应的GPU内存。然后,它将输入矩阵从主机内存复制到GPU内存,并使用`cublasSgemmBatched`函数执行批处理矩阵乘法。最后,它将输出矩阵C从GPU内存复制到主机内存,并打印输入矩阵A和B以及输出矩阵C的内容。请确保您已正确安装CUDACUBLAS,并将代码与适当的编译器和进行编译和链接。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值