使用CUBLAS的一些小例子

1. 矩阵相乘再加 C = a*A*B + b*C

#include "cuda_runtime.h"
#include "cublas_v2.h"

#include <time.h>
#include <iostream>

using namespace std;

int const M = 6;
int const N = 10;

int main()
{
	cublasStatus_t status;

	//Host memory malloc
	float *h_A = (float*)malloc(N*M*sizeof(float));
	float *h_B = (float*)malloc(N*M*sizeof(float));
	float *h_C = (float*)malloc(M*M*sizeof(float));
	float *h_C_cpu = (float*)malloc(M*M*sizeof(float));
	memset(h_C_cpu,0,M*M*sizeof(float));

	//Initialize and print
	for (int i=0; i<M*N; i++)
	{
		h_A[i] = (float)(rand()%10+1);
		h_B[i] = (float)(rand()%10+1);
	}
	cout << "Matrix A is:" << endl;
	for (int i=0; i<M*N; i++)
	{
		cout << h_A[i] << " ";
		if ((i+1)%N == 0)
		{
			cout << endl;
		}
	}
	cout << endl;
	cout << "Matrix B is:" << endl;
	for (int i=0; i<M*N; i++)
	{
		cout << h_B[i] << " ";
		if ((i+1)%M == 0)
		{
			cout << endl;
		}
	}
	cout << endl;

	//CPU caculate
	for(int i = 0; i<M; i++)
	{
		for(int j=0; j<M; j++)
		{
			for(int k=0; k<N; k++)
			{
				h_C_cpu[i*M+j] = h_C_cpu[i*M+j]+h_A[i*N+k]*h_B[k*M+j]*1 + 0;
			}
		}
	}
	cout << "The result from CPU is:" << endl;
	for (int i=0; i<M*M; i++)
	{
		cout << h_C_cpu[i] << " ";
		if ((i+1)%M == 0)
		{
			cout << endl;
		}
	}
	cout << endl;
	//Create handle;
	cublasHandle_t handle;
	status = cublasCreate(&handle);
	if (status != CUBLAS_STATUS_SUCCESS)
	{
		if (status == CUBLAS_STATUS_NOT_INITIALIZED)
		{
			cout << "Fail to get an instance of blas object! Check whether you have free the handle!" << endl;
		}
		getchar();
		return EXIT_FAILURE;
	}

	//Device memory malloc and initialize
	float *d_A, *d_B, *d_C;
	cudaMalloc((void**)&d_A, N*M*sizeof(float));
	cudaMalloc((void**)&d_B, N*M*sizeof(float));
	cudaMalloc((void**)&d_C, M*M*sizeof(float));

	cublasSetVector(N*M, sizeof(float), h_A, 1, d_A, 1);
	cublasSetVector(N*M, sizeof(float), h_B, 1, d_B, 1);

	cudaThreadSynchronize();

	//Call gpu operation
	float a = 1;
	float b = 0;
	const float *ca = &a;
	const float *cb = &b;

	cublasSgemm(
		handle,
		CUBLAS_OP_T,
		CUBLAS_OP_T,
		M,
		M,
		N,
		ca,
		d_A,
		N,
		d_B,
		M,
		cb,
		d_C,
		M
		);
	cudaThreadSynchronize();
	cublasGetVector(M*M, sizeof(float), d_C, 1, h_C, 1);

	cout << "The result from GPU is:" << endl;
	for (int i=0; i<M; i++)
	{
		for(int j=0; j<M; j++)
		{
			cout << h_C[j*M + i] << " ";
		}
		cout << endl;
	}
	
	//Cleaning...
	free(h_A);
	free(h_B);
	free(h_C);
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	cublasDestroy(handle);

	return 0;
}


2.更规范的一个例子,由SDK中的例子改编,与1类似,比较了CPU与GPU运算效率:

/*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.   This source code is a "commercial item" as
 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer  software"  and "commercial computer software
 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

/* This example demonstrates how to use the CUBLAS library
 * by scaling an array of floating-point values on the device
 * and comparing the result to the same operation performed
 * on the host.
 */

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include "time.h"
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
using namespace std;
/* Matrix size */
#define N  (600)

/* Host implementation of a simple version of sgemm */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
                         float beta, float *C)
{
    int i;
    int j;
    int k;

    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            float prod = 0;

            for (k = 0; k < n; ++k)
            {
                prod += A[k * n + i] * B[j * n + k];
            }

            C[j * n + i] = alpha * prod + beta * C[j * n + i];
        }
    }
}

/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;


	clock_t cuStart,cuFinish;
	clock_t start,finish;

	
    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

	

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }
	cuStart = clock();
    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }
	
    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C_ref = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C_ref == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C_ref, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }


	cuFinish = clock();

	start = clock();
	/* Performs operation using plain C code */
	simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
	finish = clock();

	cout<<"GPU TIME:"<<(double)(cuFinish - cuStart)/CLOCKS_PER_SEC<<endl;
	cout<<"CPU TIME:"<<(double)(finish - start)/CLOCKS_PER_SEC<<endl;


    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
	std::cout<<error_norm<<"\t"<<ref_norm<<std::endl;


    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    if (error_norm / ref_norm < 1e-6f)
    {
        printf("simpleCUBLAS test passed.\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("simpleCUBLAS test failed.\n");
        exit(EXIT_FAILURE);
    }
}



阅读更多

没有更多推荐了,返回首页