显卡性能测试工具BenchMarkTool

最新推荐文章于 2025-03-26 20:15:27 发布

乌衣湘

最新推荐文章于 2025-03-26 20:15:27 发布

阅读量7.1k

点赞数

分类专栏：知识分享文章标签：测试工具

本文链接：https://blog.csdn.net/qq_39807056/article/details/118963202

版权

知识分享专栏收录该内容

4 篇文章

订阅专栏

本文档介绍了CUDA-Z工具的使用，它能查询显卡信息并测试其性能。CUDA-Z提供了Timestamp、Compute能力、Host到Device拷贝速度、Device到Host拷贝速度等关键数据。代码示例展示了如何构建基础框架，包括日志记录和内存管理，以及CUDA运算的错误检查。此外，还包含了内存带宽测试类，用于衡量不同类型的设备间拷贝速度。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

设计自己的CUDAZ

CUDA-Z 是一款用于查询显卡信息、测试显卡性能的工具，具体参见：
http://cuda-z.sourceforge.net/#block-windows
本工具获取的信息包括：TimeStamp（时间戳）、Compute 能力（GFLOPS），以及五种拷贝带宽（单位均为 GiB/s）：fHost2Device（主机可分页内存到设备）、fDevice2Host（设备到主机可分页内存）、fHost2DevicePin（主机锁页内存到设备）、fDevice2HostPin（设备到主机锁页内存）、fDevice2Device（设备到设备）。
基本框架搭建：
LabBase.h

#pragma once
#ifndef LAB_BASE
#define LAB_BASE 
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <math.h>
#include <stdio.h> 
#include <time.h>

#include <windows.h>
#include <tchar.h>
#include <psapi.h>
#pragma comment(lib,"psapi.lib")
#include <direct.h>
#include <io.h>

// 64-bit unsigned integer alias; used below as the return type of LabBase::getTime_UTC().
// NOTE(review): `_ULonglong` is not declared anywhere in this file; presumably it is supplied
// by the toolchain's own headers -- confirm, or replace with the <cstdint> definition.
typedef _ULonglong uint64_t;

/// \brief Small utility class: text logging to a file, time stamps and raw binary dumps.
///
/// The static template helpers format any streamable value with operator<< and forward the
/// resulting text to the private, non-template log2File overloads (defined elsewhere).
class LabBase
{
public:
	/// Accessor described by the original author as "Singleton lazy alloc"; defined elsewhere.
	/// NOTE(review): it returns LabBase *by value*, so each caller works on a copy -- confirm
	/// that this is intended before relying on any per-instance state.
    static LabBase getInstance();
	/// Time stamp as a 64-bit integer; defined elsewhere (presumably UTC, going by the name).
	static uint64_t getTime_UTC();
	/// Time stamp as text; defined elsewhere.
	static std::string getTime_Str();
public:

/// \brief Format \p Info with operator<< and append it to \p logfile.
/// \param Info     Any value streamable into std::ostream.
/// \param logfile  Log file name passed through to the private overload.
template<typename T>
static void log2File(const T &Info,const std::string &logfile){
    // A freshly constructed stream is already empty and in a good state; no reset needed.
    std::ostringstream streamInfo;
    streamInfo<<Info;
    const std::string sInfo = streamInfo.str();
    LabBase obj = LabBase::getInstance();
    obj.log2File(sInfo,logfile);
}

/// \brief Format \p Info and log it together with a source location.
/// \param Info     Any value streamable into std::ostream.
/// \param file     Source file name (typically __FILE__); a null pointer is tolerated.
/// \param line     Source line number (typically __LINE__).
/// \param logfile  Log file name passed through to the private overload.
template<typename T>
static void log2File(const T &Info,const char *const file, int const line,const std::string &logfile) {
    // Constructing std::string from a null pointer is undefined behaviour; substitute a marker.
    const std::string sfile(file != NULL ? file : "<unknown>");
    std::ostringstream streamInfo;
    streamInfo<<Info;
    const std::string sInfo = streamInfo.str();
    LabBase obj = LabBase::getInstance();
    obj.log2File(sInfo,sfile,line,logfile);
}

/// \brief Write \p len elements of type T, as raw bytes, to \p file (existing content is replaced).
/// \param data  Pointer to the first element; may be null only when \p len is 0.
/// \param len   Number of elements (not bytes).
/// \param file  Destination path.
/// Failures are reported on std::cout; nothing is thrown.
template<typename T>
static void SaveFile(const T* data, size_t len, const std::string &file) {
    // Refuse to read through a null buffer instead of crashing inside write().
    if(data == NULL && len != 0) {
        std::cout<<"failed to write file: " + file<<std::endl;
        return;
    }
    std::ofstream ofs(file, std::ios::binary|std::ios::ate);
    if(ofs.fail()) {
        std::cout<<"failed to open file: " + file<<std::endl;
        return;
    }
    if(len != 0) {
        ofs.write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(len*sizeof(T)));
    }
    ofs.close();
    // A short write (disk full, device error) would otherwise go unnoticed.
    if(ofs.fail()) {
        std::cout<<"failed to write file: " + file<<std::endl;
    }
}

private:
    // Non-template sinks used by the public helpers above; defined elsewhere.
    void log2File(const std::string &sInfo,const std::string &file="1.log");
    void log2File(const char* charArray,const std::string &file="1.log");
    void log2File(const std::string &sInfo,const std::string &file, int const line,const std::string &logfile="1.log");
};

/// \brief Performance logger: accumulates timing marks and free-form values for a log file.
///
/// Only logInfo() is implemented in this header; every other member is defined elsewhere,
/// so the notes below repeat the original author's comments rather than observed behaviour.
class LogAlgoPerf
{
public:
	explicit LogAlgoPerf();
	virtual ~LogAlgoPerf();

	/// Set the log file name (original note: the file will be saved to "D:\\LogAlgoPerf\\").
	void setLogFile(const std::string &strFileName);
	/// Start the logging timer.
	void setLogStart();
	/// Stop timing / obtain the logging time.
	void getLogEnd();

	/// Append one value, followed by a tab separator, to the pending log text.
	/// \param info  Any value streamable into std::ostringstream (taken by value).
	template<typename T>
	void logInfo(T info){
		m_streamInfo<<info;
		m_streamInfo<<"	";
	}

	/// Log process memory info (original note: WorkSet, Paged, Pinned).
	void logMemInfo(); 

	/// Total measured time; unit not visible in this file.
	double getTotalTime();

private:
	void log2File();
	void mkdir();
	void reset();

	LARGE_INTEGER m_nBegTime;
	LARGE_INTEGER m_nEndTime;
	LARGE_INTEGER m_nFreq;
	std::string		m_strFile;
	std::ostringstream			m_streamInfo;	// text collected by logInfo()
	std::vector<LARGE_INTEGER>	m_vBegTime;
	std::vector<LARGE_INTEGER>	m_vEndTime;
	std::vector<std::string>	m_vInfo;
};

// Performance-logging switch: enable the empty definition below (and disable the other one)
// to compile every LOGPERF call out.
//#define LOGPERF(_ins, _m, ...)
// LOGPERF(obj, method, args...) expands to `{ (obj).method(args...); }`.
// No `##` is used: pasting `.` onto an identifier or `(` onto the arguments does not form a
// valid token and is rejected by conforming preprocessors. The object is parenthesised so
// that expressions such as `*ptr` can be passed.
#define LOGPERF(_ins, _m, ...) {(_ins)._m(__VA_ARGS__);}

#endif LAB_BASE

CudaBase.h

#pragma once
#ifndef __CUDA_BASE__
#define __CUDA_BASE__

#include"cuda_runtime.h"
#include"cublas.h"
#include "device_launch_parameters.h"
#include "LabBase.h"

/// \brief Check the status of a CUDA runtime call; on failure log a diagnostic and throw.
///
/// \param cudaFunc      Status value produced by the call (compared against cudaSuccess).
/// \param cudaFuncName  Text of the call expression, used in the message.
/// \param file          Source file of the call site (typically __FILE__).
/// \param line          Source line of the call site (typically __LINE__).
/// \return true when the status equals cudaSuccess.
/// \throws std::logic_error carrying the diagnostic text for any other status.
///
/// The diagnostic is written to "checkCudaErrors.log" through LabBase::log2File and echoed
/// with printf. When cudaMemGetInfo succeeds, the current device id and its free/total
/// memory are appended.
template< typename T >
inline bool checkOpt(T cudaFunc, char const *const cudaFuncName, const char *const file, int const line){
	const T result = cudaFunc;
	if (result == cudaSuccess){
		return true;
	}

	std::ostringstream oss;
	oss<<"Throw cudaError: "<<result<<", File: "<<file<<", cudaFunction: "<<cudaFuncName<<", Line: "<<line;

	size_t nBytesFree = 0, nBytesTotal = 0;
	if (cudaMemGetInfo(&nBytesFree, &nBytesTotal) == cudaSuccess){
		int DeviceId =-1;
		cudaGetDevice(&DeviceId);
		oss<<" GPU "<<DeviceId<<" FreeMemory: "<<nBytesFree<<" bytes, TotalMemory: "<<nBytesTotal<<" bytes";
	}
	// Reset the runtime's last-error state on every failure path, not only when the memory
	// query above happened to succeed.
	cudaGetLastError();

	const std::string sInfo = oss.str();
	// Record the call site that failed, not the location of this helper.
	LabBase::log2File(sInfo, file, line, "checkCudaErrors.log");
	// "%s" needs a C string; handing a std::string object to printf is undefined behaviour.
	printf("%s", sInfo.c_str());
	throw std::logic_error(sInfo);
}

#define checkCudaErrors(cudaFunc)           checkOpt ( (cudaFunc),  #cudaFunc, __FILE__, __LINE__ )

#endif	__CUDA_BASE__

BenchMarks.h

#include "../../common/CudaBase.h"

// Copy directions exercised by the memory bandwidth test.
// Enumerators are numbered consecutively from 0 in declaration order.
enum GPUCOPYMODEL{
	COPY_MODE_Hpage2D = 0,	/*!< 0: host pageable memory to device. */
	COPY_MODE_D2Hpage,		/*!< 1: device to host pageable memory. */
	COPY_MODE_Hpin2D,		/*!< 2: host pinned memory to device. */
	COPY_MODE_D2Hpin,		/*!< 3: device to host pinned memory. */
	COPY_MODE_D2D			/*!< 4: device to device. */
};

/// Buffer set for the bandwidth test: one size plus four buffer pointers.
/// A default-constructed object has size 0 and all pointers null.
struct memBuff {
	size_t	szMem = 0;				/*!< Size value; used as the byte count of each buffer by BandWidth. */
	void	*memHostPage = NULL;	/*!< Pageable host memory. */
	void	*memHostPin = NULL;		/*!< Pinned host memory. */
	void	*memDevice1 = NULL;		/*!< Device memory buffer 1. */
	void	*memDevice2 = NULL;		/*!< Device memory buffer 2. */

	// User-provided constructor kept so the type stays a non-aggregate; the member
	// initialisers above supply the values.
	memBuff() {}
};

/// Result record of the bandwidth test: one speed per copy direction.
/// BandWidth::getBandWidth() fills each field with a getMemSpeed() result.
/// A default-constructed object has every field set to 0.
struct memInfo{
	float fH2DPage = 0.0f;	/*!< Host pageable memory to device. */
	float fD2HPage = 0.0f;	/*!< Device to host pageable memory. */
	float fH2DPin = 0.0f;	/*!< Host pinned memory to device. */
	float fD2HPin = 0.0f;	/*!< Device to host pinned memory. */
	float fD2D = 0.0f;		/*!< Device to device. */

	// User-provided constructor kept so the type stays a non-aggregate.
	memInfo() {}
};
/// \brief Memory-copy bandwidth benchmark.
///
/// Usage: call memBandInit() to allocate the test buffers, then getBandWidth() to time the
/// five copy directions listed in GPUCOPYMODEL. The destructor releases the buffers.
class BandWidth{
public:
	BandWidth(void);
	~BandWidth(void);

	// The object owns raw host and device buffers that the destructor frees; a copy would
	// free the same pointers a second time, so copying is disabled.
	BandWidth(const BandWidth&) = delete;
	BandWidth& operator=(const BandWidth&) = delete;

	/// Allocate the test buffers; sData.szMem gives the size in bytes (0 selects 1 MiB).
	void memBandInit(memBuff sData);
	/// Time all five copy directions and return the results.
	memInfo getBandWidth();
private:

	/// Time one copy direction, repeated iterNum times.
	float getMemSpeed(GPUCOPYMODEL model,int iterNum=10);
	/// Free whatever buffers are currently held.
	void memAllocFree();
private:
	memBuff m_sData;	// buffers owned by this object
};

BenchMarks.cuh

#include"BenchMarks.h"
// Benchmark entry points implemented in the .cu file, declared with C linkage.
// Returns the mean of the per-iteration event times (milliseconds) for launching an empty kernel.
extern "C" float TestFunc_Launchlatency();
// Returns the single-precision arithmetic throughput estimate in GFLOPs.
extern "C" float TestFunc_CalcPerformance();
// Returns the five copy speeds produced by BandWidth::getBandWidth() for 100 MiB buffers.
// NOTE(review): memInfo has a user-provided constructor; some compilers warn when such a
// type is returned from an extern "C" function -- confirm this is acceptable for the callers.
extern "C" memInfo TestFunc_BandWidth();

BenchMarks.cu

#include"BenchMarks.cuh"
// Kernel with an empty body. TestFunc_Launchlatency() launches it for warm-up and then
// repeatedly inside its timed loop, so only the launch itself is measured.
__global__ void empty() {}

// Arithmetic building blocks for the compute benchmark.
// Each macro expands to a flat sequence of `x = x * x + x;` statements, alternating between
// the two operands so that two independent dependency chains are interleaved.

// One step on each operand (2 statements).
#define CALC_FMAD_2(a, b) \
	a = a * a + a; b = b * b + b;

// 16 statements: 8 steps on each operand.
#define CALC_FMAD_16(a, b) \
	CALC_FMAD_2(a, b) CALC_FMAD_2(a, b) CALC_FMAD_2(a, b) CALC_FMAD_2(a, b) \
	CALC_FMAD_2(a, b) CALC_FMAD_2(a, b) CALC_FMAD_2(a, b) CALC_FMAD_2(a, b)

// 256 statements: 16 repetitions of CALC_FMAD_16.
#define CALC_FMAD_256(a, b) \
	CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
	CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
	CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
	CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b)

// Single-precision arithmetic kernel for the compute benchmark.
// Each thread runs two chains of `x = x * x + x`: one seeded with its global thread index,
// the other with its element of `buf`. Every loop pass issues 8 x CALC_FMAD_256. The sum of
// both chains is written back to the thread's element, so the results are actually used.
// There is no bounds check: `buf` must hold at least gridDim.x * blockDim.x floats.
__global__ void CalcKernelFloat(
	void *buf,					/*!<[in,out] Data buffer, read and written as float. */
	int iters					/*!<[in] Number of loop passes. */
	) {
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	float *const data = (float*)buf;
	float chainA = tid;
	float chainB = data[tid];

	for(int pass = 0; pass < iters; ++pass) {
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
		CALC_FMAD_256(chainA, chainB);
	}

	data[tid] = chainA + chainB;
}



float TestFunc_Launchlatency(){
	const int iters = 100;
	cudaFree(0);
	// Warmup phase
	empty<<<1,1>>>();
	float timeMs	= 0.0;
	
	cudaEvent_t start;
	cudaEvent_t stop;
	checkCudaErrors(cudaEventCreate(&start));
	checkCudaErrors(cudaEventCreate(&stop));
	checkCudaErrors(cudaEventRecord(start, 0));

	checkCudaErrors(cudaDeviceSynchronize());
	float totalTime = 0;
	
	// Benchmark phase
	for (int i = 0; i < iters; ++i) {
		float loopMs = 0;
		empty<<<1,1024>>>();
		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&loopMs, start, stop);

		//std::cout<<i<<" "<<loopMs<<std::endl;
		timeMs += loopMs;
	}
	
	float averTime = timeMs/iters;
	return averTime;
	//printf(" Average Launch Time %f ms \r\n",averTime);
}

/// \brief Run the copy-bandwidth test with 100 MiB buffers.
/// \return The memInfo produced by BandWidth::getBandWidth().
/// The BandWidth object is local, so its destructor runs when this function returns.
memInfo TestFunc_BandWidth(){
	memBuff request;
	request.szMem = 1024*1024*100;	// buffer size in bytes

	BandWidth tester;
	tester.memBandInit(request);
	return tester.getBandWidth();
}

/// \brief Estimate single-precision arithmetic throughput of device 0.
/// \return Estimated GFLOPs; 0 when no time could be measured.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA runtime call fails.
///
/// CalcKernelFloat is launched nRepeatNum times with one block of threadsNum threads, and
/// each launch is timed with events. Operations per thread and launch:
/// iterNumInKernel * 8 (macro blocks per pass) * 256 (statements per block) * 2 (multiply + add).
///
/// NOTE(review): a single block is launched, yet the result is multiplied by
/// prop.multiProcessorCount. This looks like an extrapolation from one block to the whole
/// device -- confirm that this is the intended meaning of the number.
float TestFunc_CalcPerformance(){
	cudaDeviceProp prop;
	checkCudaErrors(cudaGetDeviceProperties(&prop, 0));

	const int nRepeatNum		= 10;
	const int iterNumInKernel	= 32;
	const int blocksNum			= 1;
	int threadsNum				= prop.maxThreadsPerBlock;
	if(threadsNum == 0) {
		// Fallback when the property is not reported: two warps, capped at 512 threads.
		int warpSize = prop.warpSize;
		if(warpSize == 0)
			warpSize = 32;
		threadsNum = warpSize * 2;
		if(threadsNum > 512)
			threadsNum = 512;
	}

	// Host-side seed values. A std::vector releases its array correctly; the previous
	// std::shared_ptr<float>(new float[n]) released it with scalar delete.
	std::vector<float> hostInit(threadsNum);
	for(int i = 0;i<threadsNum;++i){
		hostInit[i] = i*0.03141592653f;
	}
	const size_t nBytes = threadsNum*sizeof(float);

	float* pfDiviceBuff	= NULL;
	cudaEvent_t start	= NULL;
	cudaEvent_t stop	= NULL;
	float timeMs		= 0.0f;
	try {
		checkCudaErrors(cudaMalloc((void**)&pfDiviceBuff,nBytes));
		// The copy overwrites the whole buffer, so no separate memset is needed.
		checkCudaErrors(cudaMemcpy(pfDiviceBuff,&hostInit[0],nBytes,cudaMemcpyHostToDevice));
		checkCudaErrors(cudaEventCreate(&start));
		checkCudaErrors(cudaEventCreate(&stop));

		for(int i = 0;i<nRepeatNum;++i){
			float loopMs = 0.0f;
			checkCudaErrors(cudaEventRecord(start, 0));
			CalcKernelFloat<<<blocksNum, threadsNum>>>(pfDiviceBuff,iterNumInKernel);
			checkCudaErrors(cudaGetLastError());
			checkCudaErrors(cudaEventRecord(stop, 0));
			checkCudaErrors(cudaEventSynchronize(stop));
			checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
			timeMs += loopMs;
		}
	} catch (...) {
		// Release everything acquired so far before passing the error on.
		if (start != NULL) cudaEventDestroy(start);
		if (stop != NULL) cudaEventDestroy(stop);
		cudaFree(pfDiviceBuff);
		throw;
	}
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(pfDiviceBuff);

	// Without a measurable duration the division below would yield inf/NaN.
	if (timeMs <= 0.0f){
		return 0.0f;
	}

	const int nOpsNumofFMA	= 2;		// Number of operations per one statement (multiply + add)
	const int nFMAInstNum	= 256;		// Size of instruction block
	const int nIterofFMAOps	= 8;		// Number of instruction blocks in loop

	const double opsPerThreadPerLaunch =
		(double)iterNumInKernel * (double)nIterofFMAOps * (double)nFMAInstNum * (double)nOpsNumofFMA;
	const double totalOps =
		(double)prop.multiProcessorCount * (double)threadsNum * opsPerThreadPerLaunch * (double)nRepeatNum;
	const double opsPerSecond = totalOps * 1000.0 / (double)timeMs;	// timeMs is in milliseconds
	const float GFLOPs = (float)(opsPerSecond / 1000 / 1000 / 1000);
	//printf("GFLOPs %10f \r\n",GFLOPs);
	return GFLOPs;
}

BenchMarks.cpp

#include "BenchMarks.h"

// Nothing is allocated at construction; m_sData is initialised by memBuff's own
// constructor (size 0, all pointers null).
BandWidth::BandWidth()
	: m_sData()
{
}

// Release whatever buffers are still held.
BandWidth::~BandWidth()
{
	memAllocFree();
}

/// \brief Allocate the four test buffers (pageable host, pinned host, two device buffers).
/// \param sData  Only szMem is used: the size of every buffer in bytes. 0 selects 1 MiB.
///               Any pointers carried in sData are ignored; this object allocates its own.
/// \throws std::runtime_error when the pageable host allocation fails.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA allocation fails.
///
/// Buffers left from an earlier call are released first, so the function may be called
/// repeatedly without leaking.
void BandWidth::memBandInit(memBuff sData){
	memAllocFree();

	m_sData = sData;
	// Start from a clean state: after a failure below, only pointers that were really
	// allocated are non-null, so the destructor frees exactly those.
	m_sData.memHostPage	= NULL;
	m_sData.memHostPin	= NULL;
	m_sData.memDevice1	= NULL;
	m_sData.memDevice2	= NULL;
	if (m_sData.szMem==0){
		m_sData.szMem = 1024*1024*1;
	}

	m_sData.memHostPage = malloc(m_sData.szMem);
	if (m_sData.memHostPage == NULL){
		throw std::runtime_error("BandWidth::memBandInit: host malloc failed");
	}

	// Allocate into a temporary and publish only on success.
	void *pBuf = NULL;
	checkCudaErrors(cudaMallocHost((void**)&pBuf,m_sData.szMem));
	m_sData.memHostPin = pBuf;

	pBuf = NULL;
	checkCudaErrors(cudaMalloc((void**)&pBuf,m_sData.szMem));
	m_sData.memDevice1 = pBuf;

	pBuf = NULL;
	checkCudaErrors(cudaMalloc((void**)&pBuf,m_sData.szMem));
	m_sData.memDevice2 = pBuf;
}

/// \brief Time one copy direction and convert the result to GiB/s.
/// \param model    Which copy to perform (see GPUCOPYMODEL).
/// \param iterNum  Number of timed repetitions; values <= 0 yield 0.
/// \return szMem * iterNum bytes divided by the summed event time, in GiB/s;
///         0 when nothing could be measured.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA runtime call fails.
float BandWidth::getMemSpeed(GPUCOPYMODEL model,int iterNum){
	if (iterNum <= 0){
		return 0.0f;
	}

	cudaEvent_t start	= NULL;
	cudaEvent_t stop	= NULL;
	float timeMs		= 0.0f;
	try {
		checkCudaErrors(cudaEventCreate(&start));
		checkCudaErrors(cudaEventCreate(&stop));
		for(int i = 0;i<iterNum;++i){
			float loopMs = 0.0f;
			checkCudaErrors(cudaEventRecord(start, 0));
			switch(model){
			case COPY_MODE_Hpage2D:
				checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memHostPage, m_sData.szMem, cudaMemcpyHostToDevice));
				break;
			case COPY_MODE_D2Hpage:
				checkCudaErrors(cudaMemcpy(m_sData.memHostPage, m_sData.memDevice1, m_sData.szMem, cudaMemcpyDeviceToHost));
				break;
			case COPY_MODE_Hpin2D:
				checkCudaErrors(cudaMemcpy(m_sData.memDevice2, m_sData.memHostPin, m_sData.szMem, cudaMemcpyHostToDevice));
				break;
			case COPY_MODE_D2Hpin:
				checkCudaErrors(cudaMemcpy(m_sData.memHostPin, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToHost));
				break;
			case COPY_MODE_D2D:
				// Both pointers are device buffers, so the kind must be device-to-device.
				checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToDevice));
				break;
			default:
				break;
			}
			checkCudaErrors(cudaEventRecord(stop, 0));
			checkCudaErrors(cudaEventSynchronize(stop));
			checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
			timeMs += loopMs;
		}
	} catch (...) {
		// Do not leak the events when a check throws.
		if (start != NULL) cudaEventDestroy(start);
		if (stop != NULL) cudaEventDestroy(stop);
		throw;
	}
	checkCudaErrors(cudaEventDestroy(start));
	checkCudaErrors(cudaEventDestroy(stop));

	// Without a measurable duration the division below would yield inf/NaN.
	if (timeMs <= 0.0f){
		return 0.0f;
	}

	// Work in double: 1000 * szMem * iterNum computed in size_t overflows where size_t is
	// 32 bits wide (e.g. 100 MiB * 10 * 1000).
	const double totalBytes	= (double)m_sData.szMem * (double)iterNum;
	const double seconds	= (double)timeMs / 1000.0;
	const double bytesPerGiB = 1024.0 * 1024.0 * 1024.0;
	return (float)(totalBytes / seconds / bytesPerGiB);
}

/// \brief Release every buffer currently held in m_sData.
///
/// Each pointer is reset to NULL after it is freed, so calling this function again (for
/// example from the destructor after an earlier explicit call) cannot free a buffer twice.
/// Errors returned by the CUDA free calls are ignored because this runs from the destructor.
void BandWidth::memAllocFree(){

	if (m_sData.memDevice1!=NULL){
		cudaFree(m_sData.memDevice1);
		m_sData.memDevice1 = NULL;
	}
	if (m_sData.memDevice2!=NULL){
		cudaFree(m_sData.memDevice2);
		m_sData.memDevice2 = NULL;
	}
	if (m_sData.memHostPage!=NULL){
		free(m_sData.memHostPage);
		m_sData.memHostPage = NULL;
	}
	if (m_sData.memHostPin!=NULL){
		cudaFreeHost(m_sData.memHostPin);
		m_sData.memHostPin = NULL;
	}
}

/// \brief Time all five copy directions with the default repetition count.
/// \return One getMemSpeed() result per direction.
/// The directions run in enumeration order: pageable H2D, pageable D2H, pinned H2D,
/// pinned D2H, then D2D.
memInfo BandWidth::getBandWidth(){
	// Each entry pairs a copy mode with the memInfo field that receives its result.
	const GPUCOPYMODEL modes[5] = {
		COPY_MODE_Hpage2D, COPY_MODE_D2Hpage, COPY_MODE_Hpin2D, COPY_MODE_D2Hpin, COPY_MODE_D2D
	};
	float memInfo::* const fields[5] = {
		&memInfo::fH2DPage, &memInfo::fD2HPage, &memInfo::fH2DPin, &memInfo::fD2HPin, &memInfo::fD2D
	};

	memInfo result;
	for (int k = 0; k < 5; ++k){
		result.*fields[k] = getMemSpeed(modes[k]);
	}
	return result;
}

Main.cpp

void TestFunc_PerfMon(){
	TestFunc_Launchlatency();
	printf("	TimeStamp	Compute(Gflops) fH2D	 fD2H	    fH2DPin    fD2HPin	 fD2D(GiB/s) \r\n");
	for (int i=0;i<10000;++i){
		std::string strtime = LabBase::getTime_Str();
		memInfo info = TestFunc_BandWidth();
		float Gflops = TestFunc_CalcPerformance();
		printf("%s %10f %10f %10f %10f %10f %10f \r\n",strtime.c_str(),Gflops,info.fH2DPage,info.fD2HPage,info.fH2DPin,info.fD2HPin,info.fD2D);
		Sleep(1000);
	}
}