设计自己的CUDAZ
CUDAZ是一款用于查询显卡信息,测试显卡性能的工具,具体参见:
http://cuda-z.sourceforge.net/#block-windows
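获取的信息包括:时间戳(TimeStamp)、计算能力(Compute,单位 GFLOPS),以及五种拷贝带宽(单位 GiB/s):Host→Device(可分页内存,fHost2Device)、Device→Host(可分页内存,fDevice2Host)、Host→Device(锁页内存,fHost2DevicePin)、Device→Host(锁页内存,fDevice2HostPin)和 Device→Device(fDevice2Device)。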
基本框架搭建:
LabBase.h
#pragma once
#ifndef LAB_BASE
#define LAB_BASE
#include <stdio.h>
#include <fstream>
#include <windows.h>
#include <vector>
#include <list>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <math.h>
#include <string>
#include <stdexcept>
#include <time.h>
#include <tchar.h>
#include <cstring>
#include <psapi.h>
#pragma comment(lib,"psapi.lib")
#include <direct.h>
#include <io.h>
// 64-bit unsigned alias used by LabBase::getTime_UTC() below.
// NOTE(review): _ULonglong is not declared in this file; it appears to be a
// toolchain-provided typedef - confirm it exists on every compiler this header targets.
typedef _ULonglong uint64_t;
/// \brief Logging and file-dump helper exposed through static entry points.
///
/// The public templates turn any streamable value into text and hand it to the
/// private, non-template overloads, which are implemented outside this header.
class LabBase
{
public:
    /// \brief Obtain the helper object.
    /// The original note calls this a lazily allocated singleton; as declared
    /// here the object is returned by value.
    static LabBase getInstance();
    /// \brief Current time as a 64-bit integer (named as UTC; implementation not shown here).
    static uint64_t getTime_UTC();
    /// \brief Current time rendered as text (implementation not shown here).
    static std::string getTime_Str();

    /// \brief Format \p Info with operator<< and write it to \p logfile.
    /// \param Info    Any value that can be streamed into a std::ostream.
    /// \param logfile Name of the log file handed to the private writer.
    template<typename T>
    static void log2File(const T &Info, const std::string &logfile) {
        std::ostringstream formatter;
        formatter << Info;
        const std::string text = formatter.str();
        LabBase writer = LabBase::getInstance();
        writer.log2File(text, logfile);
    }

    /// \brief Format \p Info and write it together with a source location.
    /// \param Info    Any value that can be streamed into a std::ostream.
    /// \param file    Source file name to record with the message.
    /// \param line    Source line number to record with the message.
    /// \param logfile Name of the log file handed to the private writer.
    template<typename T>
    static void log2File(const T &Info, const char *const file, int const line, const std::string &logfile) {
        std::ostringstream formatter;
        formatter << Info;
        const std::string text = formatter.str();
        // Converting to std::string selects the private overload below instead
        // of recursing into this template.
        const std::string sourceFile(file);
        LabBase writer = LabBase::getInstance();
        writer.log2File(text, sourceFile, line, logfile);
    }

    /// \brief Write \p len elements of \p data to \p file as raw bytes.
    /// \param data Pointer to the first element to store.
    /// \param len  Number of elements (not bytes) to store.
    /// \param file Destination path; a message is printed to std::cout if it cannot be opened.
    template<typename T>
    static void SaveFile(const T* data, size_t len, const std::string &file) {
        std::ofstream out(file, std::ios::binary|std::ios::ate);
        if (!out.fail()) {
            out.write(reinterpret_cast<const char*>(data), len*sizeof(T));
            out.close();
            return;
        }
        std::cout<<"failed to open file: " + file<<std::endl;
    }

private:
    // Non-template writers that the public templates forward to.
    void log2File(const std::string &sInfo,const std::string &file="1.log");
    void log2File(const char* charArray,const std::string &file="1.log");
    void log2File(const std::string &sInfo,const std::string &file, int const line,const std::string &logfile="1.log");
};
/// \brief Collects timing data and free-form text for performance logging.
///
/// Only logInfo() is implemented in this header; every other member is
/// defined elsewhere.
class LogAlgoPerf
{
public:
    explicit LogAlgoPerf();
    virtual ~LogAlgoPerf();

    /// Set the log file name. Per the original note the log is saved under
    /// "D:\\LogAlgoPerf\\" - verify against the implementation.
    void setLogFile(const std::string &strFileName);
    /// Start the logging timer.
    void setLogStart();
    /// Stop the logging timer (declared as returning nothing).
    void getLogEnd();
    /// Append one streamable value, followed by a single space, to the
    /// pending text buffer.
    template<typename T>
    void logInfo(T info){
        m_streamInfo << info;
        m_streamInfo << " ";
    }
    /// Log process memory information (the original note lists WorkSet, Paged, Pinned).
    void logMemInfo();
    /// Total measured time; units are not visible from this declaration.
    double getTotalTime();

private:
    void log2File();
    void mkdir();
    void reset();

    LARGE_INTEGER m_nBegTime;
    LARGE_INTEGER m_nEndTime;
    LARGE_INTEGER m_nFreq;
    std::string m_strFile;
    std::ostringstream m_streamInfo;
    std::vector<LARGE_INTEGER> m_vBegTime;
    std::vector<LARGE_INTEGER> m_vEndTime;
    std::vector<std::string> m_vInfo;
};
// Swap in the empty definition below to compile every LOGPERF call out.
//#define LOGPERF(_ins, _m, ...)
// Call member function _m on object _ins, forwarding any extra arguments:
// LOGPERF(obj, method, a, b) expands to { (obj).method(a, b); }.
// Plain substitution is used on purpose: pasting '.' or '(' onto an identifier
// with ## does not produce a valid preprocessing token and is rejected by
// conforming preprocessors.
#define LOGPERF(_ins, _m, ...) { (_ins)._m(__VA_ARGS__); }
#endif LAB_BASE
CudaBase.h
#pragma once
#ifndef __CUDA_BASE__
#define __CUDA_BASE__
#include"cuda_runtime.h"
#include"cublas.h"
#include "device_launch_parameters.h"
#include "LabBase.h"
template< typename T >
inline bool checkOpt(T cudaFunc, char const *const cudaFuncName, const char *const file, int const line){
bool bSuc = true;
auto result = cudaFunc;
if (result != cudaSuccess){
bSuc = false;
std::ostringstream oss;
oss<<"Throw cudaError: "<<result<<"File: "<<file<<", cudaFunction: "<<cudaFuncName<<", Line: "<<line;
size_t nBytesFree = 0, nBytesTotal = 0;
if (cudaMemGetInfo(&nBytesFree, &nBytesTotal) == cudaSuccess){
int DeviceId =-1;
cudaGetDevice(&DeviceId);
cudaGetLastError();// reset cuda error
oss<<" GPU "<<DeviceId<<" FreeMemory: "<<nBytesFree<<" bytes, TotalMemory: "<<nBytesTotal<<" bytes";
}
std::string sInfo = oss.str();
LabBase::log2File(sInfo,__FILE__, __LINE__,"checkCudaErrors.log");
printf("%s",oss.str());
throw std::logic_error(oss.str());
}
return bSuc;
}
#define checkCudaErrors(cudaFunc) checkOpt ( (cudaFunc), #cudaFunc, __FILE__, __LINE__ )
#endif __CUDA_BASE__
BenchMarks.h
#include "../../common/CudaBase.h"
/// \brief Copy directions exercised by the memory bandwidth test.
/// Enumerators take the implicit values 0 through 4 in declaration order.
enum GPUCOPYMODEL{
    COPY_MODE_Hpage2D, /*!< 0: Host pageable memory to device data copy mode. */
    COPY_MODE_D2Hpage, /*!< 1: Device to host pageable memory data copy mode. */
    COPY_MODE_Hpin2D,  /*!< 2: Host pinned memory to device data copy mode. */
    COPY_MODE_D2Hpin,  /*!< 3: Device to host pinned memory data copy mode. */
    COPY_MODE_D2D      /*!< 4: Device to device data copy mode. */
};
/// \brief Buffer set used by the bandwidth test: one size and four raw pointers.
struct memBuff {
    size_t szMem;       /*!< Size in bytes of every buffer below. */
    void *memHostPage;  /*!< Pageable host memory. */
    void *memHostPin;   /*!< Pinned host memory. */
    void *memDevice1;   /*!< Device memory buffer 1. */
    void *memDevice2;   /*!< Device memory buffer 2. */

    /// Start with a zero size and every pointer null.
    memBuff()
        : szMem(0), memHostPage(nullptr), memHostPin(nullptr),
          memDevice1(nullptr), memDevice2(nullptr)
    {}
};
/// \brief One set of bandwidth results, one field per copy direction.
struct memInfo{
    float fH2DPage; /*!< Host pageable memory to device. */
    float fD2HPage; /*!< Device to host pageable memory. */
    float fH2DPin;  /*!< Host pinned memory to device. */
    float fD2HPin;  /*!< Device to host pinned memory. */
    float fD2D;     /*!< Device to device. */

    /// Every result starts at zero.
    memInfo() {
        fH2DPage = 0.0f;
        fD2HPage = 0.0f;
        fH2DPin = 0.0f;
        fD2HPin = 0.0f;
        fD2D = 0.0f;
    }
};
/// \brief Allocates host/device buffers and measures copy bandwidth between them.
///
/// The object owns the raw buffers stored in m_sData and releases them in its
/// destructor, so it is made non-copyable: a copy would free the same
/// pointers twice.
class BandWidth{
public:
    BandWidth(void);
    ~BandWidth(void);
    /// Allocate the test buffers; sData.szMem gives the size of each buffer.
    void memBandInit(memBuff sData);
    /// Run every copy mode once and return the measured bandwidths.
    memInfo getBandWidth();
private:
    // Declared but intentionally not defined: copying is not supported.
    BandWidth(const BandWidth&);
    BandWidth& operator=(const BandWidth&);
    /// Time iterNum copies in the given mode and return the bandwidth.
    float getMemSpeed(GPUCOPYMODEL model,int iterNum=10);
    /// Release every buffer held in m_sData.
    void memAllocFree();
private:
    memBuff m_sData;
};
BenchMarks.cuh
#include"BenchMarks.h"
// Benchmark entry points declared with C linkage for callers outside the
// CUDA translation unit.
extern "C" {
    /// Average time of an empty kernel launch.
    float TestFunc_Launchlatency();
    /// Arithmetic throughput estimate.
    float TestFunc_CalcPerformance();
    /// Copy bandwidth for every mode in GPUCOPYMODEL.
    memInfo TestFunc_BandWidth();
}
BenchMark.cu
#include"BenchMarks.cuh"
// Kernel with an empty body; TestFunc_Launchlatency launches it for warm-up
// and for timing.
__global__ void empty()
{
}
// Sixteen multiply-add statements, alternating between the two accumulators
// (eight updates of `a`, eight of `b`). Both arguments must be assignable
// variables. The final line deliberately has no trailing backslash so the
// definition ends here.
#define CALC_FMAD_16(a, b) \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b;

// Sixteen CALC_FMAD_16 groups: 256 multiply-add statements in total.
#define CALC_FMAD_256(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b)
// Arithmetic-throughput kernel: each thread runs `iters` rounds of eight
// CALC_FMAD_256 blocks on two float accumulators and stores their sum.
// Every thread reads and writes buf[blockIdx.x * blockDim.x + threadIdx.x]
// without a bounds check, so the buffer must hold at least
// gridDim.x * blockDim.x floats.
__global__ void CalcKernelFloat(
void *buf, /*!<[in,out] Data buffer, accessed as an array of float. */
int iters /*!<[in] Number of loop rounds executed by each thread. */
) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
float *arr = (float*)buf;
// One accumulator is seeded from the thread index, the other from the buffer.
float val1 = index;
float val2 = arr[index];
int i;
for(i = 0; i < iters; i++) {
// 8 blocks of 256 statements per round; the host-side GFLOPs formula in
// TestFunc_CalcPerformance uses the same 8 and 256.
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
CALC_FMAD_256(val1, val2);
}
// Write the combined result back; presumably this also keeps the compiler
// from discarding the arithmetic above.
arr[index] = val1 + val2;
}
/// \brief Measure the average time to launch and complete an empty kernel.
///
/// One untimed warm-up launch is followed by 100 timed launches of
/// empty<<<1,1024>>> on stream 0. Each launch is bracketed by its own
/// start/stop event pair, so the per-launch times are independent.
///
/// \return Mean elapsed time per launch in milliseconds.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA call fails.
float TestFunc_Launchlatency(){
    const int iters = 100;
    // Commonly used to force runtime initialisation before anything is timed.
    cudaFree(0);
    // Warmup phase
    empty<<<1,1>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    cudaEvent_t start;
    cudaEvent_t stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Benchmark phase
    float timeMs = 0.0f;
    for (int i = 0; i < iters; ++i) {
        float loopMs = 0.0f;
        // Re-record start every iteration; otherwise each sample would also
        // include all previous iterations.
        checkCudaErrors(cudaEventRecord(start, 0));
        empty<<<1,1024>>>();
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
        timeMs += loopMs;
    }
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    // Checked once, outside the timed region: reports a rejected launch
    // configuration (for example a device that cannot run 1024 threads per block).
    checkCudaErrors(cudaGetLastError());

    return timeMs/iters;
}
/// \brief Run the copy-bandwidth test with 100 MiB buffers.
/// \return Bandwidth of each copy mode as reported by BandWidth::getBandWidth().
memInfo TestFunc_BandWidth(){
    memBuff request;
    request.szMem = 1024*1024*100;

    BandWidth tester;
    tester.memBandInit(request);
    // The buffers are released when `tester` goes out of scope.
    return tester.getBandWidth();
}
/// \brief Estimate single-precision arithmetic throughput of the current device.
///
/// CalcKernelFloat is launched 10 times as ONE block with the maximum number
/// of threads per block. The measured rate of that single block is then
/// multiplied by the multiprocessor count, so the returned figure is an
/// extrapolation to the whole device, not a measurement of all multiprocessors
/// running at once.
///
/// \return Estimated GFLOPS, or 0 when no elapsed time could be measured.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA call fails.
float TestFunc_CalcPerformance(){
    const int nRepeatNum = 10;        // Timed kernel launches
    const int iterNumInKernel = 32;   // Loop rounds inside the kernel
    const int nOpsNumofFMA = 2;       // Number of operations per one multiply-add
    const int nFMAInstNum = 256;      // Size of instruction block
    const int nIterofFMAOps = 8;      // Number of instruction blocks in loop

    // Query the device the kernels will actually run on.
    int deviceId = 0;
    checkCudaErrors(cudaGetDevice(&deviceId));
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, deviceId));

    int threadsNum = prop.maxThreadsPerBlock;
    if(threadsNum <= 0) {
        // Fall back to two warps, capped at 512 threads.
        int warpSize = prop.warpSize;
        if(warpSize <= 0)
            warpSize = 32;
        threadsNum = warpSize * 2;
        if(threadsNum > 512)
            threadsNum = 512;
    }
    const int blocksNum = 1;
    const size_t nElems = (size_t)blocksNum * (size_t)threadsNum;
    const size_t nBytes = nElems * sizeof(float);

    // std::vector releases its array correctly; shared_ptr<float> built from
    // new[] would release it with the wrong form of delete.
    std::vector<float> hostInit(nElems);
    for(size_t i = 0; i < nElems; ++i){
        hostInit[i] = (float)i*0.03141592653f;
    }

    float* pfDiviceBuff = NULL;
    checkCudaErrors(cudaMalloc((void**)&pfDiviceBuff, nBytes));
    checkCudaErrors(cudaMemcpy(pfDiviceBuff, &hostInit[0], nBytes, cudaMemcpyHostToDevice));

    cudaEvent_t start;
    cudaEvent_t stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    float timeMs = 0.0f;
    for(int i = 0; i < nRepeatNum; ++i){
        float loopMs = 0.0f;
        checkCudaErrors(cudaEventRecord(start, 0));
        CalcKernelFloat<<<blocksNum, threadsNum>>>(pfDiviceBuff, iterNumInKernel);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
        timeMs += loopMs;
    }

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(pfDiviceBuff));

    if(timeMs <= 0.0f){
        // No measurable time: avoid dividing by zero.
        return 0.0f;
    }

    // Operations of one block, extrapolated to every multiprocessor.
    const double totalOps =
        (double)prop.multiProcessorCount * (double)threadsNum
        * (double)iterNumInKernel * (double)nIterofFMAOps
        * (double)nFMAInstNum * (double)nOpsNumofFMA
        * (double)nRepeatNum;
    const double opsPerSecond = totalOps * 1000.0 / (double)timeMs; // timeMs is in milliseconds
    const float GFLOPs = (float)(opsPerSecond / 1.0e9);
    return GFLOPs;
}
BenchMark.cpp
#include "BenchMarks.h"
// Nothing to set up here; m_sData is initialised by memBuff's own constructor.
BandWidth::BandWidth()
{
}
// Release whatever buffers are still held in m_sData.
BandWidth::~BandWidth()
{
    this->memAllocFree();
}
/// \brief Allocate the four test buffers.
///
/// Only sData.szMem is used (1 MiB when it is zero). Pointers carried in
/// sData are ignored, so memory owned by the caller is never adopted or freed
/// by this object. Buffers from an earlier call are released first.
///
/// \param sData Requested configuration; szMem is the size in bytes of each buffer.
/// \throws std::runtime_error when the pageable host allocation fails.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA allocation fails.
void BandWidth::memBandInit(memBuff sData){
    // Release anything left from a previous initialisation, then start from a
    // clean state so no stale pointer survives.
    memAllocFree();
    m_sData = memBuff();
    m_sData.szMem = (sData.szMem != 0) ? sData.szMem : (size_t)(1024*1024*1);

    // Each pointer is stored in m_sData as soon as it exists, so the
    // destructor can release it even if a later allocation throws.
    m_sData.memHostPage = malloc(m_sData.szMem);
    if (m_sData.memHostPage == NULL){
        throw std::runtime_error("BandWidth::memBandInit: pageable host allocation failed");
    }
    checkCudaErrors(cudaMallocHost((void**)&m_sData.memHostPin,m_sData.szMem));
    checkCudaErrors(cudaMalloc((void**)&m_sData.memDevice1,m_sData.szMem));
    checkCudaErrors(cudaMalloc((void**)&m_sData.memDevice2,m_sData.szMem));
}
/// \brief Time repeated copies of the whole buffer in one copy mode.
///
/// \param model   Copy direction to measure.
/// \param iterNum Number of timed copies; values <= 0 yield 0.
/// \return Bandwidth in GiB/s; 0 for an unknown mode, an empty buffer or an
///         unmeasurable elapsed time.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA call fails.
float BandWidth::getMemSpeed(GPUCOPYMODEL model,int iterNum){
    if (iterNum <= 0 || m_sData.szMem == 0){
        return 0.0f;
    }

    // Resolve destination, source and copy kind once, outside the timed loop.
    void *dst = NULL;
    const void *src = NULL;
    cudaMemcpyKind kind = cudaMemcpyHostToDevice;
    switch(model){
    case COPY_MODE_Hpage2D:
        dst = m_sData.memDevice1; src = m_sData.memHostPage; kind = cudaMemcpyHostToDevice;
        break;
    case COPY_MODE_D2Hpage:
        dst = m_sData.memHostPage; src = m_sData.memDevice1; kind = cudaMemcpyDeviceToHost;
        break;
    case COPY_MODE_Hpin2D:
        dst = m_sData.memDevice2; src = m_sData.memHostPin; kind = cudaMemcpyHostToDevice;
        break;
    case COPY_MODE_D2Hpin:
        dst = m_sData.memHostPin; src = m_sData.memDevice2; kind = cudaMemcpyDeviceToHost;
        break;
    case COPY_MODE_D2D:
        // Both pointers are device buffers, so the kind must be device-to-device.
        dst = m_sData.memDevice1; src = m_sData.memDevice2; kind = cudaMemcpyDeviceToDevice;
        break;
    default:
        return 0.0f;
    }

    cudaEvent_t start;
    cudaEvent_t stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    float timeMs = 0.0f;
    for(int i = 0; i < iterNum; ++i){
        float loopMs = 0.0f;
        checkCudaErrors(cudaEventRecord(start, 0));
        checkCudaErrors(cudaMemcpy(dst, src, m_sData.szMem, kind));
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
        timeMs += loopMs;
    }
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    if (timeMs <= 0.0f){
        return 0.0f;
    }
    // Floating-point arithmetic: the product 1000 * bytes * iterations does
    // not fit in a 32-bit size_t for 100 MiB buffers.
    const double totalBytes = (double)m_sData.szMem * (double)iterNum;
    const double seconds = (double)timeMs / 1000.0;
    const double bandwidthGiBs = totalBytes / seconds / (double)(1 << 30);
    return (float)bandwidthGiBs;
}
/// \brief Release every buffer held in m_sData.
///
/// Each pointer is reset to NULL after it is released, so calling this more
/// than once (for example explicitly and again from the destructor) is safe.
/// Return codes of the CUDA free calls are deliberately not checked because
/// this runs from the destructor, which must not throw.
void BandWidth::memAllocFree(){
    if (m_sData.memDevice1!=NULL){
        cudaFree(m_sData.memDevice1);
        m_sData.memDevice1 = NULL;
    }
    if (m_sData.memDevice2!=NULL){
        cudaFree(m_sData.memDevice2);
        m_sData.memDevice2 = NULL;
    }
    if (m_sData.memHostPage!=NULL){
        free(m_sData.memHostPage);
        m_sData.memHostPage = NULL;
    }
    if (m_sData.memHostPin!=NULL){
        cudaFreeHost(m_sData.memHostPin);
        m_sData.memHostPin = NULL;
    }
}
/// \brief Measure every copy mode with the default iteration count.
/// \return One bandwidth value per copy mode, measured in the order
///         pageable H2D, pageable D2H, pinned H2D, pinned D2H, D2D.
memInfo BandWidth::getBandWidth(){
    memInfo measured;
    // The call order is kept as-is: each measurement runs after the previous
    // one has finished.
    measured.fH2DPage = this->getMemSpeed(COPY_MODE_Hpage2D);
    measured.fD2HPage = this->getMemSpeed(COPY_MODE_D2Hpage);
    measured.fH2DPin  = this->getMemSpeed(COPY_MODE_Hpin2D);
    measured.fD2HPin  = this->getMemSpeed(COPY_MODE_D2Hpin);
    measured.fD2D     = this->getMemSpeed(COPY_MODE_D2D);
    return measured;
}
Main.cpp
void TestFunc_PerfMon(){
TestFunc_Launchlatency();
printf(" TimeStamp Compute(Gflops) fH2D fD2H fH2DPin fD2HPin fD2D(GiB/s) \r\n");
for (int i=0;i<10000;++i){
std::string strtime = LabBase::getTime_Str();
memInfo info = TestFunc_BandWidth();
float Gflops = TestFunc_CalcPerformance();
printf("%s %10f %10f %10f %10f %10f %10f \r\n",strtime.c_str(),Gflops,info.fH2DPage,info.fD2HPage,info.fH2DPin,info.fD2HPin,info.fD2D);
Sleep(1000);
}
}