GPU vs. CPU Computation Time Tests

This post compares the time a GPU and a CPU take to perform some computation-heavy tasks.

Experimental Tasks

1. Vector Addition

Add two vectors a and b of the same dimension, and measure the GPU parallel time (including data-copy time) and the CPU serial time.

2. Bilateral Filtering

Brief introduction:

The bilateral filter is a nonlinear filter that combines spatial proximity with pixel-value similarity: by weighing spatial information and intensity similarity together, it removes noise while preserving edges. It is simple, non-iterative, and local.
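
For reference, a standard way to write the bilateral filter (the textbook formulation, independent of the code below): the output at pixel \(p\) is a normalized weighted average over a window \(S\), where \(G_{\sigma_s}\) weights spatial distance and \(G_{\sigma_r}\) weights intensity difference,

\[
BF[I]_p = \frac{1}{W_p}\sum_{q \in S} G_{\sigma_s}(\lVert p-q \rVert)\, G_{\sigma_r}(\lvert I_p - I_q \rvert)\, I_q,
\qquad
W_p = \sum_{q \in S} G_{\sigma_s}(\lVert p-q \rVert)\, G_{\sigma_r}(\lvert I_p - I_q \rvert).
\]

The two Gaussians correspond to sigmaSpace and sigmaColor in the experiments below.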

Bilateral filtering is computationally expensive, and its nonlinear component complicates optimization: run sequentially on a central processing unit (CPU) it takes a long time, and although a wide range of fast approximations of the bilateral filter exist, optimizing the exact algorithm remains difficult.

Task description:

With \(sigmaColor = 15.0, sigmaSpace = 15.0\), measure the efficiency on an \(800 \times 1068\) image as the Gaussian kernel diameter \(d\) varies.

Sample image:

Young_girl_smiling_in_sunshine_(2)

Experimental Environment

  • Ubuntu 18.04
  • Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz
  • RTX3090

PS: I had previously tested vector addition on my local machine, but since I'm suddenly home for the holidays, both tasks were re-run here in a single, consistent environment.

Experimental Results

A typical CUDA program executes in the following stages:

(figure: execution flow of a typical CUDA program)

Timing a CUDA program can therefore be split into three parts: copying the data from host memory to device memory, running the computation, and copying the result from device memory back to host memory.

For the CPU, only the computation itself is timed.
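
As a minimal, self-contained sketch of this three-part timing scheme (separate from the test programs listed under "Experimental Code" below; the square kernel, the vector size, and the launch configuration here are placeholders), CUDA events can bracket each phase:

    #include <cuda_runtime.h>
    #include <stdio.h>

    // Placeholder kernel: squares each element in place.
    __global__ void square(float *x, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] = x[i] * x[i];
    }

    int main()
    {
      const int n = 1 << 20;
      size_t bytes = n * sizeof(float);
      float *h = (float *)malloc(bytes);
      for (int i = 0; i < n; i++) h[i] = 1.0f;
      float *d;
      cudaMalloc(&d, bytes);

      cudaEvent_t t0, t1, t2, t3;
      cudaEventCreate(&t0); cudaEventCreate(&t1);
      cudaEventCreate(&t2); cudaEventCreate(&t3);

      cudaEventRecord(t0);                              // before host-to-device copy
      cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
      cudaEventRecord(t1);                              // before kernel launch
      square<<<(n + 255) / 256, 256>>>(d, n);
      cudaEventRecord(t2);                              // before device-to-host copy
      cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);
      cudaEventRecord(t3);
      cudaEventSynchronize(t3);                         // wait until all three phases finish

      float h2d, kern, d2h;
      cudaEventElapsedTime(&h2d, t0, t1);
      cudaEventElapsedTime(&kern, t1, t2);
      cudaEventElapsedTime(&d2h, t2, t3);
      printf("H2D %.3f ms, kernel %.3f ms, D2H %.3f ms\n", h2d, kern, d2h);

      cudaFree(d);
      free(h);
      return 0;
    }

Recording an event after the kernel and synchronizing on the last event ensures the asynchronous launch has actually finished before the elapsed times are read.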

1. Vector Addition

| Dimension of vectors a, b | 262144 \((512 \times 512)\) | 1048576 \((1024 \times 1024)\) | 4194304 \((2048 \times 2048)\) |
| --- | --- | --- | --- |
| Result screenshot | (screenshot) | (screenshot) | (screenshot) |
| Host-to-device copy time | 0.000631 sec | 0.001988 sec | 0.008227 sec |
| GPU compute time | 0.000016 sec | 0.000013 sec | 0.000016 sec |
| Device-to-host copy time | 0.000415 sec | 0.001169 sec | 0.002168 sec |
| Total GPU time (sum of the above) | 0.001062 sec | 0.003170 sec | 0.010411 sec |
| CPU compute time | 0.001103 sec | 0.003196 sec | 0.017829 sec |

Analysis:

  1. Comparing GPU and CPU compute time: as the data dimension grows, the GPU parallel compute time does not increase noticeably, while the CPU compute time keeps growing.
  2. As the data dimension grows, the time spent copying data between host memory and device memory (in both directions) also grows (a rough bandwidth estimate follows below).
  3. For a task as simple as vector addition, the GPU shows no clear advantage overall.
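
As a rough sanity check on point 2 (using only the measured numbers above, so this is an estimate rather than a new measurement): in the \(2048 \times 2048\) case, the host-to-device copy moves two vectors of 4194304 floats in 0.008227 s,

\[
\frac{2 \times 4194304 \times 4\ \text{bytes}}{0.008227\ \text{s}} \approx 4.1\ \text{GB/s},
\]

which is in the range one would expect for pageable host memory over PCIe. In other words, the transfers, not the kernel, dominate the total GPU time for this task.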

2. Bilateral Filtering

| | d = 2 | d = 4 | d = 8 | d = 64 |
| --- | --- | --- | --- | --- |
| Result screenshot | (screenshot) | (screenshot) | (screenshot) | (screenshot) |
| Total data-copy time | 1.532352 ms | 1.566624 ms | 1.563072 ms | 1.494659 ms |
| GPU compute time | 0.287392 ms | 0.697184 ms | 2.123936 ms | 107.015038 ms |
| Total GPU time | 1.819744 ms | 2.263808 ms | 3.687008 ms | 108.509697 ms |
| CPU compute time | 376.712 ms | 365.958 ms | 504.757 ms | 7407.72 ms |

Clearly, the GPU is still way ahead here.
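
The growth of the GPU compute time is also consistent with the window size (a back-of-the-envelope check based only on the table above): with radius \(r = d/2\), each pixel processes a \((2r+1) \times (2r+1) = (d+1) \times (d+1)\) window, so going from d = 8 to d = 64 multiplies the per-pixel work by

\[
\frac{(64+1)^2}{(8+1)^2} = \frac{4225}{81} \approx 52,
\]

which matches the roughly 50x jump in GPU compute time (2.12 ms to 107 ms).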

Here are the d = 8 results, which produce a skin-smoothing effect:

| Original | CPU | GPU |
| --- | --- | --- |
| (image: space) | (image: spaceCPU) | (image: spaceGPU) |

Experimental Code

1. Vector Addition

  • sum.cu

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <time.h>
    #include "freshman.h"
    
    
    // CPU addition (serial reference)
    void sumArrays(float *a, float *b, float *res, const int size)
    {
      for (int i = 0; i < size; i += 1)
      {
        res[i] = a[i] + b[i];
      }
    }
    
    // GPU addition: one thread per element
    __global__ void sumArraysGPU(float *a, float *b, float *res, int N)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < N)
        res[i] = a[i] + b[i];
    }
    
    
    int main(int argc, char **argv)
    {
      // set up device
      initDevice(0);
    
      int nElem = 512*512;
      //int nElem = 1024*1024;
      //int nElem = 2048*2048;
    
    
      printf("Vector size:%d\n", nElem);
    
      // Allocate host memory
      int nByte = sizeof(float) * nElem;
      float *a_h = (float *)malloc(nByte);
      float *b_h = (float *)malloc(nByte);
      float *res_h = (float *)malloc(nByte);
      float *res_from_gpu_h = (float *)malloc(nByte);
      memset(res_h, 0, nByte);
      memset(res_from_gpu_h, 0, nByte);
    
      // Randomly initialize the host data
      initialData(a_h, nElem);
      initialData(b_h, nElem);
    
      // Allocate device memory
      float *a_d, *b_d, *res_d;
      CHECK(cudaMalloc((float **)&a_d, nByte));
      CHECK(cudaMalloc((float **)&b_d, nByte));
      CHECK(cudaMalloc((float **)&res_d, nByte));
    
    
      // Time the host-to-device copy
      clock_t start, end;
      start = clock();
      CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
      CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
      end = clock();
    
      double copytime = (double)(end - start) / CLOCKS_PER_SEC;
      printf("Host-to-device copy time\t %f sec\n", copytime);
    
    
    
      dim3 block(512);
      dim3 grid((nElem - 1) / block.x + 1);
    
      // Time the GPU kernel
      double iStart, iElaps;
      iStart = cpuSecond();
      sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
      // The launch is asynchronous: wait for the kernel to finish so the
      // measured time covers execution, not just the launch overhead.
      CHECK(cudaDeviceSynchronize());
      iElaps = cpuSecond() - iStart;

      double GPU_Cla = iElaps;
      printf("GPU compute time\t\t %f sec\n", GPU_Cla);
    
    
      // Time the device-to-host copy of the result
      start = clock();
      CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
      end = clock();
      double copytime2 = (double)(end - start) / CLOCKS_PER_SEC;
      printf("Device-to-host copy time\t %f sec\n", copytime2);
    
      printf("Total GPU time\t\t\t %f sec\n", copytime2 + GPU_Cla + copytime);
    
      // Time the CPU addition
      start = clock();
      sumArrays(a_h,b_h,res_h,nElem);
      end = clock();
      printf("CPU compute time\t\t %f sec\n", (double)(end - start) / CLOCKS_PER_SEC);
    
    
    
      checkResult(res_h, res_from_gpu_h, nElem);
      cudaFree(a_d);
      cudaFree(b_d);
      cudaFree(res_d);
    
      free(a_h);
      free(b_h);
      free(res_h);
      free(res_from_gpu_h);
    
      return 0;
    }
  • freshman.h

    #ifndef FRESHMAN_H
    #define FRESHMAN_H
    #define CHECK(call)\
    {\
      const cudaError_t error=call;\
      if(error!=cudaSuccess)\
      {\
          printf("ERROR: %s:%d,",__FILE__,__LINE__);\
          printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
          exit(1);\
      }\
    }
    
    
    #include <time.h>
    #ifdef _WIN32
    #	include <windows.h>
    #else
    #	include <sys/time.h>
    #endif
    #ifdef _WIN32
    int gettimeofday(struct timeval *tp, void *tzp)
    {
      time_t clock;
      struct tm tm;
      SYSTEMTIME wtm;
      GetLocalTime(&wtm);
      tm.tm_year   = wtm.wYear - 1900;
      tm.tm_mon   = wtm.wMonth - 1;
      tm.tm_mday   = wtm.wDay;
      tm.tm_hour   = wtm.wHour;
      tm.tm_min   = wtm.wMinute;
      tm.tm_sec   = wtm.wSecond;
      tm.tm_isdst  = -1;
      clock = mktime(&tm);
      tp->tv_sec = clock;
      tp->tv_usec = wtm.wMilliseconds * 1000;
      return (0);
    }
    #endif
    double cpuSecond()
    {
      struct timeval tp;
      gettimeofday(&tp,NULL);
      return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
    
    }
    void initialData(float* ip,int size)
    {
      time_t t;
      srand((unsigned )time(&t));
      for(int i=0;i<size;i++)
      {
        ip[i]=(float)(rand()&0xffff)/1000.0f;
      }
    }
    void initialData_int(int* ip, int size)
    {
    	time_t t;
    	srand((unsigned)time(&t));
    	for (int i = 0; i<size; i++)
    	{
    		ip[i] = int(rand()&0xff);
    	}
    }
    void printMatrix(float * C,const int nx,const int ny)
    {
      float *ic=C;
      printf("Matrix<%d,%d>:",ny,nx);
      for(int i=0;i<ny;i++)
      {
        for(int j=0;j<nx;j++)
        {
          printf("%6f ", ic[j]);
        }
        ic+=nx;
        printf("\n");
      }
    }
    
    void initDevice(int devNum)
    {
      int dev = devNum;
      cudaDeviceProp deviceProp;
      CHECK(cudaGetDeviceProperties(&deviceProp,dev));
      printf("Using device %d: %s\n",dev,deviceProp.name);
      CHECK(cudaSetDevice(dev));
    
    }
    void checkResult(float * hostRef,float * gpuRef,const int N)
    {
      double epsilon=1.0E-8;
      for(int i=0;i<N;i++)
      {
        if(abs(hostRef[i]-gpuRef[i])>epsilon)
        {
          printf("Results don\'t match!\n");
          printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
          return;
        }
      }
      printf("Check result success!\n");
    }
    #endif//FRESHMAN_H

2. Bilateral Filtering\(^{[1]}\)

  • kernel.cu

    #include <iostream>
    #include <algorithm>
    #include <ctime>
    #include <opencv2/opencv.hpp>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <cuda.h>
    #include <device_functions.h>
    #define M_PI           3.14159265358979323846
    
    
    using namespace std;
    using namespace cv;
    
    //1D Gaussian spatial weights in constant memory
    //(needs room for 2*r+1 entries; 128 covers the largest radius tested, r = 32)
    __constant__ float cGaussian[128];
    //Texture reference for the input image, declared as a global variable
    texture<unsigned char, 2, cudaReadModeElementType> inTexture;


    //Precompute the 1D Gaussian spatial weights; the 2D weight is the product of two 1D weights
    void updateGaussian(int r, double sd)
    {
    	float fGaussian[128];
    	for (int i = 0; i < 2 * r + 1; i++)
    	{
    		float x = i - r;
    		fGaussian[i] = 1 / (sqrt(2 * M_PI) * sd) * expf(-(x * x) / (2 * sd * sd));
    	}
    	cudaMemcpyToSymbol(cGaussian, fGaussian, sizeof(float) * (2 * r + 1));
    }
    
    // 1D Gaussian, used for the intensity (range) weight
    __device__ inline double gaussian(float x, double sigma)
    {
    	return 1 / (sqrt(2 * M_PI) * sigma) * __expf(-(powf(x, 2)) / (2 * powf(sigma, 2)));
    }
    
    __global__ void gpuCalculation(unsigned char* input, unsigned char* output, int width ,int height, int r,double sigmaColor)
    {
    	int txIndex = blockIdx.x * blockDim.x + threadIdx.x;
    	int tyIndex = blockIdx.y * blockDim.y + threadIdx.y;
    
    	if ((txIndex < width) && (tyIndex < height))
    	{
    		double iFiltered = 0;
    		double k = 0;
    		//Texture fetch: the center pixel of the window
    		unsigned char centrePx = tex2D(inTexture, txIndex, tyIndex);
    		//Convolve over the (2r+1) x (2r+1) window
    		for (int dy = -r; dy <= r; dy++) {
    			for (int dx = -r; dx <= r; dx++) {
    				//Another pixel inside the kernel window
    				unsigned char currPx = tex2D(inTexture, txIndex + dx, tyIndex + dy);
    				// Weight = 1D Gaussian(x_axis) * 1D Gaussian(y_axis) * Gaussian(Color difference)
    				double w = (cGaussian[dy + r] * cGaussian[dx + r]) * gaussian(centrePx - currPx, sigmaColor);
    				iFiltered += w * currPx;
    				k += w;
    			}
    		}
    		output[tyIndex * width + txIndex] = iFiltered / k;
    	}
    }
    
    void MyBilateralFilter(const Mat& input, Mat& output, int r, double sigmaColor, double sigmaSpace)
    {
    	//CUDA events for timing
    	cudaEvent_t start, stop, cal_start, cal_stop;
    	float time_copy, total_time;
    	cudaEventCreate(&start);
    	cudaEventCreate(&stop);
    
    	cudaEventCreate(&cal_start);
    	cudaEventCreate(&cal_stop);
    
    	cudaEventRecord(start, 0);
    
    	//Image size in bytes
    	int gray_size = input.step * input.rows;
    
    	//Device buffers for the input and output images
    	unsigned char* d_input = NULL;
    	unsigned char* d_output;
    
    	updateGaussian(r, sigmaSpace);
    
    	//Allocate device memory for the output
    	cudaMalloc<unsigned char>(&d_output, gray_size);
    
    	//Texture binding
    	size_t pitch;
    	cudaMallocPitch(&d_input, &pitch, sizeof(unsigned char) * input.step, input.rows);
    	cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
    	cudaMemcpy2D(d_input, pitch, input.ptr(), sizeof(unsigned char) * input.step, sizeof(unsigned char) * input.step, input.rows, cudaMemcpyHostToDevice);
    	//Bind the texture reference to the pitched device memory
    	cudaBindTexture2D(0, inTexture, d_input, desc, input.step, input.rows, pitch);
    	
    	dim3 block(16, 16);
    	dim3 grid((input.cols + block.x - 1) / block.x, (input.rows + block.y - 1) / block.y);
    
    	cudaEventRecord(cal_start, 0);
    	gpuCalculation <<< grid, block >>> (d_input, d_output, input.cols, input.rows, r, sigmaColor);
    	cudaEventRecord(cal_stop, 0);
    	cudaEventSynchronize(cal_stop);
    
    	cudaEventElapsedTime(&time_copy, cal_start, cal_stop);
    	printf("Cal Time for the GPU: %f ms\n", time_copy);
    
    	//Copy the result from device to host
    	cudaMemcpy(output.ptr(), d_output, gray_size, cudaMemcpyDeviceToHost);
    
    
    	cudaEventRecord(stop, 0);
    	cudaEventSynchronize(stop);
    
    	//Free the device memory
    	cudaFree(d_input);
    	cudaFree(d_output);
    
    	// Compute and print the total GPU time (from the first event to the last)
    	cudaEventElapsedTime(&total_time, start, stop);
    	printf("Copy Time for the GPU: %f ms\n", total_time - time_copy);
    	printf("Total Time for the GPU: %f ms\n", total_time);
    	
    
    }
  • main.cpp

    #include <opencv2/opencv.hpp>
    #include <iostream>
    
    using namespace std;
    using namespace cv;
    
    void MyBilateralFilter(const Mat& input, Mat& output, int r, double sI, double sS);
    
    
    int main() {
    
    	//Gaussian kernel diameter
    	int d = 64;
    	double sigmaColor = 15.0, sigmaSpace = 15.0;
    	//Load the source image as a grayscale image
    	Mat srcImg = imread("1.jpg", IMREAD_GRAYSCALE);
    	//Allocate host memory for the GPU result
    	Mat dstImg(srcImg.rows, srcImg.cols, CV_8UC1);
    	Mat dstImgCV;
    
    	//Run and time the GPU implementation (radius = d/2)
    	MyBilateralFilter(srcImg, dstImg, d/2, sigmaColor, sigmaSpace);
    
    	//Time OpenCV's bilateralFilter on the CPU
    	clock_t start_s = clock();
    	bilateralFilter(srcImg, dstImgCV, d, sigmaColor, sigmaSpace);
    	clock_t stop_s = clock();
    	cout << "Time for the CPU: " << (stop_s - start_s) / double(CLOCKS_PER_SEC) * 1000 << " ms" << endl;
    	//Display and save the images (distinct file names so they do not overwrite each other)
    	imshow("Original", srcImg);
    	imwrite("space.jpg", srcImg);
    	imshow("GPU bilateral filter", dstImg);
    	imwrite("spaceGPU.jpg", dstImg);
    	imshow("CPU bilateral filter", dstImgCV);
    	imwrite("spaceCPU.jpg", dstImgCV);
    	cv::waitKey();
    }

Further Discussion (Bilateral Filtering)

  1. How does the grid/block configuration affect speed?

    Answering this requires some background, so I recently read up on GPU architecture to get familiar with the relevant concepts.

    Since an SM's basic execution unit is a warp of 32 threads, the block size should generally be set to a multiple of 32.

    So for d = 8 and d = 32, I tested the following five block/grid configurations:

    (screenshots: Cal Time for the GPU under five block/grid configurations, for d = 8 and d = 32)

    Only the first line of each screenshot, Cal Time for the GPU, matters here.

    Looking at this task as a whole on the RTX 3090, for an image of fixed resolution the behavior changes at dim3 block(64, 64). I looked it up: the 3090 has 82 SMs.

After thinking it over, I still can't fully answer this question; it needs further study. (A small sketch for querying the relevant hardware limits is given below.)
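
These hardware limits can also be queried at runtime instead of looked up. Below is a small standalone sketch (not part of the experiment code above) that prints the properties relevant to this question via cudaGetDeviceProperties. One observation it makes easy: dim3 block(64, 64) means 64 x 64 = 4096 threads per block, which exceeds maxThreadsPerBlock (1024 on current GPUs, including the RTX 3090), so that configuration presumably fails to launch rather than merely running slower; checking the launch with cudaGetLastError would confirm this.

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main()
    {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, 0);                    // properties of device 0
      printf("Device               : %s\n", prop.name);
      printf("SM count             : %d\n", prop.multiProcessorCount);
      printf("Warp size            : %d\n", prop.warpSize);
      printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
      printf("Max threads per SM   : %d\n", prop.maxThreadsPerMultiProcessor);
      printf("Max block dimensions : %d x %d x %d\n",
             prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
      return 0;
    }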

  2. On the use of texture memory

    From what I've read online, the gist is that texture memory has some special features (for example, coordinate normalization) and that accessing the 2D neighborhood of a matrix through it is accelerated.

But this neighborhood-access benefit should mostly show up when interpolation is involved. Looking back at the author's code, the coordinates passed to tex2D are all integers, so no interpolation actually takes place (every fetch lands exactly on a pixel). Changing the kernel to read directly from input in global memory should therefore perform about the same, and sure enough it does:

| | texture memory | global memory |
| --- | --- | --- |
| d = 8 | (screenshot) | (screenshot) |
| d = 32 | (screenshot) | (screenshot) |

Related code:

kernel_global_input.cu

#include <iostream>
#include <algorithm>
#include <ctime>
#include <opencv2/opencv.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#define M_PI  3.14159265358979323846


using namespace std;
using namespace cv;

//1D Gaussian spatial weights in constant memory (needs room for 2*r+1 entries)
__constant__ float cGaussian[128];
//Texture reference kept from the original version (unused here: the input is read from global memory)
texture<unsigned char, 2, cudaReadModeElementType> inTexture;


//Precompute the 1D Gaussian spatial weights; the 2D weight is the product of two 1D weights
void updateGaussian(int r, double sd)
{
	float fGaussian[128];
	for (int i = 0; i < 2 * r + 1; i++)
	{
		float x = i - r;
		fGaussian[i] = 1 / (sqrt(2 * M_PI) * sd) * expf(-(x * x) / (2 * sd * sd));
	}
	cudaMemcpyToSymbol(cGaussian, fGaussian, sizeof(float) * (2 * r + 1));
}

// 1D Gaussian, used for the intensity (range) weight
__device__ inline double gaussian(float x, double sigma)
{
	return 1 / (sqrt(2 * M_PI) * sigma) * __expf(-(powf(x, 2)) / (2 * powf(sigma, 2)));
}


__global__ void gpuCalculation(unsigned char* input, unsigned char* output, int width ,int height, int r,double sigmaColor)
{
	int txIndex = blockIdx.x * blockDim.x + threadIdx.x;
	int tyIndex = blockIdx.y * blockDim.y + threadIdx.y;

	if ((txIndex < width) && (tyIndex < height))
	{
		double iFiltered = 0;
		double k = 0;
		unsigned char centrePx = input[tyIndex * width + txIndex];
		//Convolve over the (2r+1) x (2r+1) window
		for (int dy = -r; dy <= r; dy++) {
			for (int dx = -r; dx <= r; dx++) {
				if (txIndex + dx >= 0 && tyIndex + dy >= 0 && txIndex + dx < width && tyIndex + dy < height)
				{
					//Another pixel inside the kernel window
					unsigned char currPx = input[(tyIndex+dy) * width + txIndex+dx];
					// Weight = 1D Gaussian(x_axis) * 1D Gaussian(y_axis) * Gaussian(Color difference)
					double w = (cGaussian[dy + r] * cGaussian[dx + r]) * gaussian(centrePx - currPx, sigmaColor);
					iFiltered += w * currPx;
					k += w;
				}
				
			}
		}
		output[tyIndex * width + txIndex] = iFiltered / k;
	}
}

void MyBilateralFilter(const Mat& input, Mat& output, int r, double sigmaColor, double sigmaSpace)
{
	//CUDA events for timing
	cudaEvent_t start, stop, cal_start, cal_stop;
	float time_copy, total_time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventCreate(&cal_start);
	cudaEventCreate(&cal_stop);

	cudaEventRecord(start, 0);

	//Image size in bytes
	int gray_size = input.step * input.rows;

	//Device buffers for the input and output images
	unsigned char* d_input;
	unsigned char* d_output;

	updateGaussian(r, sigmaSpace);

	//Allocate device memory
	cudaMalloc<unsigned char>(&d_input, gray_size);
	cudaMalloc<unsigned char>(&d_output, gray_size);

	// Copy the image into global memory
	cudaMemcpy(d_input, input.ptr(), gray_size,  cudaMemcpyHostToDevice);
	

	dim3 block(16, 16);
	dim3 grid((input.cols + block.x - 1) / block.x, (input.rows + block.y - 1) / block.y);

	cudaEventRecord(cal_start, 0);
	gpuCalculation <<< grid, block >>> (d_input, d_output, input.cols, input.rows, r, sigmaColor);
	cudaEventRecord(cal_stop, 0);
	cudaEventSynchronize(cal_stop);
	cudaEventElapsedTime(&time_copy, cal_start, cal_stop);
	printf("Cal Time for the GPU: %f ms\n", time_copy);

	//Copy the result from device to host
	cudaMemcpy(output.ptr(), d_output, gray_size, cudaMemcpyDeviceToHost);


	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	//Free the device memory
	cudaFree(d_input);
	cudaFree(d_output);

	// Compute and print the total GPU time (from the first event to the last)
	cudaEventElapsedTime(&total_time, start, stop);
	printf("Copy Time for the GPU: %f ms\n", total_time - time_copy);
	printf("Total Time for the GPU: %f ms\n", total_time);
	

}

References

[1] https://github.com/xytroot/Bilateral-Filter
