OpenCV2.x-OpenCL的郁闷体验

之前体验了下OpenCV3.x的OpenCL,体验简直糟糕(详见《OpenCV3.x-OpenCL的糟糕体验》)!于是今天打算用OpenCV2.x的OpenCL做一下GPU加速实验,因为2.x和3.x的编码实现不一样。

实验环境:Windows10 + OpenCV249 + AMD Radeon R5 M430 

实验算法:OpenCV的灰度模板匹配。

为什么说是郁闷的体验呢?因为我看到了GPU的加速效果,但是出现了一个无语的问题(如下所述),所以很郁闷。具体实验过程如下:

测试CPU和GPU整个匹配过程的时间:

#include <iostream>
#include <stdio.h>
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/ocl/ocl.hpp"
#include "opencv2/nonfree/ocl.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/nonfree/nonfree.hpp"

// Pull in both the core OpenCV namespace and its OpenCL (ocl) module namespace.
// NOTE(review): functions that exist in both namespaces (cvtColor, matchTemplate)
// must be called with an explicit cv:: or ocl:: qualifier, as the code below does.
using namespace cv;
using namespace cv::ocl;

// Input files: the image to search in and the template to search for.
#define SRC_IMG "1.bmp"
#define TMP_IMG "tmp.png"
// NOTE(review): RUN_TIME is not referenced anywhere in the code shown here.
#define RUN_TIME	1

// Shared stopwatch state: _work_begin_() stores the start tick in work_begin,
// _work_end_() stores the elapsed tick count in work_end.
int64 work_begin = 0;
int64 work_end = 0;

// Stopwatch helpers and the two benchmark routines, defined after main().
static void _work_begin_();
static double _work_end_(std::string msg);
void runMatchGrayUseCpu(int method);
void runMatchGrayUseGpu(int method);

int main(int argc, char* argv[])
{
	std::cout
	<< "Device name:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceName
	<< std::endl
	<< "Device Vendor:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceVendor
	<< std::endl
	<< "Device Profile:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceProfile
	<< std::endl << std::endl;

	int method = CV_TM_SQDIFF;

	std::cout << "===Test Match Template Use CPU===" << std::endl;
	_work_begin_();
	runMatchGrayUseCpu(method);
	_work_end_("CPU: test matchTemplate time");

	std::cout << "===Test Match Template Use GPU===" << std::endl;
	_work_begin_();
	runMatchGrayUseGpu(method);
	_work_end_("GPU: test matchTemplate time");
	
	waitKey();
	return 0;
}

void runMatchGrayUseCpu(int method){

	cv::Mat src = cv::imread(SRC_IMG, 1);
	cv::Mat tmp = cv::imread(TMP_IMG, 1);

	cv::Mat gray_src, gray_tmp;
	if (src.channels() == 1) gray_src = src;
	else cv::cvtColor(src, gray_src, CV_RGB2GRAY);
	if (tmp.channels() == 1) gray_tmp = tmp;
	else cv::cvtColor(tmp, gray_tmp, CV_RGB2GRAY);

	int result_cols = gray_src.cols - gray_tmp.cols + 1;
	int result_rows = gray_src.rows - gray_tmp.rows + 1;
	
	cv::Mat result = cv::Mat(result_cols, result_rows, CV_32FC1);

	cv::matchTemplate(gray_src, gray_tmp, result, method);

	cv::Point point;
	double minVal, maxVal;
	cv::Point minLoc, maxLoc;

	cv::minMaxLoc(result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());

	switch (method){

	case CV_TM_SQDIFF:
		point = minLoc;
		break;
	case CV_TM_SQDIFF_NORMED:
		point = minLoc;
		break;
	case CV_TM_CCORR:
	case CV_TM_CCOEFF:
		point = maxLoc;
		break;
	case CV_TM_CCORR_NORMED:
	case CV_TM_CCOEFF_NORMED:
	default:
		point = maxLoc;
		break;
	}
	std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}

void runMatchGrayUseGpu(int method){

	cv::Mat src = cv::imread(SRC_IMG, 1);
	cv::Mat tmp = cv::imread(TMP_IMG, 1);
	oclMat ocl_src, ocl_tmp;
	oclMat gray_src, gray_tmp;

	ocl_src.upload(src);
	ocl_tmp.upload(tmp);
	
	if (src.channels() == 1) gray_src = src;
	else ocl::cvtColor(ocl_src, gray_src, CV_RGB2GRAY);
	if (tmp.channels() == 1) gray_tmp = tmp;
	else ocl::cvtColor(ocl_tmp, gray_tmp, CV_RGB2GRAY);
	int result_cols = gray_src.cols - gray_tmp.cols + 1;
	int result_rows = gray_src.rows - gray_tmp.rows + 1;
	oclMat ocl_result = oclMat(result_cols, result_rows, CV_32FC1);

	ocl::matchTemplate(gray_src, gray_tmp, ocl_result, method);

	cv::Point point;
	double minVal, maxVal;
	cv::Point minLoc, maxLoc;
	cv::Mat mat_result;
	ocl_result.download(mat_result);

	cv::minMaxLoc(mat_result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());

	switch (method){

	case CV_TM_SQDIFF:
		point = minLoc;
		break;
	case CV_TM_SQDIFF_NORMED:
		point = minLoc;
		break;
	case CV_TM_CCORR:
	case CV_TM_CCOEFF:
		point = maxLoc;
		break;
	case CV_TM_CCORR_NORMED:
	case CV_TM_CCOEFF_NORMED:
	default:
		point = maxLoc;
		break;
	}
	std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}

static void _work_begin_(){

	work_begin = getTickCount();
}

static double _work_end_(std::string msg){

	work_end = getTickCount() - work_begin;
	double t = work_end / ((double)cvGetTickFrequency() * 1000.);
	std::cout << msg << "(ms) :" << t << std::endl << std::endl;
	return t;
}
实验结果:

可以看到整个过程CPU用了315毫秒,GPU用了4071毫秒。这数据显然不对(我猜想可能大部分时间用于CPU和GPU之间的数据拷贝了吧)!所以我单独卡每个算法API的执行时间,代码如下:

#include <iostream>
#include <stdio.h>
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/ocl/ocl.hpp"
#include "opencv2/nonfree/ocl.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/nonfree/nonfree.hpp"

// Pull in both the core OpenCV namespace and its OpenCL (ocl) module namespace.
// NOTE(review): functions that exist in both namespaces (cvtColor, matchTemplate)
// must be called with an explicit cv:: or ocl:: qualifier, as the code below does.
using namespace cv;
using namespace cv::ocl;

// Input files: the image to search in and the template to search for.
#define SRC_IMG "1.bmp"
#define TMP_IMG "tmp.png"
// NOTE(review): RUN_TIME is not referenced anywhere in the code shown here.
#define RUN_TIME	1

// Shared stopwatch state: _work_begin_() stores the start tick in work_begin,
// _work_end_() stores the elapsed tick count in work_end.
int64 work_begin = 0;
int64 work_end = 0;

// Stopwatch helpers and the two benchmark routines, defined after main().
static void _work_begin_();
static double _work_end_(std::string msg);
void runMatchGrayUseCpu(int method);
void runMatchGrayUseGpu(int method);

int main(int argc, char* argv[])
{
	std::cout
	<< "Device name:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceName
	<< std::endl
	<< "Device Vendor:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceVendor
	<< std::endl
	<< "Device Profile:"
	<< cv::ocl::Context::getContext()->getDeviceInfo().deviceProfile
	<< std::endl << std::endl;

	int method = CV_TM_SQDIFF;

	std::cout << "===Test Match Template Use CPU===" << std::endl;
	runMatchGrayUseCpu(method);

	std::cout << std::endl << "===Test Match Template Use GPU===" << std::endl;
	runMatchGrayUseGpu(method);
	
	waitKey();
	return 0;
}

void runMatchGrayUseCpu(int method){

	cv::Mat src = cv::imread(SRC_IMG, 1);
	cv::Mat tmp = cv::imread(TMP_IMG, 1);

	cv::Mat gray_src, gray_tmp;
	if (src.channels() == 1) gray_src = src;
	else cv::cvtColor(src, gray_src, CV_RGB2GRAY);
	if (tmp.channels() == 1) gray_tmp = tmp;
	else cv::cvtColor(tmp, gray_tmp, CV_RGB2GRAY);

	int result_cols = gray_src.cols - gray_tmp.cols + 1;
	int result_rows = gray_src.rows - gray_tmp.rows + 1;
	
	cv::Mat result = cv::Mat(result_cols, result_rows, CV_32FC1);


	_work_begin_();
	cv::matchTemplate(gray_src, gray_tmp, result, method);
	_work_end_("CPU: matchTemplate");

	cv::Point point;
	double minVal, maxVal;
	cv::Point minLoc, maxLoc;

	_work_begin_();
	cv::minMaxLoc(result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
	_work_end_("CPU: minMaxLoc");

	switch (method){

	case CV_TM_SQDIFF:
		point = minLoc;
		break;
	case CV_TM_SQDIFF_NORMED:
		point = minLoc;
		break;
	case CV_TM_CCORR:
	case CV_TM_CCOEFF:
		point = maxLoc;
		break;
	case CV_TM_CCORR_NORMED:
	case CV_TM_CCOEFF_NORMED:
	default:
		point = maxLoc;
		break;
	}
	

	std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}

void runMatchGrayUseGpu(int method){

	cv::Mat src = cv::imread(SRC_IMG, 1);
	cv::Mat tmp = cv::imread(TMP_IMG, 1);
	oclMat ocl_src, ocl_tmp;
	oclMat gray_src, gray_tmp;

	_work_begin_();
	ocl_src.upload(src);
	ocl_tmp.upload(tmp);
	_work_end_("GPU: Upload image from host to device");
	

	if (src.channels() == 1) gray_src = src;
	else ocl::cvtColor(ocl_src, gray_src, CV_RGB2GRAY);
	if (tmp.channels() == 1) gray_tmp = tmp;
	else ocl::cvtColor(ocl_tmp, gray_tmp, CV_RGB2GRAY);
	int result_cols = gray_src.cols - gray_tmp.cols + 1;
	int result_rows = gray_src.rows - gray_tmp.rows + 1;
	oclMat ocl_result = oclMat(result_cols, result_rows, CV_32FC1);

	_work_begin_();
	ocl::matchTemplate(gray_src, gray_tmp, ocl_result, method);
	_work_end_("GPU: matchTemplate");

	cv::Point point;
	double minVal, maxVal;
	cv::Point minLoc, maxLoc;

	cv::Mat mat_result;
	_work_begin_();
	ocl_result.download(mat_result);
	_work_end_("GPU: Download image from device to host");


	_work_begin_();
	cv::minMaxLoc(mat_result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
	_work_end_("GPU: minMaxLoc");

	switch (method){

	case CV_TM_SQDIFF:
		point = minLoc;
		break;
	case CV_TM_SQDIFF_NORMED:
		point = minLoc;
		break;
	case CV_TM_CCORR:
	case CV_TM_CCOEFF:
		point = maxLoc;
		break;
	case CV_TM_CCORR_NORMED:
	case CV_TM_CCOEFF_NORMED:
	default:
		point = maxLoc;
		break;
	}
	std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}

static void _work_begin_(){

	work_begin = getTickCount();
}

static double _work_end_(std::string msg){

	work_end = getTickCount() - work_begin;
	double t = work_end / ((double)cvGetTickFrequency() * 1000.);
	std::cout << msg << "(ms) :" << t << std::endl;
	return t;
}
实验结果:

可以看到单独在模板匹配的这个环节GPU比CPU快了近50倍,符合官方给出的数据,但是在从GPU下载数据到CPU的这个过程用时也太长了吧!简直不科学!官方给出的模板匹配数据如下:


于是我又写了一小段代码测试下CPU和GPU之间传输数据是不是真的这么的慢,代码如下:

cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat dst;
oclMat ocl_src;
_work_begin_();
ocl_src.upload(src);
ocl_src.download(dst);
_work_end_("GPU: Upload & download CPU<->GPU speed Test");
实验结果:

我靠,太TM不科学了吧!图片大小和模板匹配实验使用的是一样的,但是这次的upload和download的总时间才53毫秒。为何之前那次download要接近4秒?这是个神马情况???神呐,谁能告诉我这是为虾米啊······
以上给出的都是完整的代码,最近研究OpenCV-OCL的兄弟姐妹们可以copy运行一下看是不是一样的结果。另外,对于上面的实验结果有什么想法和疑问,请一定要指教指教···。

目前的显卡是A卡的,下次换个N卡的机器试一下OCL吧!然后再在N卡上跑下CUDA看看什么情况!

总结:

1、可以看到单单对于模板匹配的这个过程,opencl是有加速效果的,而且加速效果还挺明显的!

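2、为什么CPU和GPU之间传输数据时,差别这么大?(如上实验,其中一次用了近4秒,一次才几十毫秒?)一个很可能的原因是:OpenCL的kernel是异步提交的,ocl::matchTemplate在计算尚未完成时就已经返回,而随后的download必须等kernel执行完毕才能拷贝数据,所以那近4秒多半是模板匹配计算本身的耗时,而不是数据传输的耗时;在停止计时之前调用cv::ocl::finish()即可验证这一点。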

3、在大量重复计算的时候用OCL加速比较合适,少量的单次计算还不如CPU来的快!





  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值