之前体验了一下OpenCV3.x的OpenCL,体验简直糟糕(见《OpenCV3.x-OpenCL的糟糕体验》)!于是今天打算用OpenCV2.x的OpenCL做一下GPU加速实验,因为2.x和3.x的编码实现不一样。
实验环境:Windows10 + OpenCV249 + AMD Radeon R5 M430
实验算法:OpenCV的灰度模板匹配。
为什么说是郁闷的体验呢?因为我看到了GPU的加速效果,但是出现了一个无语的问题(如下所述),所以很郁闷。具体实验过程如下:
测试CPU和GPU整个匹配过程的时间:
#include <iostream>
#include <stdio.h>
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/ocl/ocl.hpp"
#include "opencv2/nonfree/ocl.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/nonfree/nonfree.hpp"
using namespace cv;
using namespace cv::ocl;
// Path of the source (search) image loaded by both benchmark functions.
#define SRC_IMG "1.bmp"
// Path of the template image that is searched for inside SRC_IMG.
#define TMP_IMG "tmp.png"
// Not referenced anywhere in the visible code.
#define RUN_TIME 1
// Tick count captured by _work_begin_(); start of the interval being timed.
int64 work_begin = 0;
// Elapsed tick count (not an absolute timestamp) written by _work_end_().
int64 work_end = 0;
// Starts the stopwatch.
static void _work_begin_();
// Stops the stopwatch, prints "<msg>(ms) :<elapsed>" and returns the elapsed milliseconds.
static double _work_end_(std::string msg);
// Grayscale template matching on the CPU; prints the best-match location.
void runMatchGrayUseCpu(int method);
// Grayscale template matching through the cv::ocl module; prints the best-match location.
void runMatchGrayUseGpu(int method);
int main(int argc, char* argv[])
{
std::cout
<< "Device name:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceName
<< std::endl
<< "Device Vendor:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceVendor
<< std::endl
<< "Device Profile:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceProfile
<< std::endl << std::endl;
int method = CV_TM_SQDIFF;
std::cout << "===Test Match Template Use CPU===" << std::endl;
_work_begin_();
runMatchGrayUseCpu(method);
_work_end_("CPU: test matchTemplate time");
std::cout << "===Test Match Template Use GPU===" << std::endl;
_work_begin_();
runMatchGrayUseGpu(method);
_work_end_("GPU: test matchTemplate time");
waitKey();
return 0;
}
void runMatchGrayUseCpu(int method){
cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat tmp = cv::imread(TMP_IMG, 1);
cv::Mat gray_src, gray_tmp;
if (src.channels() == 1) gray_src = src;
else cv::cvtColor(src, gray_src, CV_RGB2GRAY);
if (tmp.channels() == 1) gray_tmp = tmp;
else cv::cvtColor(tmp, gray_tmp, CV_RGB2GRAY);
int result_cols = gray_src.cols - gray_tmp.cols + 1;
int result_rows = gray_src.rows - gray_tmp.rows + 1;
cv::Mat result = cv::Mat(result_cols, result_rows, CV_32FC1);
cv::matchTemplate(gray_src, gray_tmp, result, method);
cv::Point point;
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::minMaxLoc(result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
switch (method){
case CV_TM_SQDIFF:
point = minLoc;
break;
case CV_TM_SQDIFF_NORMED:
point = minLoc;
break;
case CV_TM_CCORR:
case CV_TM_CCOEFF:
point = maxLoc;
break;
case CV_TM_CCORR_NORMED:
case CV_TM_CCOEFF_NORMED:
default:
point = maxLoc;
break;
}
std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}
void runMatchGrayUseGpu(int method){
cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat tmp = cv::imread(TMP_IMG, 1);
oclMat ocl_src, ocl_tmp;
oclMat gray_src, gray_tmp;
ocl_src.upload(src);
ocl_tmp.upload(tmp);
if (src.channels() == 1) gray_src = src;
else ocl::cvtColor(ocl_src, gray_src, CV_RGB2GRAY);
if (tmp.channels() == 1) gray_tmp = tmp;
else ocl::cvtColor(ocl_tmp, gray_tmp, CV_RGB2GRAY);
int result_cols = gray_src.cols - gray_tmp.cols + 1;
int result_rows = gray_src.rows - gray_tmp.rows + 1;
oclMat ocl_result = oclMat(result_cols, result_rows, CV_32FC1);
ocl::matchTemplate(gray_src, gray_tmp, ocl_result, method);
cv::Point point;
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::Mat mat_result;
ocl_result.download(mat_result);
cv::minMaxLoc(mat_result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
switch (method){
case CV_TM_SQDIFF:
point = minLoc;
break;
case CV_TM_SQDIFF_NORMED:
point = minLoc;
break;
case CV_TM_CCORR:
case CV_TM_CCOEFF:
point = maxLoc;
break;
case CV_TM_CCORR_NORMED:
case CV_TM_CCOEFF_NORMED:
default:
point = maxLoc;
break;
}
std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}
static void _work_begin_(){
work_begin = getTickCount();
}
static double _work_end_(std::string msg){
work_end = getTickCount() - work_begin;
double t = work_end / ((double)cvGetTickFrequency() * 1000.);
std::cout << msg << "(ms) :" << t << std::endl << std::endl;
return t;
}
实验结果:
可以看到整个过程CPU用了315毫秒,GPU用了4071毫秒。这数据显然不对(我猜想可能大部分时间用于CPU和GPU之间的数据拷贝了吧)!所以我单独卡每个算法API的执行时间,代码如下:
#include <iostream>
#include <stdio.h>
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/ocl/ocl.hpp"
#include "opencv2/nonfree/ocl.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/nonfree/nonfree.hpp"
using namespace cv;
using namespace cv::ocl;
// Path of the source (search) image loaded by both benchmark functions.
#define SRC_IMG "1.bmp"
// Path of the template image that is searched for inside SRC_IMG.
#define TMP_IMG "tmp.png"
// Not referenced anywhere in the visible code.
#define RUN_TIME 1
// Tick count captured by _work_begin_(); start of the interval being timed.
int64 work_begin = 0;
// Elapsed tick count (not an absolute timestamp) written by _work_end_().
int64 work_end = 0;
// Starts the stopwatch.
static void _work_begin_();
// Stops the stopwatch, prints "<msg>(ms) :<elapsed>" and returns the elapsed milliseconds.
static double _work_end_(std::string msg);
// Grayscale template matching on the CPU with per-step timing output.
void runMatchGrayUseCpu(int method);
// Grayscale template matching through the cv::ocl module with per-step timing output.
void runMatchGrayUseGpu(int method);
int main(int argc, char* argv[])
{
std::cout
<< "Device name:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceName
<< std::endl
<< "Device Vendor:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceVendor
<< std::endl
<< "Device Profile:"
<< cv::ocl::Context::getContext()->getDeviceInfo().deviceProfile
<< std::endl << std::endl;
int method = CV_TM_SQDIFF;
std::cout << "===Test Match Template Use CPU===" << std::endl;
runMatchGrayUseCpu(method);
std::cout << std::endl << "===Test Match Template Use GPU===" << std::endl;
runMatchGrayUseGpu(method);
waitKey();
return 0;
}
void runMatchGrayUseCpu(int method){
cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat tmp = cv::imread(TMP_IMG, 1);
cv::Mat gray_src, gray_tmp;
if (src.channels() == 1) gray_src = src;
else cv::cvtColor(src, gray_src, CV_RGB2GRAY);
if (tmp.channels() == 1) gray_tmp = tmp;
else cv::cvtColor(tmp, gray_tmp, CV_RGB2GRAY);
int result_cols = gray_src.cols - gray_tmp.cols + 1;
int result_rows = gray_src.rows - gray_tmp.rows + 1;
cv::Mat result = cv::Mat(result_cols, result_rows, CV_32FC1);
_work_begin_();
cv::matchTemplate(gray_src, gray_tmp, result, method);
_work_end_("CPU: matchTemplate");
cv::Point point;
double minVal, maxVal;
cv::Point minLoc, maxLoc;
_work_begin_();
cv::minMaxLoc(result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
_work_end_("CPU: minMaxLoc");
switch (method){
case CV_TM_SQDIFF:
point = minLoc;
break;
case CV_TM_SQDIFF_NORMED:
point = minLoc;
break;
case CV_TM_CCORR:
case CV_TM_CCOEFF:
point = maxLoc;
break;
case CV_TM_CCORR_NORMED:
case CV_TM_CCOEFF_NORMED:
default:
point = maxLoc;
break;
}
std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}
void runMatchGrayUseGpu(int method){
cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat tmp = cv::imread(TMP_IMG, 1);
oclMat ocl_src, ocl_tmp;
oclMat gray_src, gray_tmp;
_work_begin_();
ocl_src.upload(src);
ocl_tmp.upload(tmp);
_work_end_("GPU: Upload image from host to device");
if (src.channels() == 1) gray_src = src;
else ocl::cvtColor(ocl_src, gray_src, CV_RGB2GRAY);
if (tmp.channels() == 1) gray_tmp = tmp;
else ocl::cvtColor(ocl_tmp, gray_tmp, CV_RGB2GRAY);
int result_cols = gray_src.cols - gray_tmp.cols + 1;
int result_rows = gray_src.rows - gray_tmp.rows + 1;
oclMat ocl_result = oclMat(result_cols, result_rows, CV_32FC1);
_work_begin_();
ocl::matchTemplate(gray_src, gray_tmp, ocl_result, method);
_work_end_("GPU: matchTemplate");
cv::Point point;
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::Mat mat_result;
_work_begin_();
ocl_result.download(mat_result);
_work_end_("GPU: Download image from device to host");
_work_begin_();
cv::minMaxLoc(mat_result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
_work_end_("GPU: minMaxLoc");
switch (method){
case CV_TM_SQDIFF:
point = minLoc;
break;
case CV_TM_SQDIFF_NORMED:
point = minLoc;
break;
case CV_TM_CCORR:
case CV_TM_CCOEFF:
point = maxLoc;
break;
case CV_TM_CCORR_NORMED:
case CV_TM_CCOEFF_NORMED:
default:
point = maxLoc;
break;
}
std::cout << "object.x :" << point.x << " object.y :" << point.y << std::endl;
}
static void _work_begin_(){
work_begin = getTickCount();
}
static double _work_end_(std::string msg){
work_end = getTickCount() - work_begin;
double t = work_end / ((double)cvGetTickFrequency() * 1000.);
std::cout << msg << "(ms) :" << t << std::endl;
return t;
}
实验结果:
可以看到单独在模板匹配的这个环节GPU比CPU快了近50倍,符合官方给出的数据,但是在从GPU下载数据到CPU的这个过程用时也太长了吧!简直不科学!官方给出的模板匹配数据如下:
于是我又写了一小段代码测试下CPU和GPU之间传输数据是不是真的这么的慢,代码如下:
cv::Mat src = cv::imread(SRC_IMG, 1);
cv::Mat dst;
oclMat ocl_src;
_work_begin_();
ocl_src.upload(src);
ocl_src.download(dst);
_work_end_("GPU: Upload & download CPU<->GPU speed Test");
实验结果:
我靠,太TM不科学了吧!图片大小和模板匹配实验使用的是一样的,但是这次的upload和download的总时间才53毫秒。为何之前那次download要接近4秒?这是个神马情况???神呐,谁能告诉我这是为虾米啊······
以上给出的都是完整的代码,最近研究OpenCV-OCL的兄弟姐妹们可以copy运行一下看是不是一样的结果。还有对于上面的实验结果有什么想法和疑问吗,请一定要指教指教···。
目前的显卡是A卡的,下次换个N卡的机器试一下OCL吧!然后再在N卡上跑下CUDA看看什么情况!
总结:
1、可以看到单单对于模板匹配的这个过程,opencl是有加速效果的,而且加速效果还挺明显的!
2、为什么CPU和GPU之间传输数据时,差别这么大?(如上实验,其中一次用了近4秒,一次才几十毫秒?)。
3、在大量重复计算的时候用OCL加速比较合适,少量的单次计算还不如CPU来的快!