使用 CMake(CMakeLists.txt)编译带 CUDA 与 libjpeg-turbo 支持的 OpenCV 库,参考:
https://blog.csdn.net/cgt19910923/article/details/86541471
编译完成后直接调用 imdecode,解码速度即有提升。GPU 处理涉及与 CPU 之间的数据交互,需要额外的 upload、download 操作,实际应用中必须把上传和下载的耗时计算在内;仅比较 resize、subtract 运算本身时,GPU 的处理性能有所提升。
#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <chrono>
using namespace cv;
using namespace std;
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
int main(int argc, char **argv)
{
//cpu gpu subtract
#if 0
Mat img1,img2;
Mat img1_resize = Mat(Size(416,416), CV_8UC3);
Mat img2_resize = Mat(Size(416,416), CV_8UC3);
img1 = cv::imread("motor.jpg", 1);
img2 = cv::imread("person.jpg", 1);
resize(img1, img1_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
resize(img2, img2_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
imshow("resize1",img1_resize);
waitKey(0);
imshow("resize2",img2_resize);
waitKey(0);
Mat dst = Mat(Size(416,416), CV_8UC3);
double cpusubtracttotal = 0.0;
auto cpusubtract_startTime = Time::now();
for(int i=0;i<1000;i++)
{
subtract(img1_resize,img2_resize,dst);
}
auto cpusubtract_endTime = Time::now();
fsec fscpusubtract = cpusubtract_endTime - cpusubtract_startTime;
ms d3 = std::chrono::duration_cast<ms>(fscpusubtract);
cpusubtracttotal += d3.count();
printf("Time cpusubtracttotal: %f\n", cpusubtracttotal);
imshow("cpusubtract", dst);
waitKey(0);
cuda::GpuMat cuda_src1, cuda_src2;
cuda::GpuMat cuda_dst;
cuda_src1.upload(img1_resize);
cuda_src2.upload(img1_resize);
double gpusubtracttotal = 0.0;
auto gpusubtract_startTime = Time::now();
for(int i=0;i<1000;i++)
{
cuda::subtract(cuda_src1,cuda_src2,cuda_dst);
}
auto gpusubtract_endTime = Time::now();
fsec fsgpusubtract = gpusubtract_endTime - gpusubtract_startTime;
ms d4 = std::chrono::duration_cast<ms>(fsgpusubtract);
gpusubtracttotal += d4.count();
printf("Time gpusubtract: %f\n", gpusubtracttotal);
Mat result;
cuda_dst.download(result);
imshow("gpusubtract", result);
waitKey(0);
return 0;
#endif
//cpu gpu encode decode
#if 0
const char fname[] = "motor.jpg";
Mat image=imread(fname,1);
if (image.empty())
{
printf("Can't load image %s\n", fname);
}
vector<unsigned char> inImage;
double encodetotal = 0.0;
auto encode_startTime = Time::now();
for(int i=0;i<100;i++)
{
imencode(".jpg",image,inImage);
}
auto encode_endTime = Time::now();
fsec fsencode = encode_endTime - encode_startTime;
ms d2 = std::chrono::duration_cast<ms>(fsencode);
encodetotal += d2.count();
printf("Time encode: %f\n", encodetotal);
size_t datalen=inImage.size();
unsigned char *msgImage=new unsigned char[datalen];
for(int i=0;i<datalen;i++)
{
msgImage[i]=inImage[i];
//cout<<msgImage[i]<<endl;
}
vector<unsigned char> buff;
for(int i=0;i<datalen;i++)
{
buff.push_back(msgImage[i]);
}
double decodetotal = 0.0;
auto decode_startTime = Time::now();
Mat show;
for(int j=0;j<100;j++)
{
show=imdecode(buff,CV_LOAD_IMAGE_COLOR);
}
auto decode_endTime = Time::now();
fsec fsdecode = decode_endTime - decode_startTime;
ms d3 = std::chrono::duration_cast<ms>(fsdecode);
decodetotal += d3.count();
printf("Time imcode: %f\n", decodetotal);
imshow("picture",show);
cv::waitKey(0);
#endif
//cpu gpu resize
#if 1
Mat image;
Mat cpu_resize = Mat(Size(416,416), CV_32FC3);
const char fname[] = "motor.jpg";
image = cv::imread(fname, 1);
image.convertTo(image, CV_32FC3, 1/255.0);
if (image.empty())
{
printf("Can't load image %s\n", fname);
}
Mat gpu_image;
image.copyTo(gpu_image);
double cputotal = 0.0;
auto cpu_startTime = Time::now();
for(int i=0; i<100; i++)
{
resize(image, cpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR);
}
auto cpu_endTime = Time::now();
fsec fscpu = cpu_endTime - cpu_startTime;
ms d = std::chrono::duration_cast<ms>(fscpu);
cputotal += d.count();
printf("Time CPU: %f\n", cputotal);
imshow("Resize_cpu", cpu_resize);
waitKey(0);
cuda::Stream stream;
cuda::GpuMat d_src =cuda::GpuMat(1920,1080,CV_32FC3);
cuda::GpuMat gpu_resize=cuda::GpuMat(Size(416,416),CV_32FC3);
double gputotal = 0.0;
auto gpu_startTime = Time::now();
for(int j=0; j<100; j++)
{
d_src.upload(gpu_image,stream);
cuda::resize(d_src, gpu_resize, Size(416,416), 0, 0, CV_INTER_LINEAR,stream);
}
auto gpu_endTime = Time::now();
fsec fsgpu = gpu_endTime - gpu_startTime;
ms d1 = std::chrono::duration_cast<ms>(fsgpu);
gputotal += d1.count();
printf("Time GPU: %f\n", gputotal);
Mat result;
gpu_resize.download(result,stream);
imshow("Resize_gpu", result);
waitKey(0);
#endif
return 0;
}
P100服务器测试得:uchar型 cuda resize快1.9倍,float32型 cuda resize快18.9倍,cuda subtract快3.3倍;对比libjpeg、libjpeg-turbo 编解码,其中libjpeg-turbo解码快6.5倍,但是编码慢2.1倍。
| 操作 | 库 | 输入尺寸 | 输出尺寸 | 处理器 | 耗时(ms) |
| --- | --- | --- | --- | --- | --- |
| resize uchar | / | 1080P | 416*416 | cpu | 2.046 |
| resize uchar | cuda | 1080P | 416*416 | gpu | 1.101 |
| resize float32 | / | 1080P | 416*416 | cpu | 0.323 |
| resize float32 | cuda | 1080P | 416*416 | gpu | 6.094 |
| encode (imencode) | libjpeg | 1080P | 1080P | cpu | 11.941 |
| encode (imencode) | libjpeg-turbo | 1080P | 1080P | cpu | 25.525 |
| decode (imdecode) | libjpeg | 1080P | 1080P | cpu | 79.643 |
| decode (imdecode) | libjpeg-turbo | 1080P | 1080P | cpu | 12.253 |
| subtract | / | 416*416 | 416*416 | cpu | 0.096 |
| subtract | cuda | 416*416 | 416*416 | gpu | 0.029 |

注:float32 resize 两行的耗时(cpu 0.323 ms、gpu 6.094 ms)与上文“float32型 cuda resize快18.9倍”的结论方向相反(6.094 / 0.323 ≈ 18.9),这两行的数值疑似填反,请以实测结果为准。