准备条件: 语言-C++; 显卡-1070;
速度: 处理一张1920X1080分辨率的图片速度为35ms左右
目的: 使用多线程提高人脸检测的速度
结果:发现最多只能同时跑3个人脸处理的线程就把GPU的加载率占满了,再多开线程并不能提升实际
总体处理的效率
应用描述:一台电脑同时实时检测多个视频流里的人脸信息
遇到的问题:1. 多线程同时检测人脸的时候不能使用同一个加载模型后的detector
2. 因为视频路数比较多,所以不能每一个线程加载一个模型,需要把人脸检测处理从每一个视频的线程中脱离出来
大致思想1:对应每一个视频开一个线程
开三个detector并分别加载模型
每一个视频线程读到图片后送到detector端并选取空闲的detector进行处理后返回结果
因为这样的方法数据交叉太多,一直遇到访问冲突,经过一段时间思考后果断换成另一种思路
大致思想2:对应每一个视频开一个线程,负责读图并送入到队列Q
对于每一个detector分别开一个线程,当空闲的时候就取Q里的图片进行处理并返回结果
这样情况数据清晰,稍微加一点锁就不会出现访问冲突的情况,并且detector只是检测人脸,其它处理可以
在视频的线程中处理,有利于效率。
这是第一次写多线程,其中的数据访问冲突,相互之间的逻辑比想象中还是要复杂不少。虽然最后结果看上去似乎简单明了,这都是走了一些弯路过来的。因为没有系统的学习过,都是用的时候查找的一些资料,应该这样的写法很粗陋。这也遵守了做程序员的一条宗旨:程序能跑就行,实在不行删库跑路。
一点很乱的测试代码记录,从main函数开始只看关键点就好:
#include <dlib/threads.h>
#include <dlib/misc_api.h> // for dlib::sleep
#include <dlib/logger.h>
#include <vector>
#include <windows.h>
#include <process.h>
#include "opencv.hpp"
#include <dlib\opencv.h>
#include <time.h>
using namespace dlib;
using namespace cv;
thread_pool tp(3);
// ----------------------------------------------------------------------------------------
#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>
#include <dlib/image_processing.h>
#include <dlib/gui_widgets.h>
#include <queue>
using namespace std;
// dlib DNN building blocks for the MMOD CNN face detector
// (mirrors the network definition that ships with mmod_human_face_detector.dat).
template <long num_filters, typename SUBNET> using con5d = con<num_filters, 5, 5, 2, 2, SUBNET>; // 5x5 conv, stride 2 (downsamples)
template <long num_filters, typename SUBNET> using con5 = con<num_filters, 5, 5, 1, 1, SUBNET>; // 5x5 conv, stride 1
// Three stride-2 conv+affine+relu stages: 8x total downsampling of the pyramid input.
template <typename SUBNET> using downsampler = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16, SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5 = relu<affine<con5<45, SUBNET>>>;
// Full detector: MMOD loss over a 6-level image-pyramid input.
using net_type = loss_mmod<con<1, 9, 9, 1, 1, rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
// ----------------------------------------------------------------------------------------
const int pathLen = 256;        // NOTE(review): unused in the visible code; presumably reserved for path buffers
DWORD start = GetTickCount();   // program start tick, for crude wall-clock timing
int imageCnt = 0;               // NOTE(review): never written in the visible code
int cnt = 0;                    // "thread started" counter printed by mytask (not synchronized; debug only)
unsigned mytask(void* argc)
{
cout << cnt++ << endl;
char* fileDir = "D:/pic/test_faceDetection/pics/";
char* saveDir = "D:/pic/test_faceDetection/results-dlib-CNN-pyramid_up/";
std::vector<file> files = get_files_in_directory_tree(fileDir, match_ending(".jpg"));
cout << files.size() << endl;
net_type net;
deserialize("D:/python/test/mmod_human_face_detector.dat") >> net;
char* rtsp = (char*)argc;
VideoCapture cap(rtsp);
while (cap.isOpened())
{
Mat _image;
cap >> _image;
if (_image.empty())
{
cout << "empty" << endl;
break;
}
matrix<rgb_pixel> img;
cvtColor(_image, _image, CV_BGR2RGB);
assign_image(img, cv_image<rgb_pixel>(_image));
auto dets = net(img);
Sleep(50);
cout << "result size: " << dets.size() << endl;
if (dets.size() > 0)
{
cvtColor(_image, _image, CV_BGR2RGB);
for (auto d : dets)
{
int left = d.rect.left();
int right = d.rect.right();
int top = d.rect.top();
int bottom = d.rect.bottom();
int width = d.rect.width();
int height = d.rect.height();
//1195 559
cout << "left=" << left
<< "right = " << right
<< "top =" << top
<< "bottom=" << bottom
<<"width ="<<width
<<"height="<<height
<< endl;
cv::Rect rec = cv::Rect(left, top,width,height);
cv::rectangle(_image, rec,Scalar(0, 0, 255), 3);
}
//imshow("hehhe", _image);
//waitKey(0);
}
}
return 0;
}
// Number of detector instances sharing the GPU (one dlib net per slot).
const int nCUDA = 1;
net_type nets[nCUDA];                       // pre-loaded detector nets; filled by load_models()
bool CUDARunning[nCUDA];                    // per-slot busy flags read/written by detector() -- not atomic by themselves
CRITICAL_SECTION CUDARunningLocks[nCUDA];   // one lock per detector slot; initialized by init_locks()
// Load one copy of the MMOD face-detector weights into every slot of nets[].
void load_models()
{
    for (auto& model : nets)
        deserialize("D:/python/test/mmod_human_face_detector.dat") >> model;
}
// Lock guarding the shared frame queue 'q'.
CRITICAL_SECTION qLock;
// One-time setup of every critical section used by the pipeline.
void init_locks()
{
    for (int idx = 0; idx < nCUDA; idx++)
    {
        InitializeCriticalSection(&CUDARunningLocks[idx]);
    }
    InitializeCriticalSection(&qLock);
}
// Runs face detection on one frame using whichever of the nCUDA detector
// slots is free, retrying until a slot becomes available.
// Fixes vs. the original:
//  - the check-then-set on CUDARunning[i] raced between threads (two threads
//    could grab the same net); the slot is now claimed atomically with
//    TryEnterCriticalSection on its per-slot lock,
//  - the busy flag was never cleared when nets[i](img) threw, permanently
//    retiring the slot; it is now cleared on every path,
//  - exceptions are caught by const reference instead of by value (slicing),
//  - the all-slots-busy case yields instead of burning a full core.
void detector(Mat _image)
{
    // Convert once up front: dlib wants RGB, OpenCV delivers BGR.
    matrix<rgb_pixel> img;
    cvtColor(_image, _image, CV_BGR2RGB);
    assign_image(img, cv_image<rgb_pixel>(_image));
    while (true)
    {
        bool done = false;
        for (int i = 0; i < nCUDA && !done; i++)
        {
            // Atomically claim slot i; skip it if another thread holds it.
            if (!TryEnterCriticalSection(&CUDARunningLocks[i]))
                continue;
            CUDARunning[i] = true;   // informational flag for observers
            try
            {
                auto dets = nets[i](img);
                cout << "result size: " << dets.size() << endl;
                if (dets.size() > 0)
                {
                    // Swap the channels back for BGR-based drawing.
                    cvtColor(_image, _image, CV_BGR2RGB);
                    for (auto& d : dets)
                    {
                        int left = d.rect.left();
                        int right = d.rect.right();
                        int top = d.rect.top();
                        int bottom = d.rect.bottom();
                        int width = d.rect.width();
                        int height = d.rect.height();
                        //1195 559
                        cout << "left=" << left
                            << "right = " << right
                            << "top =" << top
                            << "bottom=" << bottom
                            << "width =" << width
                            << "height=" << height
                            << endl;
                        cv::Rect rec = cv::Rect(left, top, width, height);
                        cv::rectangle(_image, rec, Scalar(0, 0, 255), 3);
                    }
                    //imshow("hehhe", _image);
                    //waitKey(0);
                }
            }
            catch (const std::exception& e)
            {
                cout << e.what() << endl;
            }
            CUDARunning[i] = false;   // cleared even when detection threw
            LeaveCriticalSection(&CUDARunningLocks[i]);
            done = true;              // frame handled (or failed) -- stop retrying
        }
        if (done)
            break;
        Sleep(1);   // all slots busy: yield briefly before probing again
    }
}
// One video frame in both representations: the dlib matrix fed to the
// detector and the original cv::Mat kept for drawing/saving.
struct CImage
{
public:
    CImage()
    {
    }
    CImage(matrix<rgb_pixel> _dlibImg, Mat _cvImg)
        : dlibImg(_dlibImg)   // member-init list instead of assignment in the body
        , cvImg(_cvImg)       // cv::Mat is ref-counted, so this copy is cheap
    {
    }
    matrix<rgb_pixel> dlibImg;   // detector input
    Mat cvImg;                   // original frame
};
// Shared frame queue: producers are the mytask2 threads, consumers are the
// CUDA_processer threads. All access is guarded by qLock.
std::queue<CImage > q;
Mat image[10000];               // NOTE(review): leftover from an earlier design; not read in the visible code
matrix<rgb_pixel> imgs[10000];  // NOTE(review): leftover, unused in the visible code
int tail=0;                     // global frame counter bumped by mytask2 (distinct from my_queue's member 'tail')
int tail2;                      // NOTE(review): unused in the visible code
int tail3;                      // NOTE(review): unused in the visible code
// Fixed-capacity ring buffer of dlib images. Holds at most n elements; one
// slot is kept empty so that "full" can be told apart from "empty".
// NOTE: not thread-safe by itself -- callers must serialize access externally.
class my_queue
{
public:
    explicit my_queue(int n = 6)
    {
        n++;                                // one spare slot so full != empty
        img = new matrix<rgb_pixel>[n];
        head = 0;                           // read index (front/pop)
        tail = 0;                           // write index (push)
        N = n;
    }
    ~my_queue()
    {
        delete[] img;                       // was 'delete img' -- UB on an array-new allocation
    }
    // The class owns a raw array: copying would double-delete it, so copy
    // construction/assignment are forbidden (Rule of Five).
    my_queue(const my_queue&) = delete;
    my_queue& operator=(const my_queue&) = delete;
    // Number of stored elements. (The original computed (head + N - tail) % N,
    // which is the complement of the real size whenever the queue is non-empty.)
    unsigned size()
    {
        return (tail + N - head) % N;
    }
    bool empty()
    {
        return head == tail;
    }
    // Returns a copy of the oldest element. On an empty queue it warns and
    // returns whatever sits in the head slot, as before.
    matrix<rgb_pixel> front()
    {
        if (empty())
        {
            cout << "queue is empty, please do not attempt to acquire!" << endl;
        }
        return img[head];
    }
    void pop()
    {
        if (head != tail)
            head = (head + 1) % N;
    }
    // True when all N-1 usable slots hold data. (The original compared the
    // inverted size expression, so it reported "full" when exactly ONE element
    // was stored and push() silently dropped almost every frame.)
    bool full()
    {
        return (tail + N - head) % N == N - 1;
    }
    // Silently drops the element when the buffer is full (unchanged policy).
    void push(matrix<rgb_pixel> data)
    {
        if (!full())
        {
            img[tail] = data;
            tail = (tail + 1) % N;
        }
    }
private:
    matrix<rgb_pixel> *img;                 // owned array of N slots
    int tail, head;
    int N;
};
// Hand-rolled ring-buffer instance; declared but not used by the final
// pipeline in the visible code (the std::queue 'q' above carries the frames).
my_queue qq(50);
// Producer thread entry for "design 2": reads frames from the video stream
// whose URL/path is passed in 'argc' and pushes them into the shared queue
// 'q' (under qLock). Re-opens the capture when it yields an empty frame and
// stops after ~200 seconds.
// Fixes vs. the original: the dlib image is now built from the RGB-converted
// frame (convertImage was computed and then ignored, so the detector saw
// swapped channels); the unused per-thread net_type (a whole DNN), the unused
// directory listing, the dead 'temp = tail++' and the unreachable 'break'
// after 'continue' were removed.
unsigned mytask2(void* argc)
{
    const char* rtsp = (const char*)argc;
    VideoCapture cap(rtsp);
    DWORD start = GetTickCount();
    while (cap.isOpened())
    {
        Mat _image;
        // Grab two frames and keep the second -- presumably an intentional
        // frame-skip to halve the input rate; TODO confirm.
        cap >> _image;
        cap >> _image;
        if (_image.empty())
        {
            cout << "empty" << endl;
            cout << rtsp << endl;
            Sleep(100);
            cap.release();
            cap.open(rtsp);   // try to re-establish the stream
            continue;
        }
        Mat convertImage;
        EnterCriticalSection(&qLock);
        if (q.size() < 50)    // cap the backlog at 50 frames
        {
            cvtColor(_image, convertImage, CV_BGR2RGB);
            matrix<rgb_pixel> img;
            // Build the detector input from the RGB frame (bug fix; see above).
            assign_image(img, cv_image<rgb_pixel>(convertImage));
            q.push(CImage(img, _image));   // keep the original BGR Mat for drawing
        }
        LeaveCriticalSection(&qLock);
        Sleep(10);            // ease off the producer a little
        if (GetTickCount() - start > 200000)
        {
            break;            // test run: stop after ~200 seconds
        }
    }
    return 0;
}
int processCnt;
unsigned CUDA_processer(void* arg)
{
net_type net;
deserialize("D:/python/test/mmod_human_face_detector.dat") >> net;
matrix<rgb_pixel> img;
while (true)
{
int index = 0;
EnterCriticalSection(&qLock);
bool flag = !q.empty();
if (flag)
{
// img = q.front().dlibImg;
assign_image(img, cv_image<rgb_pixel>(q.front().cvImg));
DWORD tic = GetTickCount();
//cvtColor(q.front().cvImg, q.front().cvImg, CV_BGR2RGB);
// imwrite("D:\\pic\\dlib.jpg", q.front().cvImg);
cout << GetTickCount() - tic << endl;
q.pop();
}
LeaveCriticalSection(&qLock);
if (flag)
{
//EnterCriticalSection(&qLock);
//matrix<rgb_pixel> img;
//cvtColor(image[tail2], image[tail2], CV_BGR2RGB);
//assign_image(img, cv_image<rgb_pixel>(image[tail2]));
//LeaveCriticalSection(&qLock);
//EnterCriticalSection(&qLock);
//q.pop();
//LeaveCriticalSection(&qLock);
auto dets = net(img);
for (auto d : dets)
{
cout<<"labels" << d.label << endl;
}
// cout << dets.size() << endl;
// img.steal_memory();
printf("%dth -- %d released!\n",++processCnt, index);
}
else
Sleep(5);
}
return 0;
}
// ----------------------------------------------------------------------------------------
int main()
{
// tell the logger to print out everything
// Schedule the thread pool to call taskobj.mytask(). Note that all forms of
// add_task() pass in the task object by reference. This means you must make sure,
// in this case, that taskobj isn't destructed until after the task has finished
// executing.
load_models();
init_locks();
unsigned int threadID;
const int n = 4;
HANDLE h[n];
//
char cameraDir[][256] = { "视频流1",
"视频流2",
"视频流3",
"视频流地址4" };
InitializeCriticalSection(&qLock);
Sleep(10);
for (int i = 0; i < 3; i++)
{
_beginthreadex(NULL, 0, CUDA_processer, NULL, 0, &threadID);
}
Sleep(200);
for (int i = 0; i < n; i++)
{
h[i] = (HANDLE)_beginthreadex(NULL, 0, mytask2, cameraDir[i], 0, &threadID);
}
for (int i = 0; i < n; i++)
{
WaitForSingleObject(h[i], INFINITE);
}
system("pause");
}