在将算法配置到工程的时候,或者在验证自身算法的过程中,经常会遇到耗时较长的运算卡住流程的脖子,尤其是串行执行的一系列运算。举一个可能引起大家共鸣的例子,我们需要对一个文件夹中的所有图片提取特征,串行计算将会十分痛苦,用并行则更加合适。作为一个会算法的软件工程师,自然要在最快的时间内实现一个能用的线程池demo。
在线程池中,如果出于保守而创建较少的线程,就不能充分利用机器性能;而创建过多的线程,又可能在我们同时使用其他进程时引起CPU调度延迟、导致机器卡顿,因为我们的任务是计算密集型的,计算时会占满一整个核。在这里我使用一个额外的线程来监听CPU的状态,让线程池可以根据CPU占用率灵活控制本进程的计算量,在最大程度榨取计算机性能的同时,保证用户对机器的其他基本操作不受影响。
另一个问题,如果我们希望该线程池作为基本的utils,适用于不同模块,它需要兼容静态函数、成员函数以及不同数量的形参。
在此我设计了一个简单的线程池,将它称呼为“计算池”似乎更加合适~
#ifndef THREAD_POOL_H
#define THREAD_POOL_H
#include <algorithm>
#include <atomic>
#include <chrono>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>
#include <vector>
#include "utils/cpu_utils.h"
using std::cout;
using std::endl;
/// A small "compute pool" for CPU-bound tasks.
///
/// The pool owns a fixed number of polling worker threads plus one monitor
/// thread. The monitor repeatedly samples system-wide CPU usage through
/// get_sysCpuUsage() and enables only as many workers as it estimates are
/// free, so other processes keep some CPU time.
///
/// Typical use: queue work with addTask(), call start(), poll finished(),
/// then let the destructor (or stop()) join the workers.
///
/// addTask() and finished() are guarded by the internal mutex and may be
/// called from any thread. start() and stop() are meant to be called from a
/// single controlling thread.
class ThreadPool {
public:
    using Task = std::function<void()>;

    /// Creates one worker slot per CPU reported by GetCPUCount() (at least one).
    ThreadPool();

    /// Creates `size` worker slots; a negative `size` is treated as 0.
    ThreadPool(int size);

    /// Stops the workers if they are running, then stops and joins the monitor.
    /// May block for roughly one CPU sampling period while the monitor finishes.
    ~ThreadPool();

    /// Spawns the worker threads. Calling it while already started only logs.
    void start();

    /// Asks the workers to exit after their current task and joins them.
    /// Tasks still queued stay in the queue. Safe to call when not started,
    /// and start() may be called again afterwards.
    void stop();

    /// @return true when no worker is executing a task and the queue is empty.
    bool finished() {
        std::unique_lock<std::mutex> lg(mutex_);
        return isAllResting() && task_queue_.empty();
    }

    /// Queues a callable bound to its arguments. Works for free functions,
    /// functors and member-function pointers (pass the object pointer as the
    /// first argument). Arguments are copied/moved into the task by std::bind.
    template <class T, typename... Args>
    void addTask(T&& obj, Args&&... args) {
        Task func = std::bind(std::forward<T>(obj), std::forward<Args>(args)...);
        std::unique_lock<std::mutex> lg(mutex_);
        task_queue_.push(std::move(func));
    }

private:
    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;

    /// Shared constructor logic: sizes the per-worker flags and launches the monitor.
    void init(int worker_count, int cpu_count);

    /// Worker loop executed by thread number `thread_id`.
    void processTask(int thread_id);

    /// Monitor loop: samples CPU usage and refreshes available_list_.
    void cpuCheckCycle();

    /// Caller must hold mutex_.
    bool isAllResting() const {
        return std::all_of(resting_list_.begin(), resting_list_.end(),
                           [](int flag) { return flag != 0; });
    }

    std::queue<Task> task_queue_;
    std::vector<std::thread> thread_list_;
    // Per-worker flags, both guarded by mutex_:
    //   available_list_[i] == 1  -> worker i is allowed to pick up tasks
    //   resting_list_[i]   == 1  -> worker i is not executing a task
    std::vector<int> available_list_, resting_list_;
    std::thread cpu_check_thread_;
    std::mutex mutex_;
    std::atomic<bool> started_{false};
    std::atomic<bool> monitor_running_{false};
    int thread_size_ = 0;  // number of worker slots
    int cpu_count_ = 1;    // CPUs usable by this process, used for the load estimate
};

inline ThreadPool::ThreadPool() {
    const int cpu_c = GetCPUCount();
    std::cout << "cpu_c : " << cpu_c << "\n";
    const int usable = std::max(cpu_c, 1);
    init(usable, usable);
}

inline ThreadPool::ThreadPool(int size) {
    init(std::max(size, 0), std::max(GetCPUCount(), 1));
}

inline void ThreadPool::init(int worker_count, int cpu_count) {
    thread_size_ = worker_count;
    cpu_count_ = cpu_count;
    // Every worker starts disabled and idle; the monitor enables workers
    // once it has taken its first CPU usage sample.
    available_list_.assign(static_cast<std::size_t>(worker_count), 0);
    resting_list_.assign(static_cast<std::size_t>(worker_count), 1);
    monitor_running_ = true;
    cpu_check_thread_ = std::thread(&ThreadPool::cpuCheckCycle, this);
}

inline ThreadPool::~ThreadPool() {
    stop();
    // Without this flag the monitor loop would never return and join() would hang.
    monitor_running_ = false;
    if (cpu_check_thread_.joinable()) {
        cpu_check_thread_.join();
    }
}

inline void ThreadPool::start() {
    if (started_.exchange(true)) {
        std::cout << "thread pool is already started.\n";
        return;
    }
    thread_list_.reserve(static_cast<std::size_t>(thread_size_));
    for (int i = 0; i < thread_size_; ++i) {
        thread_list_.emplace_back(&ThreadPool::processTask, this, i);
    }
}

inline void ThreadPool::stop() {
    started_ = false;
    for (auto& worker : thread_list_) {
        if (worker.joinable()) {
            worker.join();
        }
    }
    thread_list_.clear();
}

inline void ThreadPool::processTask(int thread_id) {
    const auto poll_interval = std::chrono::milliseconds(10);
    std::cout << "process task start : " << thread_id << std::endl;
    // The stop flag is checked on every iteration, including while this worker
    // is disabled by the monitor, so stop() can always join it.
    while (started_) {
        Task task;
        bool got_task = false;
        {
            std::unique_lock<std::mutex> lg(mutex_);
            if (available_list_[thread_id] != 0 && !task_queue_.empty()) {
                task = std::move(task_queue_.front());
                task_queue_.pop();
                // Popping and marking busy happen under one lock so finished()
                // never observes "queue empty and everyone resting" mid-handoff.
                resting_list_[thread_id] = 0;
                got_task = true;
            }
        }
        if (!got_task) {
            // Nothing to do (or throttled): back off instead of spinning.
            std::this_thread::sleep_for(poll_interval);
            continue;
        }
        task();
        std::unique_lock<std::mutex> lg(mutex_);
        resting_list_[thread_id] = 1;
    }
    std::cout << "process task finished" << std::endl;
}

inline void ThreadPool::cpuCheckCycle() {
    while (monitor_running_) {
        // Sampling blocks for the measurement interval, so it is done before
        // taking the lock.
        const double cpu_usage = get_sysCpuUsage();
        std::unique_lock<std::mutex> lg(mutex_);
        // Number of cores currently occupied by this pool's own workers.
        const int occupy_c = static_cast<int>(
            std::count(resting_list_.begin(), resting_list_.end(), 0));
        // Estimated cores used by everything else; always reserve at least one.
        int other_occupy_count =
            static_cast<int>((0.01 * cpu_usage) * static_cast<double>(cpu_count_)) - occupy_c;
        other_occupy_count = std::max(other_occupy_count, 1);
        // One more core is left for the monitor itself; keep at least one worker enabled.
        const int available_cpu_count = std::max(cpu_count_ - other_occupy_count - 1, 1);
        if (occupy_c > 0) {
            std::cout << "cpu_usage : " << cpu_usage << " , occupy_c : " << occupy_c
                      << " , other_occupy_count : " << other_occupy_count << "\n";
            std::cout << "available_cpu_count : " << available_cpu_count << "\n";
        }
        for (int i = 0; i < thread_size_; ++i) {
            available_list_[i] = (i < available_cpu_count) ? 1 : 0;
        }
    }
}
#endif
奇怪的格式有点难看,大家凑合着看吧~另外就是cpu_utils.h的内容,如下,是CPU占用率的模块,我参考了51CTO博客作者313119992的博客,感谢他的启发。
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/* One sample of the counters parsed from a "cpu" line of /proc/stat.
 * The numeric fields are stored in the order they are read:
 * user, nice, system, idle, iowait, irq, softirq. */
struct cpu_occupy_ {
    char name[20];         /* label at the start of the line */
    unsigned int user;     /* "user" counter */
    unsigned int nice;     /* "nice" counter */
    unsigned int system;   /* "system" counter */
    unsigned int idle;     /* "idle" counter */
    unsigned int iowait;   /* "iowait" counter */
    unsigned int irq;      /* "irq" counter */
    unsigned int softirq;  /* "softirq" counter */
};
typedef struct cpu_occupy_ cpu_occupy_t;
/// Returns the number of CPUs the calling process is allowed to run on.
///
/// The count comes from the process affinity mask. If that query fails or
/// reports no CPUs, the number of online processors is used instead, and the
/// result is never less than 1 so callers can size thread pools from it.
static int GetCPUCount() {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    if (sched_getaffinity(0, sizeof(cs), &cs) == 0) {
        // CPU_COUNT covers the whole set instead of a hard-coded index range.
        const int count = CPU_COUNT(&cs);
        if (count > 0) {
            return count;
        }
    }
    const long online = sysconf(_SC_NPROCESSORS_ONLN);
    return online > 0 ? static_cast<int>(online) : 1;
}
static double cal_cpuoccupy (cpu_occupy_t *o, cpu_occupy_t *n) {
double od, nd;
double id, sd;
double cpu_use ;
od = (double) (o->user + o->nice + o->system +o->idle+o->softirq+o->iowait+o->irq);//第一次(用户+优先级+系统+空闲)的时间再赋给od
nd = (double) (n->user + n->nice + n->system +n->idle+n->softirq+n->iowait+n->irq);//第二次(用户+优先级+系统+空闲)的时间再赋给od
id = (double) (n->idle); //用户第一次和第二次的时间之差再赋给id
sd = (double) (o->idle) ; //系统第一次和第二次的时间之差再赋给sd
if((nd-od) != 0)
cpu_use =100.0 - ((id-sd))/(nd-od)*100.00; //((用户+系统)乖100)除(第一次和第二次的时间差)再赋给g_cpu_used
else
cpu_use = 0;
return cpu_use;
}
/// Reads the first line of /proc/stat into `*cpust`.
///
/// On any failure (file cannot be opened, line cannot be read, or fewer than
/// eight fields parse) `*cpust` is left zero-filled, so two failed samples
/// yield a usage of 0 instead of undefined values. The process is no longer
/// terminated on failure.
static void get_cpuoccupy (cpu_occupy_t *cpust) {
    *cpust = cpu_occupy_t{};

    FILE *fd = fopen("/proc/stat", "r");
    if (fd == nullptr) {
        perror("fopen:");
        return;
    }

    char buff[256];
    if (fgets(buff, sizeof(buff), fd) != nullptr) {
        cpu_occupy_t parsed = cpu_occupy_t{};
        // %19s keeps the label within name[20] including the terminator.
        const int fields = sscanf(buff, "%19s %u %u %u %u %u %u %u",
                                  parsed.name, &parsed.user, &parsed.nice, &parsed.system,
                                  &parsed.idle, &parsed.iowait, &parsed.irq, &parsed.softirq);
        if (fields == 8) {
            *cpust = parsed;
        }
    }
    fclose(fd);
}
static double get_sysCpuUsage() {
cpu_occupy_t cpu_stat1;
cpu_occupy_t cpu_stat2;
double cpu;
get_cpuoccupy((cpu_occupy_t *)&cpu_stat1);
sleep(1);
//第二次获取cpu使用情况
get_cpuoccupy((cpu_occupy_t *)&cpu_stat2);
//计算cpu使用率
cpu = cal_cpuoccupy ((cpu_occupy_t *)&cpu_stat1, (cpu_occupy_t *)&cpu_stat2);
return cpu;
}
那么如何使用这个线程池?用一个小的测试用例展示一下,我假设有extraction和matching两种任务,它们分别需要用线程池并行计算,它们计算的函数形参的类型与数目各不相同。
#include "thread_pool.h"
using std::cout;
using std::endl;
/// Demo workload "extraction": 140 items, each with three parameters.
/// process() simulates one second of work per item and appends the sum of
/// the item's parameters to a shared result list.
class Extraction {
public:
    Extraction() : c(140) {
        for (int idx = 0; idx < c; ++idx) {
            ExtractParam item;
            item.v1 = idx;
            item.v2 = idx + 2;
            item.v3 = idx + 3;
            params.push_back(item);
        }
    }

    /// Number of items to process.
    int getCount() { return c; }

    /// Processes item `i`; f1..f3 are unused placeholders that only vary the
    /// arity of the task signature.
    void process(int i, int f1, int f2, int f3) {
        double a, b, d;
        getParam(i, a, b, d);
        usleep(1000000);
        // Results and the log line are written under the lock because several
        // pool workers call process() concurrently.
        std::unique_lock<std::mutex> guard(proc_mutex);
        const double sum = a + b + d;
        results.push_back(sum);
        std::cout << "process : " << i << " , results : " << sum << "\n";
    }

    /// Prints every collected result on one line.
    void getResults() {
        for (const double& value : results) {
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }

    /// Runs all items through the given pool and prints the results.
    void processOmp(ThreadPool& pool);

private:
    struct ExtractParam {
        double v1, v2, v3;
    };

    /// Copies the parameters of item `i` into the output arguments.
    void getParam(int i, double& v1, double& v2, double& v3) {
        const ExtractParam& item = params[i];
        v1 = item.v1;
        v2 = item.v2;
        v3 = item.v3;
    }

    std::vector<ExtractParam> params;
    std::vector<double> results;
    std::mutex proc_mutex;
    int c;
};
/// Demo workload "matching": 1300 items, each with two parameters.
/// process() simulates 10 ms of work per item and stores the sum of the
/// item's parameters in the result slot with the same index.
class Matching {
public:
    Matching() : c(1300) {
        for (int idx = 0; idx < c; ++idx) {
            MatchParam item;
            item.v1 = idx;
            item.v2 = idx + 2;
            params.push_back(item);
        }
        results.resize(c);
    }

    /// Number of items to process.
    int getCount() { return c; }

    /// Processes item `i`; f1 and f2 are unused placeholders that only vary
    /// the arity of the task signature.
    void process(int i, int f1, int f2) {
        std::cout << "process : " << i << "\n";
        double a, b;
        getParam(i, a, b);
        usleep(10000);
        std::unique_lock<std::mutex> guard(proc_mutex);
        results[i] = a + b;
    }

    /// Prints every result slot on one line.
    void getResults() {
        for (const double& value : results) {
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }

private:
    struct MatchParam {
        double v1, v2;
    };

    /// Copies the parameters of item `i` into the output arguments.
    void getParam(int i, double& v1, double& v2) {
        const MatchParam& item = params[i];
        v1 = item.v1;
        v2 = item.v2;
    }

    std::vector<MatchParam> params;
    std::vector<double> results;
    std::mutex proc_mutex;
    int c;
};
/// Free-function task entry point: forwards its arguments to e->process().
static void extractionProc(Extraction* e, int i, int f1, int f2, int f3)
{
    Extraction& target = *e;
    target.process(i, f1, f2, f3);
}
/// Free-function task entry point: forwards its arguments to m->process().
static void matchingProc(Matching* m, int i, int f1, int f2)
{
    Matching& target = *m;
    target.process(i, f1, f2);
}
/// Queues every item on `thread_pool`, starts the pool, waits until it
/// reports finished, then prints the results.
/// Each index is queued twice: once through the free function extractionProc
/// and once through the member-function pointer, so both task forms run.
void Extraction::processOmp(ThreadPool& thread_pool) {
    int idx = 0;
    while (idx < getCount()) {
        thread_pool.addTask(extractionProc, this, idx, 0, 0, 0);
        thread_pool.addTask(&Extraction::process, this, idx, 0, 0, 0);
        ++idx;
    }

    thread_pool.start();

    // Poll at 1 ms intervals until the queue is drained and all workers are idle.
    do {
        usleep(1000);
    } while (!thread_pool.finished());

    getResults();
}
/// Demo entry point: runs the extraction workload on a default-sized pool.
int main(int argc, char** argv)
{
    // Command-line arguments are not used by the demo.
    (void)argc;
    (void)argv;

    // The pool is declared first so it is destroyed after the workload object.
    ThreadPool thread_pool;
    Extraction extraction;
    extraction.processOmp(thread_pool);

    return 0;
}
对于成员函数,我在这做了一个小trick,用一个静态函数作为线程池的task,而该函数拥有类实例的指针,相当于是验证成员函数和静态函数的适用性。
另外,我们先将所有的计算任务塞入队列中,再开始计算,这样做只是为了满足我自己的特定需求:我希望所有任务都就位后再统一开始。如果是一般的IO密集型或需要即时响应的任务,则可以用一个条件变量来阻塞从队列中取任务的操作,让空闲的线程等待在该条件变量上,有新任务入队时再被唤醒即可。
这样的“计算池”基本上可以满足我的需要,能够在执行高强度运算的同时保证能上网摸鱼不卡。如果有需要的同学可以直接拿去复制粘贴,若是从中获得灵感,有更好的延伸和实现,那就更好不过了。