软件开发随笔：用C++写一个计算密集型适用的自适应线程池

若愚和小巧

已于 2022-10-09 09:59:48 修改

阅读量719

点赞数 2

分类专栏：C++、数据结构与算法　文章标签：开发语言、c++

于 2022-09-25 00:04:58 首次发布

本文链接：https://blog.csdn.net/weixin_42048023/article/details/126840794

版权

数据结构与算法同时被 2 个专栏收录

14 篇文章 4 订阅

订阅专栏

C++

2 篇文章 0 订阅

订阅专栏

在将算法配置到工程的时候，或者在验证自身算法的过程中，经常会遇到耗时较长的运算卡住流程的脖子，尤其是串行执行的一系列运算。举一个可能引起大家共鸣的例子，我们需要对一个文件夹中的所有图片提取特征，串行计算将会十分痛苦，用并行则更加合适。作为一个会算法的软件工程师，自然要在最快的时间内实现一个能用的线程池demo。

在线程池中，如果出于保守只创建较少的线程，就不能充分利用机器性能；而创建过多的线程，又可能在我们同时使用其他进程时造成CPU调度延迟、让机器变卡，因为我们的任务是计算密集型的，每个任务在计算时都会占满一整个核。在这里我使用一个额外的线程来监听CPU的状态，让线程池可以根据CPU占用率灵活控制本进程的计算量，在最大程度榨取计算机性能的同时，保证用户对机器的其他基本操作不受影响。

另一个问题，如果我们希望该线程池作为基本的utils，适用于不同模块，它需要兼容静态函数、成员函数以及不同数量的形参。

在此我设计了一个简单的线程池，将它称呼为“计算池”似乎更加合适~

#ifndef THREAD_POOL_H
#define THREAD_POOL_H

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>
#include <vector>

#include "utils/cpu_utils.h"

using std::cout;
using std::endl;

/// A small "compute pool" for CPU-bound tasks.
///
/// Worker threads poll a shared FIFO queue. A separate monitor thread samples
/// system-wide CPU usage through get_sysCpuUsage() and enables or disables
/// individual worker slots, so the pool backs off when other processes need
/// the CPU.
///
/// Typical use: construct, addTask() for every job, start(), then poll
/// finished(). addTask() and finished() take the internal mutex and may be
/// called from any thread; start() and stop() must not race with each other.
class ThreadPool {
public:
	using Task = std::function<void()>;

	/// Creates a pool with one worker slot per CPU reported by GetCPUCount().
	ThreadPool();
	/// Creates a pool with `size` worker slots (values below 1 are treated as 1).
	ThreadPool(int size);
	/// Stops the workers and the CPU monitor. The monitor is only checked
	/// between samples, so this can block until the current sample returns.
	~ThreadPool();

	/// Launches the worker threads; prints a notice if already started.
	void start();
	/// Asks the workers to exit and joins them. A task that is already running
	/// is allowed to finish; tasks still queued stay queued. Safe to call when
	/// the pool was never started.
	void stop();

	/// Returns true when the queue is empty and no worker is running a task.
	bool finished() {
		std::lock_guard<std::mutex> lg(mutex_);
		return isAllResting() && task_queue_.empty();
	}

	/// Queues a callable together with its arguments (bound with std::bind, so
	/// free functions, member-function pointers plus object, and functors all
	/// work). Arguments are copied/moved into the task.
	template <class T, typename... Args>
	void addTask(T&& obj, Args&&... args) {
		Task func = std::bind(std::forward<T>(obj), std::forward<Args>(args)...);

		std::lock_guard<std::mutex> lg(mutex_);
		task_queue_.push(std::move(func));
	}

private:
	ThreadPool(const ThreadPool&) = delete;
	ThreadPool& operator=(const ThreadPool&) = delete;

	/// Shared constructor logic: sizes the bookkeeping vectors and starts the
	/// CPU monitor thread.
	void init(int size, int cpu_count);

	/// Worker loop for slot `thread_id`.
	void processTask(int thread_id);

	/// Monitor loop: recomputes which worker slots may run.
	void cpuCheckCycle();

	/// True when no slot is marked busy. Caller must hold mutex_.
	bool isAllResting() const {
		return std::all_of(resting_list_.begin(), resting_list_.end(),
		                   [](int v) { return v != 0; });
	}

	std::queue<Task> task_queue_;            // guarded by mutex_

	std::vector<std::thread> thread_list_;   // touched only by start()/stop()
	// available_list_[i] == 1: slot i may take work; resting_list_[i] == 0:
	// slot i is running a task. Both guarded by mutex_.
	std::vector<int> available_list_, resting_list_;
	std::thread cpu_check_thread_;

	std::mutex mutex_;

	std::atomic<bool> started_{false};       // workers keep looping while true
	std::atomic<bool> shutdown_{false};      // tells the monitor thread to exit
	int thread_size_ = 0;                    // number of worker slots
	int cpu_count_ = 0;                      // CPUs used to scale the usage percentage
};

inline void ThreadPool::init(int size, int cpu_count) {
	thread_size_ = std::max(size, 1);
	cpu_count_ = std::max(cpu_count, 1);
	// Every slot starts disabled and idle; the monitor enables slots after
	// its first CPU sample.
	available_list_.assign(static_cast<std::size_t>(thread_size_), 0);
	resting_list_.assign(static_cast<std::size_t>(thread_size_), 1);
	// Started last so the monitor never observes half-initialised members.
	cpu_check_thread_ = std::thread(&ThreadPool::cpuCheckCycle, this);
}

inline ThreadPool::ThreadPool() {
	const int cpu_c = GetCPUCount();
	std::cout << "cpu_c : " << cpu_c << "\n";
	init(cpu_c, cpu_c);
}

inline ThreadPool::ThreadPool(int size) {
	init(size, GetCPUCount());
}

inline ThreadPool::~ThreadPool() {
	stop();
	shutdown_ = true;
	if (cpu_check_thread_.joinable()) {
		cpu_check_thread_.join();
	}
}

inline void ThreadPool::start() {
	// exchange() makes "check and set" a single step.
	if (started_.exchange(true)) {
		std::cout << "thread pool is already started.\n";
		return;
	}

	thread_list_.reserve(static_cast<std::size_t>(thread_size_));
	for (int i = 0; i < thread_size_; ++i) {
		thread_list_.emplace_back(&ThreadPool::processTask, this, i);
	}
}

inline void ThreadPool::stop() {
	started_ = false;

	for (auto& worker : thread_list_) {
		if (worker.joinable()) {
			worker.join();
		}
	}
	// Cleared so a later start() builds a fresh set of workers.
	thread_list_.clear();
}

inline void ThreadPool::processTask(int thread_id) {
	std::cout << "process task start : " << thread_id << std::endl;

	const std::size_t slot = static_cast<std::size_t>(thread_id);

	// The stop flag is tested on every iteration, including the ones where
	// this slot is disabled, so stop() can always join the worker.
	while (started_) {
		std::this_thread::sleep_for(std::chrono::milliseconds(10));

		Task task;
		{
			std::lock_guard<std::mutex> lg(mutex_);

			if (available_list_[slot] == 0 || task_queue_.empty()) {
				resting_list_[slot] = 1;
				continue;
			}
			task = std::move(task_queue_.front());
			task_queue_.pop();
			// Marked busy under the same lock as the pop, so finished()
			// never sees "queue empty and everyone idle" while a task is
			// about to run.
			resting_list_[slot] = 0;
		}

		task();

		{
			std::lock_guard<std::mutex> lg(mutex_);
			resting_list_[slot] = 1;
		}
	}

	std::cout << "process task finished" << std::endl;
}

inline void ThreadPool::cpuCheckCycle() {
	while (!shutdown_) {
		// Blocks while the usage sample is taken, which also paces this loop.
		const double cpu_usage = get_sysCpuUsage();

		int occupy_c = 0;
		int other_occupy_count = 0;
		int available_cpu_count = 0;
		{
			std::lock_guard<std::mutex> lg(mutex_);

			// Number of slots currently running a task.
			occupy_c = static_cast<int>(
				std::count(resting_list_.begin(), resting_list_.end(), 0));

			// Cores in use by everything except this pool; assume at least one.
			other_occupy_count =
				static_cast<int>((0.01 * cpu_usage) * static_cast<double>(cpu_count_)) - occupy_c;
			other_occupy_count = std::max(other_occupy_count, 1);

			// Keep one extra core free, but always allow at least one worker.
			available_cpu_count = std::max(cpu_count_ - other_occupy_count - 1, 1);

			for (int i = 0; i < thread_size_; ++i) {
				available_list_[static_cast<std::size_t>(i)] = (i < available_cpu_count) ? 1 : 0;
			}
		}

		// Logging happens outside the lock so workers are not held up by I/O.
		if (occupy_c > 0) {
			std::cout << "cpu_usage : " << cpu_usage << " , occupy_c : " << occupy_c << " , other_occupy_count : " << other_occupy_count << "\n";
			std::cout << "available_cpu_count : " << available_cpu_count << "\n";
		}
	}
}

#endif

奇怪的格式有点难看，大家凑合着看吧～另外就是cpu_utils.h的内容，如下，是CPU占用率的模块，我参考了51CTO博客作者313119992的博客，感谢他的启发。

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// One parsed "cpu" row of /proc/stat. get_cpuoccupy() fills the fields in
// exactly this order from the first line of the file.
struct cpu_occupy_
{
    char name[20];          // row label (first whitespace-delimited token)
    unsigned int user;      // 1st counter on the row
    unsigned int nice;      // 2nd counter
    unsigned int system;    // 3rd counter
    unsigned int idle;      // 4th counter; the one treated as idle time
    unsigned int iowait;    // 5th counter
    unsigned int irq;       // 6th counter
    unsigned int softirq;   // 7th counter
};
typedef struct cpu_occupy_ cpu_occupy_t;

// Returns the number of CPUs the calling process is allowed to run on.
//
// Uses the scheduler affinity mask; if that query fails or reports nothing,
// falls back to the number of online processors, and finally to 1, so callers
// that size a thread pool from the result never get zero.
static int GetCPUCount() {
	cpu_set_t cs;
	CPU_ZERO(&cs);

	int count = 0;
	if (sched_getaffinity(0, sizeof(cs), &cs) == 0) {
		// CPU_COUNT walks the whole set instead of a hard-coded bit range.
		count = CPU_COUNT(&cs);
	}

	if (count < 1) {
		const long online = sysconf(_SC_NPROCESSORS_ONLN);
		count = (online >= 1) ? static_cast<int>(online) : 1;
	}
	return count;
}

// Computes the non-idle CPU share, in percent, between two samples.
//
// o: earlier sample, n: later sample.
// Returns 100 * (1 - idle_delta / total_delta), limited to [0, 100].
// Returns 0 when the total did not advance (identical samples, or counters
// that went backwards).
static double cal_cpuoccupy (cpu_occupy_t *o, cpu_occupy_t *n) {
    // Each counter is widened to double BEFORE adding: summing them as
    // unsigned int wraps at 2^32 and corrupts the delta between samples.
    const double od = static_cast<double>(o->user) + static_cast<double>(o->nice)
                    + static_cast<double>(o->system) + static_cast<double>(o->idle)
                    + static_cast<double>(o->softirq) + static_cast<double>(o->iowait)
                    + static_cast<double>(o->irq);   // total time at the first sample
    const double nd = static_cast<double>(n->user) + static_cast<double>(n->nice)
                    + static_cast<double>(n->system) + static_cast<double>(n->idle)
                    + static_cast<double>(n->softirq) + static_cast<double>(n->iowait)
                    + static_cast<double>(n->irq);   // total time at the second sample

    const double total_delta = nd - od;
    if (total_delta <= 0)
        return 0;

    // Idle time accumulated between the two samples.
    const double idle_delta = static_cast<double>(n->idle) - static_cast<double>(o->idle);

    double cpu_use = 100.0 - idle_delta / total_delta * 100.00;
    if (cpu_use < 0)
        cpu_use = 0;
    else if (cpu_use > 100.0)
        cpu_use = 100.0;
    return cpu_use;
}
 
// Reads the first line of /proc/stat into *cpust.
//
// All fields are zeroed first, so a short or unparsable line leaves zeros
// rather than indeterminate values. If /proc/stat cannot be opened the
// process is terminated with a failure status.
static void get_cpuoccupy (cpu_occupy_t *cpust) {
    *cpust = cpu_occupy_t();

    FILE *fd = fopen ("/proc/stat", "r");
    if (fd == nullptr)
    {
            perror("fopen:");
            exit (EXIT_FAILURE);
    }

    char buff[256];
    if (fgets (buff, sizeof(buff), fd) != nullptr)
    {
        // %19s keeps the label inside name[20] (19 characters + terminator).
        sscanf (buff, "%19s %u %u %u %u %u %u %u", cpust->name, &cpust->user, &cpust->nice, &cpust->system, &cpust->idle, &cpust->iowait, &cpust->irq, &cpust->softirq);
    }

    fclose(fd);
}
 
static double get_sysCpuUsage() {
    cpu_occupy_t cpu_stat1;
    cpu_occupy_t cpu_stat2;
    double cpu;
    get_cpuoccupy((cpu_occupy_t *)&cpu_stat1);
    sleep(1);
    //第二次获取cpu使用情况
    get_cpuoccupy((cpu_occupy_t *)&cpu_stat2);
 
    //计算cpu使用率
    cpu = cal_cpuoccupy ((cpu_occupy_t *)&cpu_stat1, (cpu_occupy_t *)&cpu_stat2);
 
    return cpu;
}

那么如何使用这个线程池？用一个小的测试用例展示一下，我假设有extraction和matching两种任务，它们分别需要用线程池并行计算，它们计算的函数形参的类型与数目各不相同。

#include "thread_pool.h"

using std::cout;
using std::endl;

/// Demo workload with three inputs per item. Results are appended in the
/// order the tasks happen to finish.
class Extraction {
public:
    /// Prepares 140 parameter sets: (i, i+2, i+3) for i in [0, 140).
    Extraction() : c(140) {
        for (int idx = 0; idx < c; ++idx) {
            ExtractParam entry;
            entry.v1 = idx;
            entry.v2 = idx + 2;
            entry.v3 = idx + 3;
            params.push_back(entry);
        }
        // results is deliberately left empty; process() appends to it.
    }

    /// Number of prepared parameter sets.
    int getCount() { return c; }

    /// Handles item `i`: pauses for one second (presumably standing in for
    /// real work), then records and prints the sum of its three inputs.
    /// f1..f3 are accepted but not used.
    void process(int i, int f1, int f2, int f3) {
        const ExtractParam& input = params[i];
        const double v1 = input.v1;
        const double v2 = input.v2;
        const double v3 = input.v3;

        usleep(1000000);

        // The lock covers both the shared vector and the log line.
        std::lock_guard<std::mutex> guard(proc_mutex);
        const double val = v1 + v2 + v3;
        results.push_back(val);
        std::cout << "process : " << i << " , results : " << val << "\n";
    }

    /// Prints every recorded result on one line.
    void getResults() {
        for (std::size_t k = 0; k < results.size(); ++k)
            std::cout << results[k] << " ";
        std::cout << std::endl;
    }

    /// Runs all items through `pool` and prints the results (defined below).
    void processOmp(ThreadPool& pool);

private:
    struct ExtractParam {
        double v1, v2, v3;
    };

    std::vector<ExtractParam> params;
    std::vector<double> results;
    std::mutex proc_mutex;
    int c;
};

/// Demo workload with two inputs per item. Each result is stored at the
/// index of the item that produced it.
class Matching {
public:
    /// Prepares 1300 parameter sets (i, i+2) and a zero-filled result slot
    /// for each of them.
    Matching() : c(1300) {
        for (int idx = 0; idx < c; ++idx) {
            MatchParam entry;
            entry.v1 = idx;
            entry.v2 = idx + 2;
            params.push_back(entry);
        }
        results.resize(c);
    }

    /// Number of prepared parameter sets.
    int getCount() { return c; }

    /// Handles item `i`: logs it, pauses for 10 ms, then stores the sum of
    /// its two inputs in results[i]. f1 and f2 are accepted but not used.
    void process(int i, int f1, int f2) {
        std::cout << "process : " << i << "\n";

        const MatchParam& input = params[i];
        const double v1 = input.v1;
        const double v2 = input.v2;

        usleep(10000);

        std::lock_guard<std::mutex> guard(proc_mutex);
        results[i] = v1 + v2;
    }

    /// Prints every result slot on one line.
    void getResults() {
        for (std::size_t k = 0; k < results.size(); ++k)
            std::cout << results[k] << " ";
        std::cout << std::endl;
    }

private:
    struct MatchParam {
        double v1, v2;
    };

    std::vector<MatchParam> params;
    std::vector<double> results;
    std::mutex proc_mutex;
    int c;
};

/// Free-function adapter: forwards the arguments to Extraction::process on `e`.
static void extractionProc(Extraction* e, int i, int f1, int f2, int f3) {
    Extraction& target = *e;
    target.process(i, f1, f2, f3);
}

/// Free-function adapter: forwards the arguments to Matching::process on `m`.
static void matchingProc(Matching* m, int i, int f1, int f2) {
    Matching& target = *m;
    target.process(i, f1, f2);
}

/// Queues every item twice (once through the free-function adapter, once as
/// a bound member function), starts the pool, waits until it reports
/// finished, and prints the collected results.
void Extraction::processOmp(ThreadPool& thread_pool) {
    const int total = getCount();
    for (int idx = 0; idx < total; ++idx) {
        thread_pool.addTask(extractionProc, this, idx, 0, 0, 0);
        thread_pool.addTask(&Extraction::process, this, idx, 0, 0, 0);
    }

    thread_pool.start();

    // Poll once per millisecond until the pool has drained.
    do {
        usleep(1000);
    } while (!thread_pool.finished());

    getResults();
}

/// Demo entry point: runs the Extraction workload on a default-sized pool.
int main(int argc, char** argv) {
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    // The pool is declared first, so it is destroyed after `extraction`.
    ThreadPool thread_pool;
    Extraction extraction;

    extraction.processOmp(thread_pool);
    return 0;
}

对于成员函数，我在这做了一个小trick：用一个静态函数作为线程池的task，该函数持有类实例的指针，再由它转调成员函数。示例中同时也把成员函数指针（&Extraction::process）连同this直接交给了addTask，因此每个下标会被提交两次，相当于同时验证了静态函数和成员函数两种用法的适用性。

另外，我们先将所有的计算任务塞入队列中，再开始计算，这样做的原因只是为了满足我自己的特定需求：我希望所有任务都就位后再统一开始。如果是一般的IO密集型任务或需要即时响应的任务，则可以用一个条件变量来阻塞“从队列中取任务”这一步：空闲线程在该条件变量上等待，有新任务入队时再被唤醒去争抢任务即可。

这样的“计算池”基本上可以满足我的需要，能够在执行高强度运算的同时保证能上网摸鱼不卡。如果有需要的同学可以直接拿去复制粘贴，若是从中获得灵感，有更好的延伸和实现，那就更好不过了。

若愚和小巧

关注

2
点赞
踩
5

收藏

觉得还不错? 一键收藏
1
评论
软件开发随笔：用C++写一个计算密集型适用的自适应线程池

在线程池中，如果为了保守创建较少的线程数，则不能充分利用性能；而创建过多的线程数会有可能在我们使用别的进程时，出现CPU的调度延迟导致缓慢，因为我们的任务是计算密集型，计算时会占用一整个核。在这里我使用一个多余的线程来监听CPU的状态，让线程池可以根据CPU占用率灵活控制本进程的计算量，在最大程度榨取计算机性能的同时保证用户对机器的其他基本操作。
复制链接

扫一扫

专栏目录