研究了一下多线程并发计算的魅力，性能提升确实比较明显。
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <thread>
#include <vector>
/// Callable worker that sums one slice of a range into a caller-owned slot.
///
/// Being a plain function object, it can be handed to std::thread or invoked
/// directly on the current thread.
template<typename Iterator,typename T>
struct accumulate_block{
    /// Adds every element of [first, last) to the value currently stored in
    /// `result` and writes the total back into `result`.
    ///
    /// @param first   Start of the slice to sum.
    /// @param last    One past the end of the slice.
    /// @param result  In/out slot: its incoming value seeds the sum.
    void operator()(Iterator first,Iterator last,T& result)
    {
        // The slot's existing value is the starting point of the fold.
        const T seed = result;
        result = std::accumulate(first,last,seed);
    }
};
/// Sums the range [first, last) on several threads and returns `init` plus
/// that sum.
///
/// The range is cut into `thread_number` contiguous blocks of
/// `length / thread_number` elements. Each of the first `thread_number - 1`
/// blocks is summed by an accumulate_block running on its own std::thread;
/// the calling thread sums the final block, which also absorbs the remainder
/// of the division. The thread count is std::thread::hardware_concurrency()
/// (2 when that reports 0), capped at one thread per `min_per_thread`
/// elements (rounded up) so small inputs do not spawn needless threads.
///
/// @param first  Start of the range. Must be at least a forward iterator,
///               because the range is walked by std::distance and then again
///               by the per-block sums.
/// @param last   One past the end of the range.
/// @param init   Value the total is added onto.
/// @return `init` plus the sum of all elements; `init` itself for an empty
///         range.
///
/// Partial sums start from a value-initialised `T`, so `T()` must behave as
/// the additive identity.
template<typename Iterator,typename T>
T parrallel_accumulate(Iterator first,Iterator last,T init)
{
    using difference_type = typename std::iterator_traits<Iterator>::difference_type;

    const unsigned long length = static_cast<unsigned long>(std::distance(first,last));
    // Empty input: nothing to sum, and (length-1) below would wrap around.
    if(length == 0)
    {
        return init;
    }

    const unsigned long min_per_thread = 25000;
    const unsigned long max_thread_num = (length-1)/min_per_thread + 1;
    const unsigned long hardware_num = std::thread::hardware_concurrency();
    const unsigned long thread_number = std::min((hardware_num != 0 ? hardware_num : 2UL),max_thread_num);
    // Split the work evenly; thread_number <= length here, so block_size >= 1.
    const unsigned long block_size = length/thread_number;

    // `results` is declared before `threads` so the slots outlive every worker.
    std::vector<T> results(thread_number);
    std::vector<std::thread> threads;
    threads.reserve(thread_number - 1);

    // Joins every started worker on scope exit, including when an exception
    // is thrown while spawning; destroying a joinable std::thread would call
    // std::terminate.
    struct thread_joiner
    {
        std::vector<std::thread>& workers;
        explicit thread_joiner(std::vector<std::thread>& w) : workers(w) {}
        ~thread_joiner()
        {
            for(auto& worker : workers)
            {
                if(worker.joinable())
                {
                    worker.join();
                }
            }
        }
    };

    {
        thread_joiner joiner(threads);
        Iterator block_start = first;
        for(unsigned long i = 0;i < thread_number - 1;++i)
        {
            Iterator block_end = block_start;
            std::advance(block_end,static_cast<difference_type>(block_size));
            threads.emplace_back(accumulate_block<Iterator,T>(),block_start,block_end,std::ref(results[i]));
            block_start = block_end;
        }
        // The calling thread handles the last block plus any remainder.
        accumulate_block<Iterator,T>()(block_start,last,results[thread_number-1]);
    } // joiner's destructor has joined all workers by this point

    return std::accumulate(results.begin(),results.end(),init);
}
/// Benchmark driver: fills a vector with 1,000,000 random values in
/// [1, 1000], then times std::accumulate against parrallel_accumulate and
/// prints each sum with its elapsed time in milliseconds.
int main(){
    constexpr std::size_t element_count = 1000000;

    std::mt19937 engine(std::random_device{}());
    // One distribution object is enough; it does not need rebuilding per draw.
    std::uniform_int_distribution<int> distrib(1,1000);

    std::vector<unsigned long> v;
    v.reserve(element_count);
    for(std::size_t i = 0;i < element_count;++i)
    {
        v.emplace_back(distrib(engine));
    }

    // steady_clock is monotonic, which is what interval timing needs.
    auto start = std::chrono::steady_clock::now();
    // 0UL makes the accumulator unsigned long; a plain 0 would sum in int.
    const unsigned long result = std::accumulate(v.begin(),v.end(),0UL);
    auto stop = std::chrono::steady_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
    std::cout << "normal accumulate result:" << result << ",elapsed:"<< std::dec << duration.count() <<" ms."<< '\n';

    start = std::chrono::steady_clock::now();
    const unsigned long result1 = parrallel_accumulate<std::vector<unsigned long>::iterator,unsigned long>(v.begin(),v.end(),0UL);
    stop = std::chrono::steady_clock::now();
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
    std::cout << "parrallel accumulate result:" << result1 << ",elapsed:"<< std::dec << duration.count() <<" ms." << '\n';
    return 0;
}
跑了几次，能看到性能提升有 4-5 倍左右；min_per_thread 可以根据实际的数据总量来调整。