# 多线程编程学习笔记-海量数据求和

## 背景

### 单线程：

int main()
{
boost::posix_time::ptime start =boost::posix_time::microsec_clock::local_time();
uint64_t result = 0;
for (int i = 0; i < max_sum_item; i++)
result += i;
std::cout << "sum="<<result<<std::endl;
boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration timeTaken = end - start;
std::cout <<"cost time:"<< timeTaken.total_milliseconds() << std::endl;
}

sum=499999999500000000
cost time:4061

const int max_sum_item = 1000000000;
// Adds every integer in [0, max_sum_item) into *total.
// The caller's variable is reset to zero first and is then updated through
// the pointer on every iteration.
void do_sum(uint64_t *total)
{
    *total = 0;

    int n = 0;
    while (n < max_sum_item)
    {
        *total += n;
        ++n;
    }
}
// Runs do_sum on a single worker thread, waits for it to finish, then prints
// the sum and the elapsed wall-clock time in milliseconds.
int main()
{
    boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();
    uint64_t result = 0;

    // Launch the worker; it writes its total into result through the pointer.
    boost::thread worker(do_sum, &result);

    // result must not be read until the worker has finished.
    worker.join();
    std::cout << "sum=" << result << std::endl;

    boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration timeTaken = end - start;
    std::cout << "cost time:" << timeTaken.total_milliseconds() << std::endl;
}

sum=499999999500000000
cost time:4346

// Adds every integer in [0, max_sum_item) using a local accumulator and
// writes the finished total to *total exactly once, at the end.
void do_sum(uint64_t *total)
{
    uint64_t running = 0;

    int n = 0;
    while (n < max_sum_item)
    {
        running += n;
        ++n;
    }

    *total = running;
}

sum=499999999500000000
cost time:4068

### 多线程：

// Adds the value behind every pointer stored in part_sums to result
// (the lambda captures result by reference so it can accumulate into it).
std::for_each(part_sums.begin(), part_sums.end(), [&result] (uint64_t *subtotal) { result += *subtotal; });

std::vector<uint64_t *> part_sums;
// Sums the sums_to_do consecutive integers starting at start_val, i.e. the
// half-open range [start_val, start_val + sums_to_do), and stores the total
// in *final.
//
// final       receives the partial sum (written once, after the loop)
// start_val   first integer to add
// sums_to_do  how many consecutive integers to add; 0 yields a sum of 0
void do_partial_sum(uint64_t *final, int start_val, int sums_to_do)
{
    // Accumulate locally and publish once, instead of writing through the
    // pointer on every iteration.
    uint64_t sub_result = 0;

    for (int i = start_val; i < start_val + sums_to_do; i++)
        sub_result += i;

    *final = sub_result;
}
int main()
{
boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();
part_sums.clear();
part_sums1.clear();
for (int i = 0; i < threads_to_use; i++)
{
part_sums.push_back(new uint64_t(0));
}
for (int start_val = 0, i = 0; start_val < max_sum_item; start_val += sums_per_thread, i++)
{
}
for (int i = 0; i < threads_to_use; i++)
t[i]->join();
uint64_t result = 0;
// std::for_each(part_sums.begin(), part_sums.end(),myfunc);
//vector中元素求和
for(int i = 0; i < threads_to_use; i++)
{
uint64_t *temp = part_sums[i];
// std::cout<<*temp<<std::endl;
result += *temp;//注意这里的取值方式
}
// result = accumulate(part_sums1.begin() , part_sums1.end() ,0);
for (int i = 0; i < threads_to_use; i++)
{
delete t[i];
delete part_sums[i];
}
std::cout << "sum="<<result<<std::endl;

boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration timeTaken = end - start;
std::cout <<"cost time:"<< timeTaken.total_milliseconds() << std::endl;
}

sum=499999999500000000
cost time:1907

// Iterator-based alternative: *it is a pointer to a partial sum, so **it reads
// the value it points to and adds it to result.
for (std::vector<boost::uint64_t *>::iterator it = part_sums.begin(); it != part_sums.end(); ++it)  result += **it;

### 线程数和任务数的分配问题

int sums_per_thread = max_sum_item / threads_to_use;

for (int start_val = 0, i = 0; start_val < max_sum_item; start_val += sums_per_thread, i++)
{
    // Lump extra bits onto last thread if work items is not equally divisible by number of threads
    int sums_to_do = sums_per_thread;

    // Tail case: more than one but fewer than two full slices remain, so this
    // thread takes everything that is left.
    if (start_val + sums_per_thread < max_sum_item && start_val + sums_per_thread * 2 > max_sum_item)
        sums_to_do = max_sum_item - start_val;

    t[i] = new boost::thread(do_partial_sum, part_sums[i], start_val, sums_to_do);

    // Stop as soon as the first non-standard (oversized tail) slice has been
    // handed out. Without this break the outer loop would run once more
    // (start_val = 999,999,994 with 7 threads) and create one unnecessary,
    // incorrect extra thread.
    if (sums_to_do != sums_per_thread)
        break;
}

const int max_sum_item = 1000000000;
std::vector<uint64_t *> part_sums;
// Computes the sum of the sums_to_do consecutive integers that begin at
// start_val and writes it to *final once the loop has finished.
void do_partial_sum(uint64_t *final, int start_val, int sums_to_do)
{
    uint64_t running = 0;

    int n = start_val;
    while (n < start_val + sums_to_do)
    {
        running += n;
        ++n;
    }

    *final = running;
}
int main()
{
boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();
uint64_t result = 0;
part_sums.clear();
part_sums1.clear();
for (int i = 0; i < threads_to_use; i++)
{
part_sums.push_back(new uint64_t(0));
}

for (int start_val = 0, i = 0; start_val < max_sum_item; start_val += sums_per_thread, i++)
{
// Lump extra bits onto last thread if work items is not equally divisible by number of threads

if (start_val + sums_per_thread < max_sum_item && start_val + sums_per_thread * 2 > max_sum_item)
sums_to_do = max_sum_item - start_val;//尾部处理，一倍间距之上，两倍间距以内

break;
}
for (int i = 0; i < threads_to_use; i++)
t[i]->join();
//vector中元素求和
int tt=0;
for(int i = 0; i < threads_to_use; i++)
{
uint64_t *temp = part_sums[i];
// std::cout<<*temp<<std::endl;
result += *temp;
}
// result = accumulate(part_sums1.begin() , part_sums1.end() ,0);
for (int i = 0; i < threads_to_use; i++)
{
delete t[i];
delete part_sums[i];
// delete part_sums1[i];
}
std::cout << "sum="<<result<<std::endl;

boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration timeTaken = end - start;
std::cout <<"cost time:"<< timeTaken.total_milliseconds() << std::endl;
//************************多线程测试************************************//
return 0;
}

sum=499999999500000000
cost time:546

### 准确记录每个线程的耗时情况

std::vector<uint64_t *> part_sums;
boost::mutex coutmutex;//同步对象
// Sums the sums_to_do consecutive integers starting at start_val, stores the
// total in *final, and prints a "Start" and an "End" trace line (thread id,
// workload, result, elapsed milliseconds) for this thread.
//
// Stream output from several threads can interleave, so every use of
// std::cout is wrapped in a lock on coutmutex. Scoped locks are used so the
// mutex is always released when the block ends; a lock that is never released
// would leave every other thread waiting forever.
void do_partial_sum(uint64_t *final, int start_val, int sums_to_do)
{
    {
        boost::mutex::scoped_lock lock(coutmutex);
        std::cout << "Start: TID " << boost::this_thread::get_id() << " starting at " << start_val << ", workload of " << sums_to_do << " items" << std::endl;
    }

    // Timing starts after the trace line so that waiting for the output lock
    // is not counted as work.
    boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();

    uint64_t sub_result = 0;

    for (int i = start_val; i < start_val + sums_to_do; i++)
        sub_result += i;

    *final = sub_result;
    boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration timeTaken = end - start;

    {
        boost::mutex::scoped_lock lock(coutmutex);
        std::cout << "End  : TID " << boost::this_thread::get_id() << " with result " << sub_result << ", time taken "<< timeTaken.total_milliseconds() << std::endl;
    }
}

Start: TID 7f7a85a6a700 starting at 142857142, workload of 142857142 items
Start: TID 7f7a84668700 starting at 428571426, workload of 142857142 items
Start: TID 7f7a83266700 starting at 714285710, workload of 142857142 items
Start: TID 7f7a82865700 starting at 857142852, workload of 142857148 items
Start: TID 7f7a8646b700 starting at 0, workload of 142857142 items
Start: TID 7f7a85069700 starting at 285714284, workload of 142857142 items
Start: TID 7f7a83c67700 starting at 571428568, workload of 142857142 items
End : TID 7f7a85a6a700 with result 30612244459183675, time taken 542
End : TID 7f7a82865700 with result 132653065561224474, time taken 543
End : TID 7f7a84668700 with result 71428570500000003, time taken 543
End : TID 7f7a8646b700 with result 10204081438775511, time taken 544
End : TID 7f7a83266700 with result 112244896540816331, time taken 544
End : TID 7f7a83c67700 with result 91836733520408167, time taken 582
End : TID 7f7a85069700 with result 51020407479591839, time taken 583
sum=499999999500000000
cost time:583

### 完整代码

C++11 版本

#include <iostream>       // for std::cout
#include <cstdint>        // for uint64_t
#include <chrono>     // for std::chrono::high_resolution_clock
#include <thread>     // for std::thread, std::this_thread
#include <vector>     // for std::vector
#include <algorithm>  // for std::for_each
#include <cassert>        // for assert

#define TRACE

#ifdef TRACE
#include <mutex>      // for std::mutex

std::mutex coutmutex;
#endif

std::vector<uint64_t *> part_sums;
const int max_sum_item = 1000000000;

// Sums the sums_to_do consecutive integers starting at start_val and stores
// the total in *final.
//
// When TRACE is defined, a "Start" and an "End" line (thread id, workload,
// result, elapsed seconds) are also printed. std::cout is shared between
// threads, so each print holds coutmutex through a std::lock_guard, which
// releases the mutex automatically at the end of its scope.
void do_partial_sum(uint64_t *final, int start_val, int sums_to_do)
{
#ifdef TRACE
    {
        std::lock_guard<std::mutex> guard(coutmutex);
        std::cout << "Start: TID " << std::this_thread::get_id() << " starting at " << start_val << ", workload of " << sums_to_do << " items" << std::endl;
    }

    auto start = std::chrono::high_resolution_clock::now();
#endif

    uint64_t sub_result = 0;

    for (int i = start_val; i < start_val + sums_to_do; i++)
        sub_result += i;

    *final = sub_result;

#ifdef TRACE
    auto end = std::chrono::high_resolution_clock::now();

    {
        std::lock_guard<std::mutex> guard(coutmutex);
        // Tick count multiplied by the clock period (num/den) gives seconds.
        std::cout << "End  : TID " << std::this_thread::get_id() << " with result " << sub_result << ", time taken "
                  << (end - start).count() * ((double) std::chrono::high_resolution_clock::period::num / std::chrono::high_resolution_clock::period::den) << std::endl;
    }
#endif
}

int main()
{
part_sums.clear();

for (int i = 0; i < threads_to_use; i++)
part_sums.push_back(new uint64_t(0));

auto start = std::chrono::high_resolution_clock::now();

for (int start_val = 0, i = 0; start_val < max_sum_item; start_val += sums_per_thread, i++)
{
// Lump extra bits onto last thread if work items is not equally divisible by number of threads

if (start_val + sums_per_thread < max_sum_item && start_val + sums_per_thread * 2 > max_sum_item)
sums_to_do = max_sum_item - start_val;

break;
}

for (int i = 0; i < threads_to_use; i++)
t[i]->join();

uint64_t result = 0;

std::for_each(part_sums.begin(), part_sums.end(), [&result] (uint64_t *subtotal) { result += *subtotal; });

auto end = std::chrono::high_resolution_clock::now();

for (int i = 0; i < threads_to_use; i++)
{
delete t[i];
delete part_sums[i];
}

assert(result == uint64_t(499999999500000000));

std::cout << "Result is correct" << std::endl;

std::cout << "Time taken: " << (end - start).count() * ((double) std::chrono::high_resolution_clock::period::num / std::chrono::high_resolution_clock::period::den) << std::endl;
}

boost版本

#include <iostream>                   // for std::cout
#include <boost/cstdint.hpp>      // for boost::boost::uint64_t
#include <boost/chrono.hpp>           // for boost::chrono::high_resolution_clock
#include <vector>                 // for std::vector
#include <cassert>                    // for assert

#define TRACE

#ifdef TRACE

boost::mutex coutmutex;
#endif

std::vector<boost::uint64_t *> part_sums;
const int max_sum_item = 1000000000;

// Sums the sums_to_do consecutive integers starting at start_val and stores
// the total in *final.
//
// When TRACE is defined, a "Start" and an "End" line (thread id, workload,
// result, elapsed seconds) are also printed. std::cout is shared between
// threads, so each print holds coutmutex through a scoped lock, which releases
// the mutex automatically at the end of its scope.
void do_partial_sum(boost::uint64_t *final, int start_val, int sums_to_do)
{
#ifdef TRACE
    {
        boost::mutex::scoped_lock lock(coutmutex);
        std::cout << "Start: TID " << boost::this_thread::get_id() << " starting at " << start_val << ", workload of " << sums_to_do << " items" << std::endl;
    }

    boost::chrono::high_resolution_clock::time_point start = boost::chrono::high_resolution_clock::now();
#endif

    boost::uint64_t sub_result = 0;

    for (int i = start_val; i < start_val + sums_to_do; i++)
        sub_result += i;

    *final = sub_result;

#ifdef TRACE
    boost::chrono::high_resolution_clock::time_point end = boost::chrono::high_resolution_clock::now();

    {
        boost::mutex::scoped_lock lock(coutmutex);
        // Tick count multiplied by the clock period (num/den) gives seconds.
        std::cout << "End  : TID " << boost::this_thread::get_id() << " with result " << sub_result << ", time taken "
                  << (end - start).count() * ((double) boost::chrono::high_resolution_clock::period::num / boost::chrono::high_resolution_clock::period::den) << std::endl;
    }
#endif
}

// Splits the summation across threads (the last thread also takes any
// remainder), adds the partial sums together, checks the result and prints the
// elapsed time in seconds.
//
// NOTE(review): threads_to_use, sums_per_thread and t are not declared
// anywhere in this listing. They are assumed to be provided by surrounding
// code; t must be an indexable collection of boost::thread* with at least
// threads_to_use slots. Confirm against the full source.
int main()
{
    part_sums.clear();

    // One heap-allocated result slot per thread.
    for (int i = 0; i < threads_to_use; i++)
        part_sums.push_back(new boost::uint64_t(0));

    boost::chrono::high_resolution_clock::time_point start = boost::chrono::high_resolution_clock::now();

    for (int start_val = 0, i = 0; start_val < max_sum_item; start_val += sums_per_thread, i++)
    {
        // Lump extra bits onto last thread if work items is not equally divisible by number of threads
        int sums_to_do = sums_per_thread;

        if (start_val + sums_per_thread < max_sum_item && start_val + sums_per_thread * 2 > max_sum_item)
            sums_to_do = max_sum_item - start_val;

        t[i] = new boost::thread(do_partial_sum, part_sums[i], start_val, sums_to_do);

        // The oversized tail slice is always the last one; stop here so that
        // no extra thread is created for the leftover range.
        if (sums_to_do != sums_per_thread)
            break;
    }

    // Every partial sum must be complete before it is read.
    for (int i = 0; i < threads_to_use; i++)
        t[i]->join();

    boost::uint64_t result = 0;

    for (std::vector<boost::uint64_t *>::iterator it = part_sums.begin(); it != part_sums.end(); ++it)
        result += **it;

    boost::chrono::high_resolution_clock::time_point end = boost::chrono::high_resolution_clock::now();

    // Release the thread objects and the result slots.
    for (int i = 0; i < threads_to_use; i++)
    {
        delete t[i];
        delete part_sums[i];
    }

    assert(result == boost::uint64_t(499999999500000000));

    std::cout << "Result is correct" << std::endl;

    // Tick count multiplied by the clock period (num/den) gives seconds.
    std::cout << "Time taken: " << (end - start).count() * ((double) boost::chrono::high_resolution_clock::period::num / boost::chrono::high_resolution_clock::period::den) << std::endl;
}

## 多核处理器

// Repeats the benchmark once for each thread count from 1 up to the value
// returned by std::thread::hardware_concurrency().
// NOTE(review): hardware_concurrency() is allowed to return 0 when the value
// cannot be determined; in that case this loop body never runs.
for (int threads_to_use = 1; threads_to_use <= static_cast<int>(std::thread::hardware_concurrency()); threads_to_use++)
{
// original code
// (placeholder: start and end used below are not defined in this excerpt;
// presumably the earlier benchmark body goes here - confirm)

// Print the elapsed time for this thread count; the tick count multiplied by
// the clock period (num/den) gives seconds. "core" is pluralised when the
// count is not 1.
std::cout << "Time taken with " << threads_to_use << " core" << (threads_to_use != 1? "s":"") << ": " << (end - start).count() * ((double) std::chrono::high_resolution_clock::period::num / std::chrono::high_resolution_clock::period::den) << std::endl;
}

{
    // original code

    // Print the elapsed milliseconds for this thread count; "core" is
    // pluralised when the count is not 1. String literals must use plain
    // ASCII double quotes - typographic quotes do not compile.
    std::cout << "Time taken with " << threads_to_use << " core" << (threads_to_use != 1? "s":"") << ": " << timeTaken.total_milliseconds()<< std::endl;
}

// Skeleton of the per-thread-count benchmark driver as it appears in these notes.
// NOTE(review): threads_to_use and timeTaken are not declared anywhere in this
// excerpt and the inner braces are empty; presumably the loop header and the
// benchmark body were dropped from the listing - confirm against the full source.
int main()
{
{
}
// Prints the elapsed milliseconds for the current thread count; "core" is
// pluralised when the count is not 1.
std::cout << "Time taken with " << threads_to_use << " core" << (threads_to_use != 1? "s":"") << ": " << timeTaken.total_milliseconds()<< std::endl;
return 0;
}

Time taken with 1 core: 3874
Time taken with 2 cores: 1927
Time taken with 3 cores: 1289
Time taken with 4 cores: 965
Time taken with 5 cores: 773
Time taken with 6 cores: 643
Time taken with 7 cores: 552
Time taken with 8 cores: 482
Time taken with 9 cores: 429
Time taken with 10 cores: 386
Time taken with 11 cores: 358
Time taken with 12 cores: 327
Time taken with 13 cores: 406
Time taken with 14 cores: 387
Time taken with 15 cores: 374
Time taken with 16 cores: 394
Time taken with 17 cores: 337
Time taken with 18 cores: 304
Time taken with 19 cores: 314
Time taken with 20 cores: 303
Time taken with 21 cores: 296
Time taken with 22 cores: 285
Time taken with 23 cores: 279
Time taken with 24 cores: 267

## 线程同步

import matplotlib.pyplot as plt
import numpy as np

# Measured run time in milliseconds for 1..24 threads, one entry per count.
cost_ms = [3874,1927,1289,965,773,643,552,482,429,386,358,327,406,387,374,394,337,304,314,303,296,285,279,267]

# Thread counts 1..24 as a plain Python list.
core_counts = np.arange(1, 25).tolist()

# Sanity output: container type, both lengths, and the raw timings.
print(type(core_counts))
print(len(core_counts))
print(len(cost_ms))
print(cost_ms)

# Red dashed line of run time against thread count.
plt.plot(core_counts, cost_ms, 'r--')
plt.axis([1, 24, 0, 4000])
plt.title('cost time of cores')
plt.xlabel('number of cores')
plt.ylabel('cost time/milliseconds')
plt.show()