TBB基础之parallel_for

最新推荐文章于 2024-06-19 20:55:20 发布

zoufeiyy

最新推荐文章于 2024-06-19 20:55:20 发布

阅读量1.3w

点赞数 2

分类专栏： TBB 文章标签： parallel iterator functor function constructor class

本文链接：https://blog.csdn.net/zoufeiyy/article/details/1887579

版权

TBB 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

从现在开始我们要看一些TBB里更实在的一些东西了，之所以说它实在，是因为这些内容是切实地能帮助我们去解决一些并行编程里的问题。

首先看的也是最简单的parallel_for。

我们还是先从一个例子开始看起：

问题：对一个数组里的每个元素施加一个操作Foo(...)

串行化的版本：

  
  
   
   
   
   1 
   
   void
   
    SerialApplyFoo (
   
   int
   
    a[], size_t n) {

   
   2 
   
       
   
   for
   
    (size_t i 
   
   =
   
    
   
   0
   
   ; i 
   
   <
   
    n; 
   
   ++
   
    i)

   
   3 
   
           Foo(a[i]);

   
   4 
   
   }

使用TBB并行化的版本：

  
  
   
   
   
    1 
   
   #include 
   
   "
   
   tbb/task_scheduler_init.h
   
   "
   
    
   
   
 2 
   
   #include 
   
   "
   
   tbb/blocked_range.h
   
   "
   
    
   
   
 3 
   
   #include 
   
   "
   
   tbb/parallel_for.h
   
   "
   
    
   
   
 4 
   
    
   
   
 5 
   
   
   
   using
   
    
   
   namespace
   
    tbb;

   
    6 
   
    
   
   
 7 
   
   
   
   //
   
    对每个元素执行该操作
   
    

   
    8 
   
   
   
   void
   
    Foo(
   
   int
   
    value)

   
    9 
   
   {

   
   10 
   
       
   
   //
   
    Applied function
   
    

   
   11 
   
   
   
   }

   
   12 
   
    

   
   13 
   
   
   
   class
   
    ApplyFoo

   
   14 
   
   {

   
   15 
   
       
   
   int
   
    
   
   *
   
    
   
   const
   
    my_a;

   
   16 
   
   
   
   public
   
   :

   
   17 
   
       
   
   void
   
    
   
   operator
   
    () (
   
   const
   
    blocked_range
   
   <
   
   size_t
   
   >
   
    
   
   &
   
    r) 
   
   const
   
    

   
   18 
   
       {

   
   19 
   
           
   
   int
   
    
   
   *
   
    a 
   
   =
   
    my_a;

   
   20 
   
           
   
   for
   
    (size_t i 
   
   =
   
    r.begin(); i 
   
   !=
   
    r.end(); 
   
   ++
   
    i)

   
   21 
   
               Foo(a[i]);

   
   22 
   
       }

   
   23 
   
       

   
   24 
   
       ApplyFoo(
   
   int
   
    a[]) : my_a(a) {}

   
   25 
   
   };

   
   26 
   
    

   
   27 
   
   
   
   int
   
    main(
   
   int
   
    argc, 
   
   char
   
   *
   
    argv[])

   
   28 
   
   {

   
   29 
   
       
   
   //
   
    创建task scheduler
   
    

   
   30 
   
   
   
       task_scheduler_init init;

   
   31 
   
           
   
   const
   
    
   
   int
   
    n 
   
   =
   
    
   
   100
   
   ;

   
   32 
   
       
   
   int
   
    a[n];

   
   33 
   
       
   
   for
   
    (
   
   int
   
    i 
   
   =
   
    
   
   0
   
   ; i 
   
   <
   
    n; i 
   
   ++
   
   )

   
   34 
   
           a[i] 
   
   =
   
    i;

   
   35 
   
       
   
   //
   
    TBB会把数组分成若干的block

   
   36 
   
       
   
   //
   
    对block调用ApplyFoo这个functor
   
    
   
   
37 
   
   
   
       parallel_for(blocked_range
   
   <
   
   size_t
   
   >
   
   (
   
   0
   
   , n), ApplyFoo(a), simple_partitioner());

   
   38 
   
       
   
   return
   
    
   
   0
   
   ;

   
   39 
   
   }

   
   40

这个其实就是我们最早开始看TBB时的一个例子。

我们看到这里面多了好几个陌生的东西：

blocked_range
parallel_for

block

OK，我们一个个来看，先说blocked_range<Value>，这个template class表述了一个一维的迭代空间（iteration space）。同样的，我们先来看看它的declaration（部分无关代码已裁减），在tbb/blocked_range.h里：

  
  
   
   
   
    1 
   
   template
   
   <
   
   typename Value
   
   >
   
    
   
   
 2 
   
   
   
   class
   
    blocked_range {

   
    3 
   
   
   
   public
   
   :

   
    4 
   
       
   
   //
   
   ! Type of a value
   
    
   
   
 5 
   
   
   
       
   
   /*
   
   * Called a const_iterator for sake of algorithms that need to treat a blocked_range

   
    6 
   
           as an STL container. 
   
   */
   
    
   
   
 7 
   
       typedef Value const_iterator;

   
    8 
   
    
   
   
 9 
   
       
   
   //
   
   ! Type for size of a range
   
    
   
   
10 
   
   
   
       typedef size_t size_type;

   
   11 
   
    
   
   
12 
   
       
   
   /*
   
   * Requires that Value have a default constructor. 
   
   */
   
    
   
   
13 
   
       blocked_range() : my_begin(), my_end();

   
   14 
   
    
   
   
15 
   
       
   
   //
   
   ! Construct range over half-open interval [begin,end), with the given grainsize.
   
    

   
   16 
   
   
   
       blocked_range( Value begin_, Value end_, size_type grainsize_
   
   =
   
   1
   
    ) : 

   
   17 
   
           my_end(end_), my_begin(begin_), my_grainsize(grainsize_);

   
   18 
   
    

   
   19 
   
       
   
   //
   
   ! Beginning of range.
   
    

   
   20 
   
   
   
       const_iterator begin() 
   
   const
   
    {
   
   return
   
    my_begin;}

   
   21 
   
    

   
   22 
   
       
   
   //
   
   ! One past last value in range.
   
    

   
   23 
   
   
   
       const_iterator end() 
   
   const
   
    {
   
   return
   
    my_end;}

   
   24 
   
    

   
   25 
   
       
   
   //
   
   ! Size of the range
   
    
   
   
26 
   
   
   
       
   
   /*
   
   * Unspecified if end()
   
   */
   
    

   
   27 
   
       size_type size() 
   
   const
   
   ;

   
   28 
   
    
   
   
29 
   
       
   
   //
   
   ! The grain size for this range.
   
    

   
   30 
   
   
   
       size_type grainsize() 
   
   const
   
    {
   
   return
   
    my_grainsize;}

   
   31 
   
    
   
   
32 
   
       
   
   //
   
   ! True if range is empty.
   
    

   
   33 
   
   
   
       
   
   bool
   
    empty() 
   
   const
   
    {
   
   return
   
    
   
   !
   
   (my_begin
   
   <
   
   my_end);}

   
   34 
   
    

   
   35 
   
       
   
   //
   
   ! True if range is divisible.
   
    

   
   36 
   
   
   
       
   
   /*
   
   * Unspecified if end()
   
   */
   
    

   
   37 
   
       
   
   bool
   
    is_divisible() 
   
   const
   
    {
   
   return
   
    my_grainsize
   
   <
   
   size();}

   
   38 
   
    

   
   39 
   
       
   
   //
   
   ! Split range.  
   
    

   
   40 
   
   
   
       
   
   /*
   
   * The new Range *this has the second half, the old range r has the first half. 

   
   41 
   
           Unspecified if end()
   
   */
   
    
   
   
42 
   
       blocked_range( blocked_range
   
   &
   
    r, split ) : 

   
   43 
   
           my_end(r.my_end),

   
   44 
   
           my_begin(do_split(r)),

   
   45 
   
           my_grainsize(r.my_grainsize);

   
   46 
   
   };

从代码里可以看到blocked_range 有3个constructor，一个不接收参数，一个处理split（split的概念后面再讲），而我们示例里用到的是：

  
  
   
   
   
       
   
   //
   
   ! Construct range over half-open interval [begin,end), with the given grainsize.
   
   
    
   
   blocked_range( Value begin_, Value end_, size_type grainsize_
   
   =
   
   1
   
    ) : 
        my_end(end_), my_begin(begin_), my_grainsize(grainsize_)

第一个参数表示起始，第二个参数表示结束，它们的类型为const_iterator，表示的区间为[begin，end)这样一个半开区间。

第三个参数，grainsize，表示的是一个“合适的大小”块，这个块会在一个循环中进行处理，如果数组比这个grainsize还大，parallel_for会把它分割为独立的block，然后分别进行调度（有可能由多个线程进行处理）。

这样我们知道，grainsize其实决定了TBB什么时候对数据进行划分，如果我们把grainsize指定得太小，那就可能会导致产生过多的block，从而使得不同block间的overhead增加（比如多个线程间切换的代价），有可能会使性能下降。相反，如果grainsize设得太大，以致于这个数组几乎没有被划分，那又会导致不能发挥parallel_for期望达到的并行效果，也没有达到理想的性能。

所以我们在决定grainsize时需要小心，最好是能够经过调整测试后得到的值，当然你也可以如本例中一样不指定，此时grainsize取缺省值1（见上面constructor声明中的grainsize_=1），这个值一般不是最优的。

一个调整grainsize的经验性步骤：

首先把grainsize设得比预想的要大一些，通常设为10000
在单处理机机器上运行，得到性能数据
把grainsize减半，看性能降低多少，如果降低在5%-10%之间，那这个grainsize就已经是一个不错的设定了

partitioner

看完blocked_range ，再来看跟它很关联的另一个概念partitioner，顾名思义，partitioner就是指示怎么进行划分block的东东。在示例的parallel_for调用中，第3个参数就指定了一个partitioner，这里我们使用的是simple_partitioner。

TBB里提供了两个partitioner，一个是我们用到的simple_partitioner，另一个是auto_partitioner。

simple_partitioner是parallel_for（以及后面会讲到的parallel_reduce，parallel_scan）的缺省partitioner。simple_partitioner有如下特性：

概念简单
确保分割不会超过grainsize大小，这样你可以假定operator()的最大范围不会超过grainsize
它可以针对特定机器调节

simple_partitioner的缺点在于它需要你确定出一个合适的grainsize，而合适的grainsize并不是那么容易得到的。

另一个partitioner：auto_partitioner，它依赖于一定的规则自动决定划分，在线程间负载均衡和线程切换代价间寻找一个平衡，当然，普适的方案一般不会对所有情况都是最优的～～～

如果我们想用auto_partitioner，那只要把示例里的simple_partitioner替换一下即可，要注意的是，既然auto_partitioner是自动决定分割的，那指定的grainsize就没有太大意义了。

一般情况下，建议使用auto_partitioner，除非你有足够的时间和精力去优化出一个比较好的grainsize～～～

parallel_for

最后，终于看到我们最关键的主题了：parallel_for，这是一个算法，类似于STL里的sort、for_each等。

直接步入主题，我们来看parallel_for的源码吧，在tbb/parallel_for.h中：

  
  
   
   
   
    1 
   
   //
   
   ! Parallel iteration over range.
   
    

   
    2 
   
   /*
   
   * The body b must allow:                                      /n

   
    3 
   
           b(r)                    Apply function to range r.      /n

   
    4 
   
       r must define:                                              /n

   
    5 
   
           r.is_divisible()        True if range should be divided /n

   
    6 
   
           r.empty()               True if range is empty          /n

   
    7 
   
           R r2(r,split())         Split range into r2 and r.      /n

   
    8 
   
       @ingroup algorithms 
   
   */
   
    

   
    9 
   
   template
   
   <
   
   typename Range, typename Body
   
   >
   
    
   
   
10 
   
   
   
   void
   
    parallel_for( 
   
   const
   
    Range
   
   &
   
    range, 
   
   const
   
    Body
   
   &
   
    body ) {

   
   11 
   
       
   
   if
   
   ( 
   
   !
   
   range.empty() ) {

   
   12 
   
           typedef typename 
   
   internal
   
   ::start_for
   
   <
   
   Range,Body
   
   >
   
    start_type;

   
   13 
   
           start_type
   
   &
   
    a 
   
   =
   
    
   
   *
   
   new
   
   (task::allocate_root()) start_type(range,body,simple_partitioner());

   
   14 
   
           task::spawn_root_and_wait(a);

   
   15 
   
       }

   
   16 
   
   }

   
   17 
   
    
   
   
18 
   
   
   
   //
   
   ! Parallel iteration over range using a partitioner.
   
    

   
   19 
   
   /*
   
   * The body b must allow:                                      /n

   
   20 
   
           b(r)                    Apply function to range r.      /n

   
   21 
   
       r must define:                                              /n

   
   22 
   
           r.is_divisible()        True if range can be divided /n

   
   23 
   
           r.empty()               True if range is empty          /n

   
   24 
   
           R r2(r,split())         Split range into r2 and r.      /n

   
   25 
   
       The partitioner p must define: /n

   
   26 
   
           p.should_execute_range(r,t)   True if r should be executed to completion without further splits. /n  

   
   27 
   
           P p2(p,split())               Split the partitioner into p2 and p.      /n

   
   28 
   
       @ingroup algorithms 
   
   */
   
    
   
   
29 
   
   template
   
   <
   
   typename Range, typename Body, typename Partitioner
   
   >
   
    

   
   30 
   
   
   
   void
   
    parallel_for( 
   
   const
   
    Range
   
   &
   
    range, 
   
   const
   
    Body
   
   &
   
    body, 
   
   const
   
    Partitioner
   
   &
   
    partitioner ) {

   
   31 
   
       
   
   if
   
   ( 
   
   !
   
   range.empty() ) {

   
   32 
   
           typedef typename 
   
   internal
   
   ::start_for
   
   <
   
   Range,Body,Partitioner
   
   >
   
    start_type;

   
   33 
   
           start_type
   
   &
   
    a 
   
   =
   
    
   
   *
   
   new
   
   (task::allocate_root()) start_type(range,body,partitioner);

   
   34 
   
           task::spawn_root_and_wait(a);

   
   35 
   
       }

   
   36 
   
   }

我们看到，parallel_for有两个版本，一个接收两个参数，一个接收三个参数：

range：指定划分block的范围
body：指定对block应用的操作，Body可以看成是一个操作子functor，它的operator(...)会以blocked_range 为参数进行调用，当然如果我们传过来的是一个函数指针也是可以的，只要它能以blocked_range 为参数进行调用
partitioner：指定划分器，可选的两种simple_partitioner和auto_partitioner

其实从parallel_for的prototype declaration和definition中我们可以明显地看到generic programming的意思，这里Range、Body、Partitioner其实都是GP里的concept，它们要求满足一定的policy，因此是典型的基于policy的design，当然这里的policy比起STL，有过之而无不及了，有兴趣的可以参考《Generic Programming and the STL》（泛型编程与STL，电力出版社），这本书里对GP和policy based design介绍得还是很详细的。

标签: 多核 TBB parallel_for block partition

zoufeiyy

关注

2
点赞
踩
17

收藏

觉得还不错? 一键收藏
4
评论
TBB基础之parallel_for

从现在开始我们要看一些TBB里更实在的一些东西了，之所以说它实在，是因为这些内容是切实地能帮助我们去解决一些并行编程里的问题。首先看的也是最简单的parallel_for。我们还是先从一个例子开始看起：问题：对一个数组里的每个元素施加一个操作Foo(...)串行化的版本： <!--Code highlighting produced by Actipro Cod
复制链接

扫一扫

专栏目录