关于CUDA实现最值问题

          一转眼一周就过去了,算来入职已经半个月了,项目也进行了十来天,感觉这一周最让人纠结的莫过于寻最值问题了。

           听起来寻最值应该是个很简单的问题,当时的思路是这样的:用并行归约寻最值,并记录下标,于是有了第一版的程序:

           __global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)

            {

                           __share__ int temp_value_share[warp_size];

                           __share__ int temp_idx_share[warp_size];

                          int tid=thread.x+blockDim.x*blockIdx.x;

                          int i,temp_value,temp_value1,temp_idx,temp_idx1;

                          int warpid=thread.x/warp_size,laneid=thread.x%warp_size;

                          if(tid<n)

                          {

                                 temp_value=d_array[tid];

                                temp_idx=thread.x;

                                 for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value<temp_value1)

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                               if(!laneid)

                               {

                                      temp_value_share[warpid]=temp_value;

                                      temp_idx_share[warpid]=temp_idx;

                              }

                            __sychthreads();

                           if(thread.x<warp_size)

                          {

                                   temp_value=temp_value_share[thread.x];

                                   temp_idx=temp_idx_share[thread.x];

                                  for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value<temp_value1)

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                           }

                             if(!thread.x)

                            {

                                    max_value[blockIdx.x]=temp_value;

                                    max_idx[block.x]=temp_idx;

                             }

                           }

             }

            用例测试发现,当启动的线程总数超过数组长度n时,找到的结果可能来自前n个元素之外的数据,在师兄的提示下,有了第二版:

    __global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)

            {

                           __share__ int temp_value_share[warp_size];

                           __share__ int temp_idx_share[warp_size];

                          int tid=thread.x+blockDim.x*blockIdx.x;

                          int i,temp_value,temp_value1,temp_idx,temp_idx1;

                          int warpid=thread.x/warp_size,laneid=thread.x%warp_size;

                            temp_value=-1e30;

                           temp_idx=thread.x;

                          if(tid<n)

                          {

                                    temp_value=d_array[tid];

                           }

                                 for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value<temp_value1)

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                               if(!laneid)

                               {

                                      temp_value_share[warpid]=temp_value;

                                      temp_idx_share[warpid]=temp_idx;

                              }

                           __sychthreads();

                           if(thread.x<warp_size)

                          {

                                   temp_value=temp_value_share[thread.x];

                                   temp_idx=temp_idx_share[thread.x];

                                  for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value<temp_value1)

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                           }

                             if(!thread.x)

                            {

                                    max_value[blockIdx.x]=temp_value;

                                    max_idx[block.x]=temp_idx;

                             }

                     

             }

          这个感觉应该没问题了吧,可事实令我发狂:项目中刚开始结果是对的,可后面就错了。为此调试了很久,结果发现最大值是对的,而下标是错的。很显然,数组中存在多个相等的最大值,这个让我吃尽了苦头,于是有了第三版:

/**
 * Third version: per-block maximum of d_array together with the block-local
 * position (threadIdx.x) of its first occurrence inside the block.
 *
 * Every thread takes part in the reduction. Threads whose global index is
 * past array_len contribute the lowest int value. Stage 1 reduces inside each
 * warp with an XOR butterfly; lane 0 of every warp stores its result in shared
 * memory. Stage 2 lets the first warp reduce those per-warp results, and
 * thread 0 writes the block result. In both stages a larger value wins, and
 * between equal values the smaller position wins.
 *
 * Parameters:
 *   d_array   - input values; each in-range thread reads element
 *               threadIdx.x + blockDim.x * blockIdx.x.
 *   array_len - number of valid elements in d_array.
 *   max_value - output, max_value[blockIdx.x] receives the block maximum
 *               (the lowest int value if the block has no valid element).
 *   max_idx   - output, max_idx[blockIdx.x] receives the smallest threadIdx.x
 *               among the threads that held that maximum.
 *
 * Preconditions:
 *   - warp_size must be a compile-time constant visible here (it sizes the
 *     shared arrays); presumably 32 -- confirm where it is defined.
 *   - blockDim.x must be a multiple of warp_size and at most
 *     warp_size * warp_size, otherwise warpid runs past the shared arrays.
 *   - Requires __shfl_xor_sync (CUDA 9 or newer).
 */
__global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)
{
    // One slot per warp of the block, written by that warp's lane 0.
    __shared__ int temp_value_share[warp_size];
    __shared__ int temp_idx_share[warp_size];

    const unsigned int full_mask = 0xffffffffu;
    // Lowest int value, spelled without <climits>. The original -1e30 does not
    // fit in an int.
    const int lowest = -2147483647 - 1;

    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int i, temp_value, temp_value1, temp_idx, temp_idx1;
    int warpid = threadIdx.x / warp_size, laneid = threadIdx.x % warp_size;
    int num_warps = (blockDim.x + warp_size - 1) / warp_size;

    // Out-of-range threads keep the sentinel but still join every shuffle and
    // the barrier below.
    temp_value = lowest;
    temp_idx = threadIdx.x;
    if (tid < array_len)
    {
        temp_value = d_array[tid];
    }

    // Stage 1: butterfly exchange inside the warp.
    for (i = warp_size / 2; i >= 1; i /= 2)
    {
        temp_value1 = __shfl_xor_sync(full_mask, temp_value, i, warp_size);
        temp_idx1 = __shfl_xor_sync(full_mask, temp_idx, i, warp_size);
        if (temp_value < temp_value1)
        {
            temp_value = temp_value1;
            temp_idx = temp_idx1;
        }
        // Comparison, not assignment: "=" here would overwrite the larger
        // value with the partner's smaller one.
        else if (temp_value == temp_value1)
        {
            if (temp_idx > temp_idx1)
            {
                temp_idx = temp_idx1;
            }
        }
    }

    if (!laneid)
    {
        temp_value_share[warpid] = temp_value;
        temp_idx_share[warpid] = temp_idx;
    }

    // Makes the per-warp results visible to the first warp.
    __syncthreads();

    if (threadIdx.x < warp_size)
    {
        // Only num_warps slots were written. Lanes beyond that re-read slot 0,
        // which changes neither the maximum nor the smallest position, instead
        // of reading uninitialized shared memory.
        int slot = (laneid < num_warps) ? laneid : 0;
        temp_value = temp_value_share[slot];
        temp_idx = temp_idx_share[slot];

        // Stage 2: same butterfly and tie rule over the per-warp results.
        for (i = warp_size / 2; i >= 1; i /= 2)
        {
            temp_value1 = __shfl_xor_sync(full_mask, temp_value, i, warp_size);
            temp_idx1 = __shfl_xor_sync(full_mask, temp_idx, i, warp_size);
            if (temp_value < temp_value1)
            {
                temp_value = temp_value1;
                temp_idx = temp_idx1;
            }
            else if (temp_value == temp_value1)
            {
                if (temp_idx > temp_idx1)
                {
                    temp_idx = temp_idx1;
                }
            }
        }
    }

    if (!threadIdx.x)
    {
        max_value[blockIdx.x] = temp_value;
        max_idx[blockIdx.x] = temp_idx;
    }
}



  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值