关于CUDA实现最值问题

最新推荐文章于 2023-12-07 15:22:12 发布

zhbxlm

最新推荐文章于 2023-12-07 15:22:12 发布

阅读量3.7k

点赞数 1

分类专栏： CUDA

本文链接：https://blog.csdn.net/zhbxlm/article/details/52006557

版权

CUDA 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

一转眼一周就过去了，算来入职已经半个月了，项目也进行了十来天，感觉这一周最让人纠结的莫过于寻最值问题了。

听起来寻最值应该是个很简单的问题。当时的思路是这样的：用并行归约寻找最值，同时记录最值所在的下标，于是有了第一版的程序：

// Max_Reduce (first attempt): per-block maximum of d_array plus the
// block-local index (threadIdx.x) of the thread that held it.
//
// Launch layout: 1-D grid, 1-D block; only the .x components are used.
// Outputs: max_value[blockIdx.x] and max_idx[blockIdx.x], one entry per block.
// Requires CUDA 9+ for __shfl_xor_sync.
//
// The listing has been made syntactically valid (__shared__, threadIdx,
// blockIdx, __shfl_xor_sync, __syncthreads, array_len instead of the
// undeclared n, balanced braces). Its logic is intentionally left as the
// article presents it, including the flaw the article goes on to describe.
//
// KNOWN DEFECTS -- do not use this version as-is:
//   * The bounds check wraps the whole body. Threads with tid >= array_len
//     skip it, yet the shuffles below still name all 32 lanes, so a lane can
//     receive a value from a lane that never loaded an element.
//   * __syncthreads() sits inside that same branch, which not every thread of
//     a partially filled block takes.
//   * Stage 2 reads 32 shared slots, but only warps that entered the branch
//     wrote theirs; the remaining slots are uninitialized.
// The article's later versions guard only the load instead of the whole body.
__global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)
{
    // Compile-time warp width; it sizes the per-warp scratch arrays below.
    const int kWarpSize = 32;

    __shared__ int temp_value_share[kWarpSize];
    __shared__ int temp_idx_share[kWarpSize];

    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int i, temp_value, temp_value1, temp_idx, temp_idx1;
    int warpid = threadIdx.x / kWarpSize, laneid = threadIdx.x % kWarpSize;

    if (tid < array_len)
    {
        temp_value = d_array[tid];
        temp_idx = threadIdx.x;

        // Stage 1: butterfly (xor) exchange inside each warp.
        for (i = kWarpSize / 2; i >= 1; i /= 2)
        {
            temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
            temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
            if (temp_value < temp_value1)
            {
                temp_value = temp_value1;
                temp_idx = temp_idx1;
            }
        }

        // Lane 0 of every warp publishes that warp's candidate.
        if (!laneid)
        {
            temp_value_share[warpid] = temp_value;
            temp_idx_share[warpid] = temp_idx;
        }

        __syncthreads();

        // Stage 2: the first warp combines the per-warp candidates.
        if (threadIdx.x < kWarpSize)
        {
            temp_value = temp_value_share[threadIdx.x];
            temp_idx = temp_idx_share[threadIdx.x];

            for (i = kWarpSize / 2; i >= 1; i /= 2)
            {
                temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
                temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
                if (temp_value < temp_value1)
                {
                    temp_value = temp_value1;
                    temp_idx = temp_idx1;
                }
            }

            // Thread 0 writes this block's result.
            if (!threadIdx.x)
            {
                max_value[blockIdx.x] = temp_value;
                max_idx[blockIdx.x] = temp_idx;
            }
        }
    }
}

用例测试发现，当启动的线程数超过数组长度n（即参数array_len）时，找到的最值可能来自下标超出n的位置。在师兄的提示下，有了第二版：

// Max_Reduce (second attempt): per-block maximum of d_array plus the
// block-local index (threadIdx.x) of a thread that held it.
//
// Change relative to the first attempt: every thread takes part in the
// reduction; threads past the end of the array contribute the lowest int
// instead of skipping the body, so only the global load is bounds-checked.
//
// Launch layout: 1-D grid, 1-D block; only the .x components are used.
// Preconditions: blockDim.x is a multiple of 32 and at most 1024, because the
// shuffles name all 32 lanes and the scratch arrays hold 32 per-warp slots.
// Outputs: max_value[blockIdx.x] and max_idx[blockIdx.x], one entry per block.
// Requires CUDA 9+ for __shfl_xor_sync.
//
// KNOWN LIMITATION (the one the article discusses next): the comparison is a
// strict '<' with no tie handling. When the maximum occurs more than once, the
// reported index is the position of one occurrence, not necessarily the lowest.
__global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)
{
    // Compile-time warp width; it sizes the per-warp scratch arrays below.
    const int kWarpSize = 32;
    // Lowest int, written this way to avoid needing <climits>. It replaces the
    // original -1e30, which does not fit in an int.
    const int kLowest = -2147483647 - 1;

    __shared__ int temp_value_share[kWarpSize];
    __shared__ int temp_idx_share[kWarpSize];

    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int i, temp_value, temp_value1, temp_idx, temp_idx1;
    int warpid = threadIdx.x / kWarpSize, laneid = threadIdx.x % kWarpSize;
    int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;

    // Out-of-range threads keep the sentinel so they can never win.
    temp_value = kLowest;
    temp_idx = threadIdx.x;
    if (tid < array_len)
    {
        temp_value = d_array[tid];
    }

    // Stage 1: butterfly (xor) exchange inside each warp.
    for (i = kWarpSize / 2; i >= 1; i /= 2)
    {
        temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
        temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
        if (temp_value < temp_value1)
        {
            temp_value = temp_value1;
            temp_idx = temp_idx1;
        }
    }

    // Lane 0 of every warp publishes that warp's candidate.
    if (!laneid)
    {
        temp_value_share[warpid] = temp_value;
        temp_idx_share[warpid] = temp_idx;
    }

    // Reached by every thread of the block.
    __syncthreads();

    // Stage 2: the first warp combines the per-warp candidates.
    if (threadIdx.x < kWarpSize)
    {
        if (threadIdx.x < warp_count)
        {
            temp_value = temp_value_share[threadIdx.x];
            temp_idx = temp_idx_share[threadIdx.x];
        }
        else
        {
            // No warp wrote this slot; use the sentinel rather than reading
            // uninitialized shared memory.
            temp_value = kLowest;
        }

        for (i = kWarpSize / 2; i >= 1; i /= 2)
        {
            temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
            temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
            if (temp_value < temp_value1)
            {
                temp_value = temp_value1;
                temp_idx = temp_idx1;
            }
        }

        // Thread 0 writes this block's result.
        if (!threadIdx.x)
        {
            max_value[blockIdx.x] = temp_value;
            max_idx[blockIdx.x] = temp_idx;
        }
    }
}

这个感觉应该没问题了吧，可事实令我发狂：项目中刚开始结果是对的，可后面就错了。为此调试了很久，结果发现最大值是对的，而下标是错的。很显然，数组中存在多个相等的最大值（分别对应不同的下标），这个问题让我吃尽了苦头，于是有了第三版：

// Max_Reduce (third attempt): per-block maximum of d_array plus the lowest
// block-local index (threadIdx.x) at which that maximum occurs.
//
// Change relative to the second attempt: when two candidates carry the same
// value, the one with the smaller index wins, so the result no longer depends
// on the order of the shuffle exchanges.
//
// Launch layout: 1-D grid, 1-D block; only the .x components are used.
// Preconditions: blockDim.x is a multiple of 32 and at most 1024, because the
// shuffles name all 32 lanes and the scratch arrays hold 32 per-warp slots.
// Outputs: max_value[blockIdx.x] and max_idx[blockIdx.x], one entry per block.
//   max_idx is relative to the start of the block; the position in d_array is
//   blockIdx.x * blockDim.x + max_idx[blockIdx.x].
//   A block that lies entirely past array_len reports the lowest int.
// Requires CUDA 9+ for __shfl_xor_sync.
__global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)
{
    // Compile-time warp width; it sizes the per-warp scratch arrays below.
    const int kWarpSize = 32;
    // Lowest int, written this way to avoid needing <climits>. It replaces the
    // original -1e30, which does not fit in an int.
    const int kLowest = -2147483647 - 1;
    // Index given to padding entries so that a real element wins any tie.
    const int kNoIndex = 2147483647;

    __shared__ int temp_value_share[kWarpSize];
    __shared__ int temp_idx_share[kWarpSize];

    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int i, temp_value, temp_value1, temp_idx, temp_idx1;
    int warpid = threadIdx.x / kWarpSize, laneid = threadIdx.x % kWarpSize;
    int warp_count = (blockDim.x + kWarpSize - 1) / kWarpSize;

    // Out-of-range threads keep the sentinel. Within a block they have higher
    // threadIdx.x than every in-range thread, so they also lose ties.
    temp_value = kLowest;
    temp_idx = threadIdx.x;
    if (tid < array_len)
    {
        temp_value = d_array[tid];
    }

    // Stage 1: butterfly (xor) exchange inside each warp.
    for (i = kWarpSize / 2; i >= 1; i /= 2)
    {
        temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
        temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
        // Take the partner's candidate if it is larger, or equal with a
        // smaller index. The original wrote '=' here, which assigned instead
        // of comparing and overwrote the running maximum.
        if (temp_value < temp_value1 ||
            (temp_value == temp_value1 && temp_idx1 < temp_idx))
        {
            temp_value = temp_value1;
            temp_idx = temp_idx1;
        }
    }

    // Lane 0 of every warp publishes that warp's candidate.
    if (!laneid)
    {
        temp_value_share[warpid] = temp_value;
        temp_idx_share[warpid] = temp_idx;
    }

    // Reached by every thread of the block.
    __syncthreads();

    // Stage 2: the first warp combines the per-warp candidates.
    if (threadIdx.x < kWarpSize)
    {
        if (threadIdx.x < warp_count)
        {
            temp_value = temp_value_share[threadIdx.x];
            temp_idx = temp_idx_share[threadIdx.x];
        }
        else
        {
            // No warp wrote this slot; pad it rather than reading
            // uninitialized shared memory.
            temp_value = kLowest;
            temp_idx = kNoIndex;
        }

        for (i = kWarpSize / 2; i >= 1; i /= 2)
        {
            temp_value1 = __shfl_xor_sync(0xffffffffu, temp_value, i, kWarpSize);
            temp_idx1 = __shfl_xor_sync(0xffffffffu, temp_idx, i, kWarpSize);
            if (temp_value < temp_value1 ||
                (temp_value == temp_value1 && temp_idx1 < temp_idx))
            {
                temp_value = temp_value1;
                temp_idx = temp_idx1;
            }
        }

        // Thread 0 writes this block's result.
        if (!threadIdx.x)
        {
            max_value[blockIdx.x] = temp_value;
            max_idx[blockIdx.x] = temp_idx;
        }
    }
}

zhbxlm

关注

1
点赞
踩
8

收藏

觉得还不错? 一键收藏
2
评论
关于CUDA实现最值问题

一转眼一周就过去了，算来入职已经半个月了，项目也进行了十来天，感觉这一周受到最纠结的莫过于寻最值问题了。听起来寻最值应该是个很简单的问题，当时的思路是这样的，并行规约寻最值，并记录下标，于是有了第一版的程序： __global__ void Max_Reduce(int *d_array, int array_len, int *max_value
复制链接

扫一扫

专栏目录