浅析cv::cuda::threshold函数的实现

最新推荐文章于 2024-05-05 23:03:33 发布

Charming Sun

最新推荐文章于 2024-05-05 23:03:33 发布

阅读量4k

点赞数

分类专栏： CUDA OpenCV 文章标签： CUDA

本文链接：https://blog.csdn.net/CharmingSun/article/details/124204456

版权

OpenCV 同时被 2 个专栏收录

3 篇文章 0 订阅

订阅专栏

CUDA

2 篇文章 0 订阅

订阅专栏

我觉得通过阅读优秀开源项目的代码来学习库的使用是一个不错的学习方法，比如通过阅读OpenCV和TensorFlow的源码来学习如何使用CUDA和cuDNN，还可以顺便学习一些编程技巧。所以寻找一个合适的切入点就显得尤为重要，本文以分析cv::cuda::threshold函数的实现为切入点，来初步了解一下如何使用CUDA来加速程序的运行。以下是两份值得一读的CUDA官方文档：

CUDA C++ Programming Guide

CUDA C++ Best Practices Guide

一、使用案例

这是一个对图像进行二值化处理的程序。如果想要编译运行它，请确保已编译、配置好带CUDA加速功能的OpenCV库，可以参考我之前写的文章《编译支持CUDA和cuDNN的OpenCV》。

#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
	try
	{
		cv::Mat src_host = cv::imread("file.png", cv::IMREAD_GRAYSCALE);
		std::cout << "src_host.depth() == " << src_host.depth() << std::endl;
		cv::cuda::GpuMat dst, src;
		src.upload(src_host);

		cv::cuda::threshold(src, dst, 128.0, 255.0, cv::THRESH_BINARY);

		cv::Mat result_host;
		dst.download(result_host);

		cv::imshow("Result", result_host);
		cv::waitKey();
	}
	catch (const cv::Exception& ex)
	{
		std::cout << "Error: " << ex.what() << std::endl;
	}
	return 0;
}

二、函数原型

cv::cuda::threshold函数在opencv_contrib-4.5.5\modules\cudaarithm\include\opencv2\cudaarithm.hpp文件中声明：

/** @brief Applies a fixed-level threshold to each array element.

@param src Source array (single-channel).
@param dst Destination array with the same size and type as src .
@param thresh Threshold value.
@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types.
@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE
threshold types are not supported.
@param stream Stream for the asynchronous version.

@return The threshold value that was actually used. The definition floors thresh (cvFloor) for
sources whose depth is neither CV_32F nor CV_64F before applying it, and returns that value.

@sa threshold
 */
CV_EXPORTS_W double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());

在opencv_contrib-4.5.5\modules\cudaarithm\src\cuda\threshold.cu文件中定义：

/**
 * Applies a fixed-level threshold to every element of a GPU matrix.
 *
 * @param _src   Input array; obtained as a GpuMat through getInputMat.
 * @param _dst   Output array; allocated with the size and type of the source.
 * @param thresh Threshold value. For depths other than CV_32F / CV_64F it is floored first.
 * @param maxVal Value used by the binary modes. For depths other than CV_32F / CV_64F it is rounded first.
 * @param type   Threshold mode in the range 0..4 (4 is THRESH_TOZERO_INV).
 * @param stream Stream on which the work is issued.
 * @return The threshold value that was used (after the optional flooring).
 */
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
{
    GpuMat src = getInputMat(_src, stream);

    const int depth = src.depth();

    // depth indexes the seven-entry dispatch table below, so it must not exceed CV_64F.
    CV_Assert( depth <= CV_64F );
    // Only modes 0..4 are implemented. The lower bound matters as well: a negative
    // value would pass an upper-bound-only check, match no case in thresholdImpl
    // and silently leave dst unwritten.
    CV_Assert( type >= 0 && type <= 4 /*THRESH_TOZERO_INV*/ );

    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
    // View both matrices as single-channel so the per-element code below applies.
    src = src.reshape(1);
    dst = dst.reshape(1);

    if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/)
    {
        // 32-bit float truncation is delegated to NPP instead of the generic kernel.
        NppStreamHandler h(StreamAccessor::getStream(stream));

        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;

        nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );

        // NOTE(review): `!stream` presumably means the default Stream::Null() was passed;
        // in that case wait for the device before returning.
        if (!stream)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
    else
    {
        // One thresholdImpl instantiation per depth; the table is indexed by the depth code.
        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream);
        static const func_t funcs[] =
        {
            thresholdImpl<uchar>,
            thresholdImpl<schar>,
            thresholdImpl<ushort>,
            thresholdImpl<short>,
            thresholdImpl<int>,
            thresholdImpl<float>,
            thresholdImpl<double>
        };

        // Integer depths work with integral parameters: floor the threshold, round the maximum.
        if (depth != CV_32F && depth != CV_64F)
        {
            thresh = cvFloor(thresh);
            maxVal = cvRound(maxVal);
        }

        funcs[depth](src, dst, thresh, maxVal, type, stream);
    }

    syncOutput(dst, _dst, stream);

    return thresh;
}

三、调用流程

cv::cuda::threshold函数调用了同样在opencv_contrib-4.5.5\modules\cudaarithm\src\cuda\threshold.cu文件中定义的thresholdImpl函数：

namespace
{
    // Policy type handed to gridTransformUnary_; every element type except double
    // uses DefaultTransformPolicy as is.
    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
    {
    };

    // For double the policy's `shift` constant is set to 1.
    // NOTE(review): how `shift` is consumed is not visible here — confirm in cudev.
    template <> struct TransformPolicy<double> : DefaultTransformPolicy
    {
        enum { shift = 1 };
    };

    // Runs the thresholding functor selected by `type` over `src`, writing to `dst`.
    // thresh and maxVal are converted to the element type T before use.
    // A `type` outside 0..4 performs no work.
    template <typename T>
    void thresholdImpl(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& stream)
    {
        typedef TransformPolicy<T> Policy;

        const T level = static_cast<T>(thresh);
        const T upper = static_cast<T>(maxVal);

        if (type == 0)
        {
            // binary
            gridTransformUnary_<Policy>(globPtr<T>(src), globPtr<T>(dst), thresh_binary_func(level, upper), stream);
        }
        else if (type == 1)
        {
            // inverted binary
            gridTransformUnary_<Policy>(globPtr<T>(src), globPtr<T>(dst), thresh_binary_inv_func(level, upper), stream);
        }
        else if (type == 2)
        {
            // truncate
            gridTransformUnary_<Policy>(globPtr<T>(src), globPtr<T>(dst), thresh_trunc_func(level), stream);
        }
        else if (type == 3)
        {
            // to zero
            gridTransformUnary_<Policy>(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_func(level), stream);
        }
        else if (type == 4)
        {
            // inverted to zero
            gridTransformUnary_<Policy>(globPtr<T>(src), globPtr<T>(dst), thresh_to_zero_inv_func(level), stream);
        }
    }
}

thresholdImpl函数调用了定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\grid\transform.hpp文件中的gridTransformUnary_函数：

// Host-side entry point: applies the unary functor `op` to `src` and stores the result in `dst`.
//
// Reads the row and column counts from `src`, asserts that `dst` has the same
// dimensions, then forwards to grid_transform_detail::transform_unary with a
// WithOutMask() mask and the stream obtained from StreamAccessor::getStream(stream).
// `Policy` is passed through unchanged as the template argument of transform_unary.
template <class Policy, class SrcPtr, typename DstType, class UnOp>
__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    // Source and destination must describe the same grid of elements.
    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );

    // NOTE(review): shrinkPtr presumably strips the size fields (GlobPtrSz -> GlobPtr),
    // which is why rows/cols are passed separately — confirm against cudev.
    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
}

gridTransformUnary_函数调用了定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\grid\detail\transform.hpp文件中的transform_unary函数：

// Forwards all arguments unchanged to TransformDispatcher<false, Policy>::call.
// NOTE(review): the meaning of the `false` selector is not visible in this excerpt;
// presumably it picks the simple (non-vectorized) code path — confirm against the primary template.
template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
    TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
}

transform_unary函数调用了同样定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\grid\detail\transform.hpp文件中TransformDispatcher结构体内的call函数：

template <class Policy> struct TransformDispatcher<false, Policy>
{
    // Launches transformSimple over a rows x cols domain on `stream`.
    // The thread-block shape comes from Policy::block_size_x / block_size_y, and the
    // grid is sized with divUp so that every column and row is covered.
    // After the launch the last CUDA error is checked; when `stream` is 0 the call
    // additionally waits for the device to finish before returning.
    template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
    __host__ static void call(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 threads(Policy::block_size_x, Policy::block_size_y);
        const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));

        transformSimple<<<blocks, threads, 0, stream>>>(src, dst, op, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        const bool noExplicitStream = (stream == 0);
        if (noExplicitStream)
        {
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
        }
    }
...
};

call函数调用了同样定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\grid\detail\transform.hpp文件中的transformSimple函数：

// Kernel in which each thread handles at most one element: it derives its
// (row, col) position from the block and thread indices, applies `op` to the
// source element and stores the saturate_cast result in `dst`.
// Threads positioned outside rows x cols, or on an element rejected by `mask`,
// write nothing.
template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__global__ void transformSimple(const SrcPtr src, GlobPtr<DstType> dst, const UnOp op, const MaskPtr mask, const int rows, const int cols)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The bounds are tested before the mask, so the mask is never queried out of range.
    const bool inBounds = col < cols && row < rows;
    if (inBounds && mask(row, col))
    {
        dst(row, col) = saturate_cast<DstType>(op(src(row, col)));
    }
}

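transformSimple函数通过op(src(y, x))调用了作为参数传进来的函数对象op的括号操作符函数（operator()）。对于THRESH_BINARY，op的类型是定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\functional\functional.hpp文件中的ThreshBinaryFunc结构体，像素的二值化操作就在它的括号操作符函数内完成，即saturate_cast<T>(src > thresh) * maxVal：src大于thresh时结果为maxVal，否则为0：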

// Unary functor implementing binary thresholding for element type T.
// It is filled in by thresh_binary_func and invoked once per element on the device.
template <typename T> struct ThreshBinaryFunc : unary_function<T, T>
{
    // Elements strictly greater than this value are considered "above threshold".
    T thresh;
    // Value produced for elements above the threshold.
    T maxVal;

    // Converts the comparison result (src > thresh) to T with saturate_cast and
    // multiplies it by maxVal, so an element above the threshold maps to maxVal
    // and any other element to zero times maxVal.
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::parameter_type src) const
    {
        return saturate_cast<T>(src > thresh) * maxVal;
    }
};

// Factory for ThreshBinaryFunc<T>: returns a functor whose thresh and maxVal
// members are set from the arguments. T is deduced from the call.
template <typename T>
__host__ __device__ ThreshBinaryFunc<T> thresh_binary_func(T thresh, T maxVal)
{
    ThreshBinaryFunc<T> functor;
    functor.maxVal = maxVal;
    functor.thresh = thresh;
    return functor;
}

transformSimple函数还调用了定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\ptr2d\glob.hpp文件中GlobPtr结构体内的括号操作符函数（src(y, x)和dst(y, x)）：

/** @brief Structure similar to cv::cudev::GlobPtrSz but containing only a pointer and row step.

Width and height fields are excluded due to performance reasons. The structure is intended
for internal use or for users who write device code.

Element (y, x) is reached by advancing `data` by y * step bytes and then indexing
x elements of type T into that row.
 */
template <typename T> struct GlobPtr
{
    typedef T   value_type;
    typedef int index_type;

    //! pointer to the first element of the first row
    T* data;

    //! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
    size_t step;

    // Pointer to the first element of row y. The base pointer is viewed as bytes
    // so that the byte stride can be added directly before casting back to T.
    __device__ __forceinline__       T* row(int y)       { return (      T*)( (      uchar*)data + y * step); }
    __device__ __forceinline__ const T* row(int y) const { return (const T*)( (const uchar*)data + y * step); }

    // Reference to the element at row y, column x. No bounds checking is performed here.
    __device__ __forceinline__       T& operator ()(int y, int x)       { return row(y)[x]; }
    __device__ __forceinline__ const T& operator ()(int y, int x) const { return row(y)[x]; }
};

transformSimple函数及其作为函数参数传进来的op结构体内的括号操作符函数都用到了定义于opencv_contrib-4.5.5\modules\cudev\include\opencv2\cudev\util\saturate_cast.hpp文件中的saturate_cast函数（该文件为不同的参数类型提供了多个重载，这里只列出参数类型为uchar的一个）：

// saturate_cast overload for a uchar argument: converts v to the requested type T
// with a plain T(v) conversion.
template <typename T> __device__ __forceinline__ T saturate_cast(uchar v) { return T(v); }

Charming Sun

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
浅析cv::cuda::threshold函数的实现

我觉得通过阅读优秀开源项目的代码来学习库的使用是一个不错的学习方法，比如通过阅读OpenCV和TensorFlow的源码来学习如何使用CUDA和cuDNN，还可以顺便学习一些编程技巧。所以寻找一个合适的切入点就显得尤为重要，本文以分析cv::cuda::threshold函数的实现为切入点，来初步了解一下如何使用CUDA来加速程序的运行。以下是两份值得一读的CUDA官方文档：CUDA C++ Programming GuideCUDA C++ Best Practices Guide一、使用案例这是
复制链接

扫一扫

专栏目录