基于 opencv 3.1.0
相关源码
..\sources\modules\cudalegacy\src\image_pyramid.cpp
..\sources\modules\cudalegacy\src\cuda\NCVPyramid.cu
类
cv::cuda::ImagePyramidImpl
金字塔分解,调用层次:
cv::cuda::ImagePyramidImpl::ImagePyramidImpl ->
cv::cuda::device::pyramid::downsampleX2 ->
kernelDownsampleX2_gpu ->
kernelDownsampleX2 (cuda kernel)
// 2x decimation kernel: each output pixel is the average of the 2x2
// neighbourhood at (2*col, 2*row) in the source image.
//
// Grid layout: one thread per DESTINATION pixel; 2D blocks/grid covering
// dstRoi. srcPitch/dstPitch are row strides in BYTES (hence the Ncv8u*
// arithmetic below). Caller guarantees the source holds at least
// 2*dstRoi.height rows of 2*dstRoi.width pixels.
//
// Note: this is plain box averaging, not Gaussian filtering, so results
// differ from the CPU pyramid implementation.
template<typename T>
__global__ void kernelDownsampleX2(T *d_src, //d for device
Ncv32u srcPitch,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
    const Ncv32u row = blockIdx.y * blockDim.y + threadIdx.y;
    const Ncv32u col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard clause: grids rarely divide the ROI evenly.
    if (row >= dstRoi.height || col >= dstRoi.width)
        return;

    // Pitches are in bytes, so step through the rows as Ncv8u* first.
    const T *srcRowEven = (const T *)((const Ncv8u *)d_src + (2 * row)     * srcPitch);
    const T *srcRowOdd  = (const T *)((const Ncv8u *)d_src + (2 * row + 1) * srcPitch);
    T       *dstRow     = (T *)((Ncv8u *)d_dst + row * dstPitch);

    // Average the four source pixels that collapse onto this output pixel.
    dstRow[col] = _average4(srcRowEven[2 * col], srcRowEven[2 * col + 1],
                            srcRowOdd [2 * col], srcRowOdd [2 * col + 1]);
}
获得金字塔第n层图像,调用层次:
cv::cuda::ImagePyramidImpl::getLayer->
cv::cuda::device::pyramid::interpolateFrom1->
kernelInterpolateFrom1_gpu->
kernelInterpolateFrom1 (cuda kernel)
// Bilinear upsampling kernel: each destination pixel (j, i) is mapped back
// to a fractional coordinate in the smaller top layer and reconstructed by
// bilinearly interpolating its 2x2 source neighbourhood, with border
// replication for taps that fall outside szTopRoi.
//
// Grid layout: one thread per destination pixel, 2D blocks/grid covering
// dstRoi. srcTopPitch/dstPitch are row strides in BYTES.
//
// Fix vs. the original: when dstRoi.width == 1 (or height == 1) the original
// computed j / (dstRoi.width - 1) == 0/0, yielding NaN and an undefined
// NaN -> Ncv32u conversion; a degenerate 1-pixel dimension now maps to
// source coordinate 0. The second-row pointer is also clamped so it never
// points past the source ROI.
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
Ncv32u srcTopPitch,
NcvSize32u szTopRoi,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < dstRoi.height && j < dstRoi.width)
    {
        // Map the dst pixel to a fractional source coordinate. Guard the
        // degenerate 1-pixel case where (dstRoi.width - 1) would be zero.
        Ncv32f ptTopX = dstRoi.width  > 1 ? 1.0f * (szTopRoi.width  - 1) * j / (dstRoi.width  - 1) : 0.0f;
        Ncv32f ptTopY = dstRoi.height > 1 ? 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1) : 0.0f;
        Ncv32u xl = (Ncv32u)ptTopX;   // left column of the 2x2 patch
        Ncv32u xh = xl + 1;           // right column (may fall outside the ROI)
        Ncv32f dx = ptTopX - xl;      // horizontal interpolation weight in [0,1)
        Ncv32u yl = (Ncv32u)ptTopY;   // top row
        Ncv32u yh = yl + 1;           // bottom row (may fall outside the ROI)
        Ncv32f dy = ptTopY - yl;      // vertical interpolation weight in [0,1)

        // Clamp the second-row index so the row pointer itself never points
        // past the source; out-of-range taps are replaced by p00 below anyway.
        Ncv32u yhSafe = yh < szTopRoi.height ? yh : yl;
        T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yhSafe * srcTopPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        // 2x2 neighbourhood with border replication (p00 substitutes for
        // any tap outside the source ROI).
        T p00, p01, p10, p11;
        p00 = d_src_line1[xl];
        p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
        p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
        p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;

        // Blend in float precision, then saturate back down to T's range.
        typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
        TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);   // top edge of the patch
        TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);   // bottom edge of the patch
        TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
        T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);  // saturating cast to T
        d_dst_line[j] = outPix;
    }
}
线性插值
_lerp函数,支持多通道数据,下面是单通道的定义:
// Single-channel (CN == 1) specialization of the linear-interpolation helper.
// Blends pixels a and b with weight d: d == 0 yields a, d == 1 yields b.
// The blended value is converted to Tout's scalar base type and repacked
// into a pixel via _pixMake.
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB; // scalar base type of the output pixel
return _pixMake(TB(b.x * d + a.x * (1 - d)));
//! b*d + a*(1-d) is mathematically equivalent to (b-a)*d + a
//! d in [0,1] is the interpolation weight: larger d pulls the result toward b, smaller d toward a
}};
能否利用纹理内存的硬件插值特性加速下采样和上采样处理?
金字塔的cuda实现采用线性插值法,而cpu实现则采用高斯滤波,两者的计算结果是不一样的,前者的实时性更好,精度更差。
相比cpu版本的代码,cuda版本的代码并没有实现拉普拉斯金字塔,也没有金字塔重构。