OpenCV 中的 cvtColor 将图像从一种颜色空间转换为另一种颜色空间。虽然 OpenCV 支持从各种 YUV 格式转换到 BGR,但反向转换到 YUV420 时却只能输出三平面格式(I420/IYUV 与 YV12)。尽管内部有 cvtBGRtoTwoPlaneYUV 函数,但没有对外提供。
Carotene 库并无该功能,所以 cvtColorBGR2ThreePlaneYUV 使用统一向量指令(universal intrinsics)来加速:维护成本低,但效率也不高。当图像像素数不少于 320x240 时可能会启用多线程。
YCbCr 色彩空间是在制定全球数字分量视频标准(在第4章中讨论)的过程中开发的,是 ITU-R BT.601的一部分。YCbCr 是 YUV 颜色空间的缩放和偏移版本。Y 定义为标称8位范围为16–235; Cb 和 Cr 的标称范围为16–240。
COLOR_BGRA2YUV_I420 是按照 RGB 为 full range 的 ITU-R BT.601 标准来转换的。
cvtColor
CV_OCL_RUN 检查输入输出及转换类型,若满足则调用 ocl_cvtColor 并返回。
CV_INSTRUMENT_REGION();
CV_Assert(!_src.empty());
if(dcn <= 0)
dcn = dstChannels(code);
CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() &&
!(CV_MAT_DEPTH(_src.type()) == CV_8U && (code == COLOR_Luv2BGR || code == COLOR_Luv2RGB)),
ocl_cvtColor(_src, _dst, code, dcn) )
根据不同的转换码来调用相应处理函数。
swapBlue 判断是否交换蓝色通道,源或目的为 RGB 顺序的数据需要处理。
switch( code )
{
case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
if(_src.channels() == 1)
cvtColorGray2BGR(_src, _dst, dcn);
else
cvtColorBGR2BGR(_src, _dst, dcn, swapBlue(code));
break;
case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555:
case COLOR_RGB2BGR565: case COLOR_RGB2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
cvtColorBGR25x5(_src, _dst, swapBlue(code), greenBits(code));
break;
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA:
case COLOR_BGR5652RGB: case COLOR_BGR5552RGB: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
cvtColor5x52BGR(_src, _dst, dcn, swapBlue(code), greenBits(code));
break;
case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
cvtColorBGR2Gray(_src, _dst, swapBlue(code));
break;
case COLOR_BGR5652GRAY:
case COLOR_BGR5552GRAY:
cvtColor5x52Gray(_src, _dst, greenBits(code));
break;
case COLOR_GRAY2BGR:
case COLOR_GRAY2BGRA:
cvtColorGray2BGR(_src, _dst, dcn);
break;
case COLOR_GRAY2BGR565:
case COLOR_GRAY2BGR555:
cvtColorGray25x5(_src, _dst, greenBits(code));
break;
case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
case COLOR_BGR2YUV: case COLOR_RGB2YUV:
cvtColorBGR2YUV(_src, _dst, swapBlue(code), code == COLOR_BGR2YCrCb || code == COLOR_RGB2YCrCb);
break;
case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
case COLOR_YUV2BGR: case COLOR_YUV2RGB:
cvtColorYUV2BGR(_src, _dst, dcn, swapBlue(code), code == COLOR_YCrCb2BGR || code == COLOR_YCrCb2RGB);
break;
case COLOR_BGR2XYZ:
case COLOR_RGB2XYZ:
cvtColorBGR2XYZ(_src, _dst, swapBlue(code));
break;
case COLOR_XYZ2BGR:
case COLOR_XYZ2RGB:
cvtColorXYZ2BGR(_src, _dst, dcn, swapBlue(code));
break;
case COLOR_BGR2HSV: case COLOR_BGR2HSV_FULL:
case COLOR_RGB2HSV: case COLOR_RGB2HSV_FULL:
cvtColorBGR2HSV(_src, _dst, swapBlue(code), isFullRangeHSV(code));
break;
case COLOR_BGR2HLS: case COLOR_BGR2HLS_FULL:
case COLOR_RGB2HLS: case COLOR_RGB2HLS_FULL:
cvtColorBGR2HLS(_src, _dst, swapBlue(code), isFullRangeHSV(code));
break;
case COLOR_HSV2BGR: case COLOR_HSV2BGR_FULL:
case COLOR_HSV2RGB: case COLOR_HSV2RGB_FULL:
cvtColorHSV2BGR(_src, _dst, dcn, swapBlue(code), isFullRangeHSV(code));
break;
case COLOR_HLS2BGR: case COLOR_HLS2BGR_FULL:
case COLOR_HLS2RGB: case COLOR_HLS2RGB_FULL:
cvtColorHLS2BGR(_src, _dst, dcn, swapBlue(code), isFullRangeHSV(code));
break;
case COLOR_BGR2Lab: case COLOR_LBGR2Lab:
case COLOR_RGB2Lab: case COLOR_LRGB2Lab:
cvtColorBGR2Lab(_src, _dst, swapBlue(code), is_sRGB(code));
break;
case COLOR_BGR2Luv: case COLOR_LBGR2Luv:
case COLOR_RGB2Luv: case COLOR_LRGB2Luv:
cvtColorBGR2Luv(_src, _dst, swapBlue(code), is_sRGB(code));
break;
case COLOR_Lab2BGR: case COLOR_Lab2LBGR:
case COLOR_Lab2RGB: case COLOR_Lab2LRGB:
cvtColorLab2BGR(_src, _dst, dcn, swapBlue(code), is_sRGB(code));
break;
case COLOR_Luv2BGR: case COLOR_Luv2LBGR:
case COLOR_Luv2RGB: case COLOR_Luv2LRGB:
cvtColorLuv2BGR(_src, _dst, dcn, swapBlue(code), is_sRGB(code));
break;
case COLOR_BayerBG2GRAY: case COLOR_BayerGB2GRAY: case COLOR_BayerRG2GRAY: case COLOR_BayerGR2GRAY:
case COLOR_BayerBG2BGR: case COLOR_BayerGB2BGR: case COLOR_BayerRG2BGR: case COLOR_BayerGR2BGR:
case COLOR_BayerBG2BGR_VNG: case COLOR_BayerGB2BGR_VNG: case COLOR_BayerRG2BGR_VNG: case COLOR_BayerGR2BGR_VNG:
case COLOR_BayerBG2BGR_EA: case COLOR_BayerGB2BGR_EA: case COLOR_BayerRG2BGR_EA: case COLOR_BayerGR2BGR_EA:
case COLOR_BayerBG2BGRA: case COLOR_BayerGB2BGRA: case COLOR_BayerRG2BGRA: case COLOR_BayerGR2BGRA:
{
Mat src;
if (_src.getObj() == _dst.getObj()) // inplace processing (#6653)
_src.copyTo(src);
else
src = _src.getMat();
demosaicing(src, _dst, code, dcn);
break;
}
uIndex 返回 U 在三个通道中的排位。
cvtColorTwoPlaneYUV2BGR
cvtColorThreePlaneYUV2BGR
cvtColorYUV2Gray_420
case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12:
case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
cvtColorTwoPlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
//http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
//http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
cvtColorThreePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2GRAY_420:
cvtColorYUV2Gray_420(_src, _dst);
break;
cvtBGRtoThreePlaneYUV
cvtColorOnePlaneYUV2BGR 调用 cvtOnePlaneYUVtoBGR
case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
cvtColorBGR2ThreePlaneYUV(_src, _dst, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
//http://www.fourcc.org/yuv.php#UYVY
//http://www.fourcc.org/yuv.php#YUY2
//http://www.fourcc.org/yuv.php#YVYU
{
int ycn = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2BGR_UYVY ||
code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
cvtColorOnePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code), ycn);
break;
}
case COLOR_YUV2GRAY_UYVY:
case COLOR_YUV2GRAY_YUY2:
cvtColorYUV2Gray_ch(_src, _dst, code == COLOR_YUV2GRAY_UYVY ? 1 : 0);
break;
case COLOR_RGBA2mRGBA:
cvtColorRGBA2mRGBA(_src, _dst);
break;
case COLOR_mRGBA2RGBA:
cvtColormRGBA2RGBA(_src, _dst);
break;
default:
CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
cvtColorBGR2ThreePlaneYUV
CvtHelper 检查输入输出并做必要准备。
hal::cvtBGRtoThreePlaneYUV 为 hal 层实现。
CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
h.scn, swapb, uidx);
CvtHelper
检查输入输出是否与外部指定的类型相同。
CvtHelper(InputArray _src, OutputArray _dst, int dcn)
{
CV_Assert(!_src.empty());
int stype = _src.type();
scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype);
CV_Check(scn, VScn::contains(scn), "Invalid number of channels in input image");
CV_Check(dcn, VDcn::contains(dcn), "Invalid number of channels in output image");
CV_CheckDepth(depth, VDepth::contains(depth), "Unsupported depth of input image");
如果是原地处理,将输入拷贝到 `src` 中。
if (_src.getObj() == _dst.getObj()) // inplace processing (#6653)
_src.copyTo(src);
else
src = _src.getMat();
如果是转换到 YUV,要求输入的宽高为2的倍数;
如果输入是 YUV,要求宽为2的倍数同时高为3的倍数。
CV_MAKETYPE 根据色深和通道数合成类型。
Size sz = src.size();
switch (sizePolicy)
{
case TO_YUV:
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
dstSz = Size(sz.width, sz.height / 2 * 3);
break;
case FROM_YUV:
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
dstSz = Size(sz.width, sz.height * 2 / 3);
break;
case NONE:
default:
dstSz = sz;
break;
}
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getMat();
}
Mat src, dst;
int depth, scn;
Size dstSz;
hal::cvtBGRtoThreePlaneYUV
CV_INSTRUMENT_REGION
CALL_HAL 调用 cv_hal_cvtBGRtoThreePlaneYUV 函数并检查返回值。hal_ni_cvtBGRtoThreePlaneYUV 未实现。
CV_CPU_DISPATCH 调用 __CV_CPU_DISPATCH_EXPAND, 而 __CV_EXPAND 会执行函数。
调用 color_yuv.simd.hpp 中的函数来执行。
CV_INSTRUMENT_REGION();
CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx),
CV_CPU_DISPATCH_MODES_ALL);
cvtBGRtoThreePlaneYUV
RGB8toYUV420pInvoker 类执行变换。
如果输入的像素数不小于 320x240(即 `width * height >= 320*240`),调用 parallel_for_ 多线程处理。
传入的行数除以2,在内部使用时又乘以2。使得最终高度为偶数。
CV_INSTRUMENT_REGION();
uchar * uv_data = dst_data + dst_step * height;
RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height,
scn, swapBlue, uIdx == 2, false);
if( width * height >= 320*240 )
parallel_for_(Range(0, height/2), cvt);
else
cvt(Range(0, height/2));
RGB8toYUV420pInvoker
ParallelLoopBody 是并行数据处理器的基类,重载括号运算符。
Y 和 UV 分开表示。
`_swapUV` 和 `_interleave` 两个参数使得 RGB8toYUV420pInvoker 可以输出 YUV420 的4种格式(I420、YV12、NV12、NV21)。
// Constructs the BGR/RGB -> YUV420 row converter; every argument is stored
// unchanged in the member of the same name (without the leading underscore).
//   _srcData / _srcStep : source pixel buffer and its row stride in bytes.
//   _yData              : start of the destination Y plane.
//   _uvData             : start of the destination chroma area.
//   _dstStep            : row stride used for both the Y and chroma rows.
//   _srcWidth/_srcHeight: source image size in pixels.
//   _scn                : source channel count; operator() handles 4 and
//                         treats anything else as 3.
//   _swapBlue           : when true, operator() swaps the B and R values
//                         read from each pixel before conversion.
//   _swapUV             : when true, operator() swaps the computed U and V
//                         values before storing them.
//   _interleave         : when true, U and V are written interleaved into a
//                         single chroma row; otherwise they are written to
//                         separate U and V rows.
RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep,
uchar * _yData, uchar * _uvData, size_t _dstStep,
int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave)
: srcData(_srcData), srcStep(_srcStep),
yData(_yData), uvData(_uvData), dstStep(_dstStep),
srcWidth(_srcWidth), srcHeight(_srcHeight),
srcCn(_scn), swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { }
逐行处理。遇到偶数行时 UV 指针换行。
void operator()(const Range& rowRange) const CV_OVERRIDE
{
const int w = srcWidth;
const int h = srcHeight;
const int scn = srcCn;
const uchar* srcRow = (uchar*)0;
uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0;
for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++)
{
srcRow = srcData + srcStep*sRow;
yRow = yData + dstStep * sRow;
bool evenRow = (sRow % 2) == 0;
if(evenRow)
{
if (interleave)
{
uvRow = uvData + dstStep*(sRow/2);
}
else
{
uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2);
vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2);
}
}
`vsize` 等于 `v_uint8::nlanes`(当 `v_uint8` 为 `v_uint8x16` 时即为 16),每次循环处理 `2*vsize` 个像素。
根据 SIMD 宽度来定义 v_uint8 的实际类型,可能是 v_uint8x16。
v_load_deinterleave 从存储器解交织加载数据并存储到3个寄存器。OPENCV_HAL_IMPL_NEON_INTERLEAVED 宏定义了 arm 下的实现。
rgbToY42x 按照 ITUR_BT_601 标准转换。
v_store 将寄存器向量中的值保存到数组。
v_store_interleave 将3个寄存器中的数据交织并存储到内存中。
偶数行调用 rgbToUV42x 得到 U 和 V 的值。
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
for( ; i <= w/2 - vsize;
i += vsize)
{
// processing (2*vsize) pixels at once
v_uint8 b0, b1, g0, g1, r0, r1, a0, a1;
if(scn == 4)
{
v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0);
v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1);
}
else // scn == 3
{
v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0);
v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1);
}
if(swapBlue)
{
swap(b0, r0); swap(b1, r1);
}
v_uint8 y0, y1;
y0 = rgbToY42x(r0, g0, b0);
y1 = rgbToY42x(r1, g1, b1);
v_store(yRow + 2*i + 0*vsize, y0);
v_store(yRow + 2*i + 1*vsize, y1);
if(evenRow)
{
v_uint8 u, v;
rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v);
if(swapUV)
{
swap(u, v);
}
if(interleave)
{
v_store_interleave(uvRow + 2*i, u, v);
}
else
{
v_store(uRow + i, u);
v_store(vRow + i, v);
}
}
}
vx_cleanup();
#endif
处理行尾未对齐的数据,每次转换一对像素。
此处的 rgbToY42x 接受 `uchar` 输入,与上面接受向量寄存器的版本不同。
// processing two pixels at once
for( ; i < w/2; i++)
{
uchar b0, g0, r0;
uchar b1, g1, r1;
b0 = srcRow[(2*i+0)*scn + 0];
g0 = srcRow[(2*i+0)*scn + 1];
r0 = srcRow[(2*i+0)*scn + 2];
b1 = srcRow[(2*i+1)*scn + 0];
g1 = srcRow[(2*i+1)*scn + 1];
r1 = srcRow[(2*i+1)*scn + 2];
if(swapBlue)
{
swap(b0, r0); swap(b1, r1);
}
uchar y0 = rgbToY42x(r0, g0, b0);
uchar y1 = rgbToY42x(r1, g1, b1);
yRow[2*i+0] = y0;
yRow[2*i+1] = y1;
if(evenRow)
{
uchar uu, vv;
rgbToUV42x(r0, g0, b0, uu, vv);
if(swapUV)
{
swap(uu, vv);
}
if(interleave)
{
uvRow[2*i+0] = uu;
uvRow[2*i+1] = vv;
}
else
{
uRow[i] = uu;
vRow[i] = vv;
}
}
}
}
}
类成员变量。
const uchar * srcData;
size_t srcStep;
uchar *yData, *uvData;
size_t dstStep;
int srcWidth;
int srcHeight;
const int srcCn;
bool swapBlue;
bool swapUV;
bool interleave;
rgbToY42x
$$
\begin{aligned}
Y_{601} &= 0.257R' + 0.504G' + 0.098B' + 16 \\
C_b &= -0.148R' - 0.291G' + 0.439B' + 128 \\
C_r &= 0.439R' - 0.368G' - 0.071B' + 128
\end{aligned}
$$
$$
\begin{aligned}
R' &= 1.164(Y_{601} - 16) + 1.596(C_r - 128) \\
G' &= 1.164(Y_{601} - 16) - 0.813(C_r - 128) - 0.391(C_b - 128) \\
B' &= 1.164(Y_{601} - 16) + 2.018(C_b - 128)
\end{aligned}
$$
ITUR_BT_601_SHIFT 为20。ITUR_BT_601_CRY、ITUR_BT_601_CGY 和 ITUR_BT_601_CBY 为左移放大后的系数。
v_expand 将寄存器的内容复制到两个2倍宽包装类型的寄存器中。
CV_INTRIN_DEFINE_WIDE_INTRIN 定义特定类型的数据操作。CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES 定义所有数据类型的操作。
intrin.hpp 会根据宏符号选择包含相应的平台头文件。
对于 arm 而言,`vx_setall_u32` 会调用 v_setall_u32,后者由 OPENCV_HAL_IMPL_NEON_INIT 定义。
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
v_uint16 r0, r1, g0, g1, b0, b1;
v_expand(r, r0, r1);
v_expand(g, g0, g1);
v_expand(b, b0, b1);
v_uint32 rq[4], gq[4], bq[4];
v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]);
v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]);
v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]);
v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY);
v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16);
v_uint32 y[4];
for(int k = 0; k < 4; k++)
{
y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT;
}
v_pack 由 OPENCV_HAL_IMPL_NEON_PACK 宏定义。
将值从两个向量压缩到一个。
v_uint16 y0, y1;
y0 = v_pack(y[0], y[1]);
y1 = v_pack(y[2], y[3]);
return v_pack(y0, y1);
rgbToUV42x
OPENCV_HAL_IMPL_C_REINTERPRET 通过 v_reg::reinterpret_as 函数实现类型重解释(reinterpret,不改变寄存器中的比特位)。
将输入颜色值转为 int16,丢弃了奇数索引的像素值。
然后将值扩展为 int32。
// [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
v_int16 vlowByte = vx_setall_s16(0x00ff);
v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
rd0 = v_reinterpret_as_s16(r0) & vlowByte;
rd1 = v_reinterpret_as_s16(r1) & vlowByte;
gd0 = v_reinterpret_as_s16(g0) & vlowByte;
gd1 = v_reinterpret_as_s16(g1) & vlowByte;
bd0 = v_reinterpret_as_s16(b0) & vlowByte;
bd1 = v_reinterpret_as_s16(b1) & vlowByte;
v_int32 rq[4], gq[4], bq[4];
v_expand(rd0, rq[0], rq[1]);
v_expand(rd1, rq[2], rq[3]);
v_expand(gd0, gq[0], gq[1]);
v_expand(gd1, gq[2], gq[3]);
v_expand(bd0, bq[0], bq[1]);
v_expand(bd1, bq[2], bq[3]);
加上 `halfShift` 实现四舍五入。
const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
const int shifted128 = (128 << ITUR_BT_601_SHIFT);
v_int32 shift = vx_setall_s32(halfShift + shifted128);
v_int32 ru, gu, bu, gv, bv;
ru = vx_setall_s32(ITUR_BT_601_CRU);
gu = vx_setall_s32(ITUR_BT_601_CGU);
gv = vx_setall_s32(ITUR_BT_601_CGV);
bu = vx_setall_s32(ITUR_BT_601_CBU);
bv = vx_setall_s32(ITUR_BT_601_CBV);
v_int32 uq[4], vq[4];
for(int k = 0; k < 4; k++)
{
uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
}
v_int16 u0, u1, v0, v1;
u0 = v_pack(uq[0], uq[1]);
u1 = v_pack(uq[2], uq[3]);
v0 = v_pack(vq[0], vq[1]);
v1 = v_pack(vq[2], vq[3]);
u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
parallel_for_
调用重载函数。
ParallelLoopBodyLambdaWrapper 将 lambda 表达式封装成对象。
parallel_for_(range, ParallelLoopBodyLambdaWrapper(functor), nstripes);
parallel_for_
CV_INSTRUMENT_REGION_MT_FORK 调用 CV_INSTRUMENT_REGION_META
使用静态变量检查操作有无嵌套。
#ifdef OPENCV_TRACE
CV__TRACE_OPENCV_FUNCTION_NAME_("parallel_for", 0);
CV_TRACE_ARG_VALUE(range_start, "range.start", (int64)range.start);
CV_TRACE_ARG_VALUE(range_end, "range.end", (int64)range.end);
CV_TRACE_ARG_VALUE(nstripes, "nstripes", (int64)nstripes);
#endif
CV_INSTRUMENT_REGION_MT_FORK();
if (range.empty())
return;
static std::atomic<bool> flagNestedParallelFor(false);
bool isNotNestedRegion = !flagNestedParallelFor.load();
if (isNotNestedRegion)
isNotNestedRegion = !flagNestedParallelFor.exchange(true);
没有嵌套的话调用 parallel_for_impl,否则直接运行。
if (isNotNestedRegion)
{
try
{
parallel_for_impl(range, body, nstripes);
flagNestedParallelFor = false;
}
catch (...)
{
flagNestedParallelFor = false;
throw;
}
}
else // nested parallel_for_() calls are not parallelized
{
CV_UNUSED(nstripes);
body(range);
}
parallel_for_impl
如果线程数不为1则调用各种并行技术。
ParallelLoopBodyWrapperContext 为上下文环境,不允许拷贝。
ProxyLoopBody 在不同环境下的实现不同。
ParallelLoopBodyWrapper::stripeRange 返回 ParallelLoopBodyWrapperContext 中的细条数。
如果 `stripeRange` 中仅一个元素则直接执行。
using namespace cv::parallel;
if ((numThreads < 0 || numThreads > 1) && range.end - range.start > 1)
{
ParallelLoopBodyWrapperContext ctx(body, range, nstripes);
ProxyLoopBody pbody(ctx);
cv::Range stripeRange = pbody.stripeRange();
if( stripeRange.end - stripeRange.start == 1 )
{
body(range);
return;
}
getCurrentParallelForAPI 获取 ParallelForAPI,由 setParallelForBackend 进行设置。
具体实现有 cv::parallel::tbb::ParallelForBackend 和 cv::parallel::openmp::ParallelForBackend 两种。
`api` 不为空意味着已显式进行了设置。
parallel_for_cb 执行传入的函数。
std::shared_ptr<ParallelForAPI>& api = getCurrentParallelForAPI();
if (api)
{
CV_CheckEQ(stripeRange.start, 0, "");
api->parallel_for(stripeRange.end, parallel_for_cb, (void*)&pbody);
ctx.finalize(); // propagate exceptions if exists
return;
}
CV_PARALLEL_FRAMEWORK 的值根据编译选项设置。
#ifdef CV_PARALLEL_FRAMEWORK
#if defined HAVE_TBB
#if TBB_INTERFACE_VERSION >= 8000
tbbArena.execute(pbody);
#else
pbody();
#endif
#elif defined HAVE_HPX
pbody();
#elif defined HAVE_OPENMP
#pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
for (int i = stripeRange.start; i < stripeRange.end; ++i)
pbody(Range(i, i + 1));
#elif defined HAVE_GCD
dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
#elif defined WINRT
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
#elif defined HAVE_CONCURRENCY
if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
{
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
}
else
{
pplScheduler->Attach();
Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
Concurrency::CurrentScheduler::Detach();
}
#elif defined HAVE_PTHREADS_PF
parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());
#else
#error You have hacked and compiling with unsupported parallel framework
#endif
ctx.finalize(); // propagate exceptions if exists
return;
#endif // CV_PARALLEL_FRAMEWORK
}
对应线程数或任务数为1,直接执行。
body(range);
参考资料:
- C语言 ## VA_ARGS 宏
- Xilinx OpenCV User Guide
- ermig1979/Simd
- 虹软人脸识别SDK在网络摄像头中的实际应用
- OpenCV并行加速Parallel_for_与ParallelLoopBody教程
- OpenCV源码解析:直方图均衡化的详细算法和过程
- BT.601-7 (03/2011)
- YCbCr
- YUV to RGB Conversion
- Color Space Converter: R’G’B’ to Y’CbCr
- EXT_YUV_target
- Colorspaces and HDMI(© 2014 Cisco and/or its affiliates)
- 图像颜色空间转换-CSC
- ImageCodingResearchTools/YUV/rgb2yuv.m
- Color Spaces
- Chapter 3 颜色空间(2)——YUV、YCbCr
- 聊聊OpenCV的SIMD机制
- 使用OpenCV中的universal intrinsics为算法提速