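RGB转GRAY公式如下:
$$\mathrm{Gray} = 0.299R + 0.587G + 0.114B \approx (77R + 151G + 28B) \gg 8$$
(下文代码使用整数权值 77、151、28,三者之和为 256,因此最后右移 8 位即除以 256。)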
本实验对一张 1920*1080 分辨率的 RGB 彩色图进行灰度转换,并测量各种转换方式的耗时。
测试条件为 嵌入式开发板ssc9381g A7
通过四种转换方式进行耗时对比
结果如下:
方式1
通过opencv 库函数cvtColor转换耗时为48.325ms
方式2
通过C语言代码转换耗时为 151.006ms
方式3
通过neon Intrinsics函数优化转换算法耗时 40.358ms
方式4
arm neon asm 汇编优化转换算法耗时 1.385ms
(注意:该数值不能与前三种方式直接比较——测量时调用方传入的像素数已经除以 8,而汇编函数内部又右移 3 位再除以 8,实际只转换了约 1/64 的像素。)
以下是参考代码和详细讲解,供大家参考下:
// Optimized with NEON intrinsics.
// __restrict applies to pointers only: it promises that the pointed-to data is
// accessed solely through this pointer, so dest and src must not overlap.
//
// Converts n packed 3-byte pixels from src into n 8-bit gray values in dest,
// using gray = (c0 * 77 + c1 * 151 + c2 * 28) >> 8, where c0/c1/c2 are the
// first/second/third byte of each pixel (c0 is weighted as red).
//
//   dest  output buffer, at least n bytes
//   src   input buffer, at least 3 * n bytes
//   n     number of pixels; values <= 0 convert nothing
//
// Full groups of 8 pixels go through NEON; the remaining n % 8 pixels are
// converted with the same formula in scalar code so no pixel is skipped.
__attribute__((target("fpu=neon"))) void neon_convert(uint8_t *__restrict dest, uint8_t *__restrict src, int n)
{
    // Broadcast each scalar weight into all 8 lanes of a 64-bit vector.
    const uint8x8_t rfac = vdup_n_u8(77);  // weight for channel 0 (R)
    const uint8x8_t gfac = vdup_n_u8(151); // weight for channel 1 (G)
    const uint8x8_t bfac = vdup_n_u8(28);  // weight for channel 2 (B)

    const int blocks = n / 8;
    for (int i = 0; i < blocks; i++)
    {
        // De-interleaving load: 24 bytes become three 8-lane vectors, one per channel.
        const uint8x8x3_t rgb = vld3_u8(src);

        // Widening multiply-accumulate into 16-bit lanes. The weights sum to 256,
        // so the largest possible value is 255 * 256 = 65280, which fits in 16 bits.
        uint16x8_t temp = vmull_u8(rgb.val[0], rfac);
        temp = vmlal_u8(temp, rgb.val[1], gfac);
        temp = vmlal_u8(temp, rgb.val[2], bfac);

        // Shift right by 8 (divide by 256) and narrow each lane back to 8 bits.
        vst1_u8(dest, vshrn_n_u16(temp, 8));

        src += 8 * 3;
        dest += 8;
    }

    // Scalar tail for the pixels that do not fill a whole group of 8.
    for (int i = blocks * 8; i < n; i++)
    {
        const int y = src[0] * 77 + src[1] * 151 + src[2] * 28;
        *dest++ = static_cast<uint8_t>(y >> 8);
        src += 3;
    }
}
// NEON inline-assembly version.
//
// Converts numPixels packed 3-byte pixels from src into numPixels 8-bit gray
// values in dest, using gray = (c0 * 28 + c1 * 151 + c2 * 77) >> 8.
// Note the channel order: the first byte of each pixel is weighted as blue and
// the third as red (B,G,R byte order).
//
//   dest       output buffer, at least numPixels bytes
//   src        input buffer, at least 3 * numPixels bytes
//   numPixels  number of pixels (not groups of 8); values <= 0 convert nothing
//
// Full groups of 8 pixels are handled in assembly; the remaining
// numPixels % 8 pixels are converted with the same formula in scalar code.
__attribute__((target("fpu=neon")))
static void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    // Computed in C rather than with "lsr" so that a negative or < 8 count
    // yields zero iterations instead of a wrapped-around loop counter.
    int blocks = numPixels / 8;

    if (blocks > 0)
    {
        asm volatile(
            // Build the three constant weight vectors.
            "mov        r4, #28             \n" // weight for channel 0 (blue)
            "mov        r5, #151            \n" // weight for channel 1 (green)
            "mov        r6, #77             \n" // weight for channel 2 (red)
            "vdup.8     d4, r4              \n"
            "vdup.8     d5, r5              \n"
            "vdup.8     d6, r6              \n"
            // Numeric local label: stays unique even if this asm is emitted more than once.
            "1:                             \n"
            // Load 8 three-byte pixels, de-interleaved into d0/d1/d2.
            "vld3.8     {d0-d2}, [%1]!      \n"
            // Weighted sum in 16-bit lanes (maximum 255 * 256 fits in 16 bits).
            "vmull.u8   q7, d0, d4          \n"
            "vmlal.u8   q7, d1, d5          \n"
            "vmlal.u8   q7, d2, d6          \n"
            // Divide q7 by 256, narrow to 8 bits into d7, and store 8 gray pixels.
            "vshrn.u16  d7, q7, #8          \n"
            "vst1.8     {d7}, [%0]!         \n"
            "subs       %2, %2, #1          \n" // decrement the block counter
            "bne        1b                  \n" // repeat until the counter reaches zero
            // All three operands are modified by the code, so they are read-write.
            : "+r"(dest), "+r"(src), "+r"(blocks)
            :
            // q7 aliases d14/d15; "cc" covers the flags set by subs, and "memory"
            // covers the loads from src and the stores to dest.
            : "r4", "r5", "r6",
              "d0", "d1", "d2", "d4", "d5", "d6", "d7", "d14", "d15",
              "cc", "memory");
    }

    // dest and src were advanced by the assembly; finish the leftover pixels.
    for (int rest = numPixels % 8; rest > 0; --rest)
    {
        const int y = src[0] * 28 + src[1] * 151 + src[2] * 77;
        *dest++ = static_cast<uint8_t>(y >> 8);
        src += 3;
    }
}
// Plain C/C++ reference conversion.
//
// Writes one gray byte to dest for each of the n packed 3-byte pixels in src:
// gray = (c0 * 77 + c1 * 151 + c2 * 28) >> 8, where c0, c1 and c2 are the
// pixel's bytes in memory order (red, green, blue weights respectively).
void reference_convert(uint8_t *__restrict dest, uint8_t *__restrict src, int n)
{
    for (int px = 0; px < n; ++px, src += 3)
    {
        const int red = src[0];
        const int green = src[1];
        const int blue = src[2];

        // The weights sum to 256, so shifting right by 8 removes the scale factor.
        const int weighted = red * 77 + green * 151 + blue * 28;
        dest[px] = (weighted >> 8);
    }
}
// Returns the time elapsed between two gettimeofday() samples, in microseconds.
static double elapsed_us(const struct timeval &start, const struct timeval &end)
{
    return (end.tv_sec - start.tv_sec) * 1000000.0 + (end.tv_usec - start.tv_usec);
}

// Benchmark driver: loads "srcimg.jpg", converts it to grayscale four ways
// (cv::cvtColor, plain C, NEON intrinsics, NEON assembly) and prints the time
// each one took in microseconds. Returns 0 on success, 1 if the image cannot
// be used.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    cv::Mat SrcImg = cv::imread("srcimg.jpg", 1);
    // The hand-written converters read 3 bytes per pixel from one contiguous
    // buffer, so refuse anything else (including a failed load, which would
    // leave SrcImg.data null).
    if (SrcImg.empty() || SrcImg.channels() != 3 || !SrcImg.isContinuous())
    {
        fprintf(stderr, "failed to load srcimg.jpg as a continuous 3-channel image\n");
        return 1;
    }

    const int row = SrcImg.rows;
    const int col = SrcImg.cols;
    const int pixels = row * col;

    cv::Mat GrayImg(row, col, CV_8UC1, cv::Scalar(0, 0, 0));
    cv::Mat GrayImgTmp;
    struct timeval tv_start;
    struct timeval tv_end;
    double time_use;

    // NOTE: cv::imread stores color pixels in B,G,R byte order. reference_convert
    // and neon_convert weight the first byte as red, so their output differs
    // slightly from cvtColor(COLOR_BGR2GRAY); neon_asm_convert weights the first
    // byte as blue.

    // Time cv::cvtColor.
    gettimeofday(&tv_start, nullptr);
    cv::cvtColor(SrcImg, GrayImgTmp, cv::COLOR_BGR2GRAY);
    gettimeofday(&tv_end, nullptr);
    time_use = elapsed_us(tv_start, tv_end);
    printf("cvtColor time_use is %.10f\n", time_use);
    cv::imwrite("gray1.jpg", GrayImgTmp);

    // Time the plain C conversion.
    gettimeofday(&tv_start, nullptr);
    reference_convert(GrayImg.data, SrcImg.data, pixels);
    gettimeofday(&tv_end, nullptr);
    time_use = elapsed_us(tv_start, tv_end);
    printf("C program time_use is %.10f\n", time_use);

    // Time the NEON intrinsics conversion. The function takes the pixel count
    // and divides it into groups of 8 itself.
    gettimeofday(&tv_start, nullptr);
    neon_convert(GrayImg.data, SrcImg.data, pixels);
    gettimeofday(&tv_end, nullptr);
    time_use = elapsed_us(tv_start, tv_end);
    printf("neon Intrinsics time_use is %.10f\n", time_use);
    cv::imwrite("gray2.jpg", GrayImg);

    // Time the NEON assembly conversion. It also takes the pixel count and
    // divides by 8 internally, so the count must not be pre-divided here;
    // otherwise only 1/64 of the image is converted and the timing is meaningless.
    gettimeofday(&tv_start, nullptr);
    neon_asm_convert(GrayImg.data, SrcImg.data, pixels);
    gettimeofday(&tv_end, nullptr);
    time_use = elapsed_us(tv_start, tv_end);
    printf("neon asm time_use is %.10f\n", time_use);

    return 0;
}
原图
opencv cvtcolor转换后灰度图
neon 转换后灰度图