目标
- 使用 neon intrinsic 指令对视频的 full range 转 limited range 进行加速,转换公式为:
// Y通道
data[i] = (219 * data[i]) / 255 + 16;
// UV通道
data[i] = (224 * data[i]) / 255 + 16;
介绍
- neon指令属于SIMD指令,一次加载多组数据同时运算,从而实现并行加速,有点像影流之主的感觉。
- 在Studio Level设备中,8位YCbCr系统都规定亮度的取值范围介于16至235之间,而B-Y和R-Y信号的取值范围介于16至240之间。YCbCr 4∶2∶2色差信号的灰阶是16~235,通常被称作Limited Range,而RGB信号也有两种采样频率和灰阶,在PC Level系统中是0~255,而在Studio Level消费电子中则是16~235。
代码
- 代码中将除以255简化成除以256,这样就可以通过右移8位来轻松实现。除以255的版本需要乘以(219/255)这个float32_t浮点数,这样每个128位寄存器只能同时计算4个data;而除以256的版本使用uint16_t来防止乘法溢出,这样每个寄存器可以同时计算8个data,同时浮点数乘法的运算量也比整数乘法加移位的运算量大很多。
- github链接TestNeon
#include <iostream>
#include <string.h>
#include <cstdint>
#include <chrono>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_NEON
#elif defined(WIN32)
#include "NEON_2_SSE.h"
#define USE_NEON
#endif
using namespace std;
// Frame dimensions in pixels. The conversion routines in this file treat a
// buffer as width*height luma (Y) bytes followed by width*height/2 chroma
// (UV) bytes, i.e. width*height*3/2 bytes in total.
const int width = 360, height = 720;
// Scalar full-range -> limited-range conversion of data[begin, end):
//   data[i] = (multiplier * data[i]) / 256 + 16
// Dividing by 256 (instead of the exact 255) is a deliberate approximation so
// that the SIMD path can use a plain right shift by 8.
static void ScaleRangeScalar(uint8_t *data, int begin, int end, int multiplier) {
    for (int i = begin; i < end; ++i) {
        data[i] = static_cast<uint8_t>((multiplier * data[i]) / 256 + 16);
    }
}

#ifdef USE_NEON
// SIMD counterpart of ScaleRangeScalar: converts data[begin, end) 16 bytes at
// a time and hands the tail that does not fill a whole vector to the scalar
// routine. multiplier must not exceed 257 so that 255 * multiplier still fits
// in a 16-bit lane.
static void ScaleRangeNeon(uint8_t *data, int begin, int end, uint16_t multiplier) {
    const int kBytesPerVector = 16;
    const uint8x16_t offset = vdupq_n_u8(16);

    int i = begin;
    for (; i + kBytesPerVector <= end; i += kBytesPerVector) {
        // Load 16 bytes and widen them to two vectors of eight 16-bit lanes
        // so the multiplication cannot overflow.
        const uint8x16_t pixels = vld1q_u8(data + i);
        uint16x8_t low = vmovl_u8(vget_low_u8(pixels));    // lanes 0..7
        uint16x8_t high = vmovl_u8(vget_high_u8(pixels));  // lanes 8..15

        // (x * multiplier) >> 8  ==  (x * multiplier) / 256
        low = vshrq_n_u16(vmulq_n_u16(low, multiplier), 8);
        high = vshrq_n_u16(vmulq_n_u16(high, multiplier), 8);

        // Narrow back to 8 bits. vmovn_u16 yields uint8x8_t, so the halves
        // must be joined with vcombine_u8 (not vcombine_u16).
        const uint8x16_t packed = vcombine_u8(vmovn_u16(low), vmovn_u16(high));

        vst1q_u8(data + i, vaddq_u8(packed, offset));
    }

    // Remaining bytes that do not fill a whole vector.
    ScaleRangeScalar(data, i, end, multiplier);
}
#endif

// Converts a full-range YUV buffer to limited (video) range in place.
//
// data must point to width*height*3/2 writable bytes: width*height luma bytes
// followed by width*height/2 chroma bytes. Luma is scaled by 219/256 and
// chroma by 224/256, then 16 is added to both.
//
// Uses NEON intrinsics when USE_NEON is defined; otherwise falls back to the
// scalar computation so the buffer is always converted.
void FullRange2VideoRangeNeon(uint8_t *data) {
    const int size = width * height;
    const int total = size * 3 / 2;
#ifdef USE_NEON
    ScaleRangeNeon(data, 0, size, 219);      // Y plane
    ScaleRangeNeon(data, size, total, 224);  // UV data
#else
    ScaleRangeScalar(data, 0, size, 219);      // Y plane
    ScaleRangeScalar(data, size, total, 224);  // UV data
#endif
}
// Reference (non-SIMD) full-range -> limited-range conversion, done in place.
// The first width*height bytes are scaled as luma (219/256, +16); the
// following width*height/2 bytes are scaled as chroma (224/256, +16).
void FullRange2VideoRangeNoNeon(uint8_t *data) {
    const int lumaCount = width * height;
    const int totalCount = lumaCount * 3 / 2;

    uint8_t *cursor = data;
    uint8_t *const lumaEnd = data + lumaCount;
    uint8_t *const bufferEnd = data + totalCount;

    // Y plane
    while (cursor < lumaEnd) {
        *cursor = (219 * *cursor) / 256 + 16;
        ++cursor;
    }

    // UV data
    while (cursor < bufferEnd) {
        *cursor = (224 * *cursor) / 256 + 16;
        ++cursor;
    }
}
// Benchmarks the NEON conversion against the scalar reference on identical
// input and verifies that both produce byte-identical output.
// Returns 0 when the results match, -1 otherwise.
int main() {
    const int size = width * height;
    const int total = size * 3 / 2;

    // Static storage: the two buffers together are ~760 KB, which is too
    // close to the default 1 MB stack on some platforms to keep as locals.
    static uint8_t data1[width * height * 3 / 2];
    static uint8_t data2[width * height * 3 / 2];

    // Y: fill both buffers with the same repeating 0..255 ramp.
    for (int i = 0; i < size; ++i) {
        data1[i] = static_cast<uint8_t>(i % 256);
        data2[i] = static_cast<uint8_t>(i % 256);
    }
    // UV
    for (int i = 0; i < size / 2; ++i) {
        data1[size + i] = static_cast<uint8_t>(i % 256);
        data2[size + i] = static_cast<uint8_t>(i % 256);
    }

    // NEON version
    const auto neonStart = std::chrono::steady_clock::now();
    FullRange2VideoRangeNeon(data1);
    const auto neonEnd = std::chrono::steady_clock::now();
    const auto neonElapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(neonEnd - neonStart).count();

    // Scalar reference version
    const auto scalarStart = std::chrono::steady_clock::now();
    FullRange2VideoRangeNoNeon(data2);
    const auto scalarEnd = std::chrono::steady_clock::now();
    const auto scalarElapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(scalarEnd - scalarStart).count();

    // Each label is paired with the timing of the matching implementation
    // (the original printed them swapped).
    std::cout << "no NEON: " << scalarElapsed << "(us)" << std::endl;
    std::cout << "has NEON: " << neonElapsed << "(us)" << std::endl;

    // Validate: every byte of the two outputs must agree.
    for (int i = 0; i < total; ++i) {
        if (data1[i] != data2[i]) {
            std::cout << "result is wrong!!!" << std::endl;
            return -1;
        }
    }
    return 0;
}
参考资料