【neon加速拆分/合并交叉数据】使用neon intrinsics加速合并/拆分uv的内存分布(交叉存储/分别存储)

说明

  • 在YUV格式的图片中,uv数据既可以以“UVUVUVUV”的形式交叉存储(例如NV12),也可以以“UUUUUVVVVV”的形式分开存储(例如YUV420P)。为了在二者之间转换,需要类似如下的代码:
// 交织数据(将分开存储的u、v合并为交叉存储的uv)
for (int i = 0; i < size; i++) {
	uv[2*i] = u[i];
	uv[2*i+1] = v[i];
}
// 将交错的数据分开(将交叉存储的uv拆分为u和v)
for (int i = 0; i < size; i++) {
	u[i] = uv[2*i];
	v[i] = uv[2*i+1];
}
  • 为了更快实现代码,可以使用neon intrinsics中的vld2q_u8和vst2q_u8实现高效率交叉地读取两路数据和交叉地写两路数据

代码

github链接TestNeon

#include <string.h>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

#if defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_NEON
#elif defined(WIN32)
#include "NEON_2_SSE.h"
// #define USE_NEON
#endif

using namespace std;

/**
 * De-interleaves a packed "UVUVUV..." byte stream into separate U and V
 * planes.
 *
 * @param uv  Source buffer; 2 * w * h bytes are read (U at even offsets,
 *            V at odd offsets).
 * @param u   Destination for the w * h bytes taken from even offsets.
 * @param v   Destination for the w * h bytes taken from odd offsets.
 * @param w   First dimension of the plane being processed.
 * @param h   Second dimension; only the product w * h matters here.
 *
 * The function processes exactly w * h (U, V) pairs. NOTE(review): for a
 * 4:2:0 frame that is presumably the chroma-plane size rather than the luma
 * size -- confirm against callers.
 *
 * A non-positive w * h is a no-op.
 */
void NV12TOYUV420P(const uint8_t* uv, uint8_t* u, uint8_t* v, int w, int h) {
    // Form the product in 64 bits: `w * h` in int is signed overflow (UB)
    // for large planes, and so would be the `2 * i` offsets below.
    const std::int64_t total = static_cast<std::int64_t>(w) * h;
    if (total <= 0) {
        return;
    }
    const std::size_t size = static_cast<std::size_t>(total);

    std::size_t i = 0;
#ifdef USE_NEON
    // vld2q_u8 loads 32 interleaved bytes and splits them into two 16-byte
    // registers (even bytes -> val[0], odd bytes -> val[1]).
    constexpr std::size_t kLanes = 16;
    const std::size_t vectorEnd = size - size % kLanes;
    for (; i < vectorEnd; i += kLanes) {
        const uint8x16x2_t uvData = vld2q_u8(uv + 2 * i);
        vst1q_u8(u + i, uvData.val[0]);
        vst1q_u8(v + i, uvData.val[1]);
    }
#endif
    // Scalar path: the whole range without NEON, or the < 16 leftover pairs.
    for (; i < size; ++i) {
        u[i] = uv[2 * i];
        v[i] = uv[2 * i + 1];
    }
}

/**
 * Interleaves separate U and V planes into a packed "UVUVUV..." byte stream.
 *
 * @param u   Source of the bytes written to even offsets; w * h bytes read.
 * @param v   Source of the bytes written to odd offsets; w * h bytes read.
 * @param uv  Destination buffer; 2 * w * h bytes are written.
 * @param w   First dimension of the plane being processed.
 * @param h   Second dimension; only the product w * h matters here.
 *
 * The function processes exactly w * h (U, V) pairs. NOTE(review): for a
 * 4:2:0 frame that is presumably the chroma-plane size rather than the luma
 * size -- confirm against callers.
 *
 * A non-positive w * h is a no-op.
 */
void YUV420PTONV12(const uint8_t* u, const uint8_t* v, uint8_t* uv, int w, int h) {
    // Form the product in 64 bits: `w * h` in int is signed overflow (UB)
    // for large planes, and so would be the `2 * i` offsets below.
    const std::int64_t total = static_cast<std::int64_t>(w) * h;
    if (total <= 0) {
        return;
    }
    const std::size_t size = static_cast<std::size_t>(total);

    std::size_t i = 0;
#ifdef USE_NEON
    // vst2q_u8 stores two 16-byte registers as 32 interleaved bytes
    // (val[0] -> even offsets, val[1] -> odd offsets).
    constexpr std::size_t kLanes = 16;
    const std::size_t vectorEnd = size - size % kLanes;
    for (; i < vectorEnd; i += kLanes) {
        uint8x16x2_t uvData;
        uvData.val[0] = vld1q_u8(u + i);
        uvData.val[1] = vld1q_u8(v + i);
        vst2q_u8(uv + 2 * i, uvData);
    }
#endif
    // Scalar path: the whole range without NEON, or the < 16 leftover pairs.
    for (; i < size; ++i) {
        uv[2 * i] = u[i];
        uv[2 * i + 1] = v[i];
    }
}

/**
 * Benchmarks NV12TOYUV420P on a 10000 x 10000 plane.
 *
 * Fills an interleaved buffer with the repeating pattern 0..255, then runs
 * the split 20 times and prints the wall-clock time of each run in
 * microseconds.
 *
 * The buffers total about 400 MB, so they are heap-allocated; as automatic
 * arrays they would overflow the stack. The std::vector constructor also
 * zero-fills the destination planes before the first timed run.
 */
void testNV12TOYUV420P() {
    std::cout << "split data" << '\n';

    // int dimensions match the callee's parameters, so nothing is narrowed
    // at the call site.
    constexpr int width = 10000;
    constexpr int height = 10000;
    constexpr std::size_t size = static_cast<std::size_t>(width) * height;

    std::vector<uint8_t> uv(size * 2);
    std::vector<uint8_t> u(size);
    std::vector<uint8_t> v(size);

    // Interleaved source pattern.
    for (std::size_t i = 0; i < uv.size(); ++i) {
        uv[i] = static_cast<uint8_t>(i % 256);
    }

    constexpr int kRuns = 20;
    for (int run = 0; run < kRuns; ++run) {
        const auto start = std::chrono::steady_clock::now();
        NV12TOYUV420P(uv.data(), u.data(), v.data(), width, height);
        const auto end = std::chrono::steady_clock::now();
        const auto elapsed =
            std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        // The label is printed even when USE_NEON is not defined.
        std::cout << "NEON: " << elapsed << "(us)" << '\n';
    }
}

/**
 * Benchmarks YUV420PTONV12 on a 10000 x 10000 plane.
 *
 * Fills the U plane with (2*i) % 256 and the V plane with (2*i + 1) % 256,
 * then runs the interleave 20 times and prints the wall-clock time of each
 * run in microseconds.
 *
 * The buffers total about 400 MB, so they are heap-allocated; as automatic
 * arrays they would overflow the stack. The std::vector constructor also
 * zero-fills the destination buffer before the first timed run.
 */
void testYUV420PTONV12() {
    std::cout << "combine data" << '\n';

    // int dimensions match the callee's parameters, so nothing is narrowed
    // at the call site.
    constexpr int width = 10000;
    constexpr int height = 10000;
    constexpr std::size_t size = static_cast<std::size_t>(width) * height;

    std::vector<uint8_t> uv(size * 2);
    std::vector<uint8_t> u(size);
    std::vector<uint8_t> v(size);

    // Planar source pattern: interleaving it yields 0, 1, 2, ... mod 256.
    for (std::size_t i = 0; i < size; ++i) {
        u[i] = static_cast<uint8_t>((2 * i) % 256);
        v[i] = static_cast<uint8_t>((2 * i + 1) % 256);
    }

    constexpr int kRuns = 20;
    for (int run = 0; run < kRuns; ++run) {
        const auto start = std::chrono::steady_clock::now();
        YUV420PTONV12(u.data(), v.data(), uv.data(), width, height);
        const auto end = std::chrono::steady_clock::now();
        const auto elapsed =
            std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        // The label is printed even when USE_NEON is not defined.
        std::cout << "NEON: " << elapsed << "(us)" << '\n';
    }
}

// Program entry point: runs the combine (planar -> interleaved) benchmark.
// The split benchmark is kept available but disabled.
int main()
{
    testYUV420PTONV12();
    // testNV12TOYUV420P();  // split benchmark, currently disabled

    return 0;
}

参考资料

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值