Intrinsic RGB2Y

/// Converts interleaved 3-byte pixels in pRGB to one luma byte per pixel in pOut
/// using SSE intrinsics, then prints the elapsed clock() ticks to std::cout.
///
/// Each output byte is (c0 * B_WT + c1 * G_WT + c2 * R_WT) >> 8, evaluated in
/// wrapping 16-bit arithmetic, where c0..c2 are the pixel's three bytes in memory
/// order. The whole conversion is repeated NRGB times (presumably so the timing
/// is measurable - confirm against the caller).
///
/// @param pRGB  Input bytes. Every iteration reads bytes [i, i + 36] only, which
///              the loop bound `i < isize - 36` keeps below isize (isize is
///              defined outside this function; assumed to be the input byte
///              count - TODO confirm).
/// @param pOut  Output bytes. Exactly 12 bytes are written at offset i / 3 for
///              each 36-byte input step; nothing else is touched.
///
/// NOTE: as in the original loop, input that does not satisfy `i < isize - 36`
/// (the trailing bytes, including a final exact 36-byte group) is not converted.
void SdV3(unsigned char* pRGB, unsigned char* pOut)
{
	// Per-lane weights for three loads taken at byte offsets +0, +1 and +2.
	// After multiply-and-add, 16-bit lanes 0, 3 and 6 each hold the weighted
	// sum of one complete pixel; the remaining lanes are discarded.
	const __m128i RBGW0 = _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT);
	const __m128i RBGW1 = _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT);
	const __m128i RBGW2 = _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT);

	// Each mask picks the low bytes of lanes 0, 3 and 6 (byte indices 0, 6, 12)
	// and drops them into a different 3-byte slot of the result; -1 zeroes a byte.
	const __m128i SHFMSK0 = _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
	const __m128i SHFMSK1 = _mm_setr_epi8(-1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
	const __m128i SHFMSK3 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1);
	const __m128i SHFMSK4 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1);

	// Loads 8 bytes and zero-extends them to eight 16-bit lanes. Only the low
	// 8 bytes are ever widened, so an 8-byte load is sufficient and avoids
	// reading past the range guaranteed by the loop bound.
	const auto widen8 = [](const unsigned char* p) -> __m128i {
		return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p)));
	};

	// Weighted sums (already shifted right by 8) for the 3 pixels starting at p.
	const auto luma3 = [&](const unsigned char* p) -> __m128i {
		const __m128i c0 = _mm_mullo_epi16(widen8(p + 0), RBGW0);
		const __m128i c1 = _mm_mullo_epi16(widen8(p + 1), RBGW1);
		const __m128i c2 = _mm_mullo_epi16(widen8(p + 2), RBGW2);
		return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(c0, c1), c2), 8);
	};

	const clock_t tstartcpp = clock();
	for (int it = 0; it < NRGB; ++it)
	{
		// In testing, 4 threads gave the best speedup; using all 8 physical
		// cores actually reduced it, presumably because the workload is too
		// small and the thread-pool overhead dominates.
#pragma omp parallel for num_threads(4)
		for (int i = 0; i < isize - 36; i += 36)
		{
			// The output offset is derived from i (12 output bytes per 36
			// input bytes) so the loop has a single induction variable, as
			// OpenMP's canonical loop form requires, and every thread writes
			// to the right place.
			const unsigned char* src = pRGB + i;
			unsigned char* dst = pOut + i / 3;

			// Four groups of 3 pixels (9 input bytes each), packed into
			// bytes 0..11 of one register; bytes 12..15 stay zero.
			const __m128i lo = _mm_or_si128(_mm_shuffle_epi8(luma3(src + 0), SHFMSK0),
			                                _mm_shuffle_epi8(luma3(src + 9), SHFMSK1));
			const __m128i hi = _mm_or_si128(_mm_shuffle_epi8(luma3(src + 18), SHFMSK3),
			                                _mm_shuffle_epi8(luma3(src + 27), SHFMSK4));
			const __m128i packed = _mm_or_si128(lo, hi);

			// Store exactly the 12 valid bytes. A full 16-byte store would
			// write 4 zero bytes into the next chunk's output (racing with
			// the thread that owns it) and past the end on the last chunk.
			_mm_storel_epi64(reinterpret_cast<__m128i*>(dst), packed);
			const unsigned int tail =
				static_cast<unsigned int>(_mm_cvtsi128_si32(_mm_srli_si128(packed, 8)));
			dst[8] = static_cast<unsigned char>(tail);
			dst[9] = static_cast<unsigned char>(tail >> 8);
			dst[10] = static_cast<unsigned char>(tail >> 16);
			dst[11] = static_cast<unsigned char>(tail >> 24);
		}
	}

	const clock_t tendcpp = clock();
	std::cout << "SdV3 cost time " << tendcpp - tstartcpp << std::endl;
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值