【Halcon逆向工程】sobel_dir算子破解 C++实现并加速

原创已于 2025-04-15 15:59:58 修改 · 142 阅读
CC 4.0 BY-SA版权
文章标签：
于 2025-03-13 13:31:09 首次发布
void sobel_dir(unsigned char* img, unsigned char* amp, unsigned char* dir, int rows, int cols, string mode)
{
	// padding
	const int padded_rows = rows + 2;
	const int padded_cols = cols + 2;
	unsigned char* padded = new unsigned char[padded_rows * padded_cols];
	Reflect101_Padding(img, padded, rows, cols, 1);

#pragma omp parallel for schedule(guided) num_threads(omp_get_max_threads())
	for (int y = 0; y < rows; ++y)
	{
		for (int x = 0; x < cols - 16; x += 8)
		{
			// 公共顶点部分
			__m128i ul = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x]));
			__m128i ur = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x + 2]));
			__m128i dl = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x]));
			__m128i dr = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x + 2]));
			// 竖直差分核
			__m128i aum = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x + 1]));
			__m128i adm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x + 1]));
			// 水平差分核
			__m128i blm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 1) * padded_cols + x]));
			__m128i brm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 1) * padded_cols + x + 2]));

			// au=2aum+ul+ur ad=2adm+dl+dr a=au-ad
			__m128i au = _mm_add_epi16(aum, _mm_add_epi16(aum, _mm_add_epi16(ul, ur)));
			__m128i ad = _mm_add_epi16(adm, _mm_add_epi16(adm, _mm_add_epi16(dl, dr)));
			__m128i a = _mm_sub_epi16(au, ad);
			// bl=2blm+ul+dl br=2brm+ur+dr b=bl-br
			__m128i bl = _mm_add_epi16(blm, _mm_add_epi16(blm, _mm_add_epi16(ul, dl)));
			__m128i br = _mm_add_epi16(brm, _mm_add_epi16(brm, _mm_add_epi16(ur, dr)));
			__m128i b = _mm_sub_epi16(bl, br);

			__m128 a1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a));
			__m128 a2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a, 8)));
			__m128 b1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(b));
			__m128 b2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(b, 8)));

			// dir = arctan2(b/a)
			__m128 ang1 = _mm_mul_ps(_mm_atan2_ps(b1, a1), _mm_set1_ps(90 / PI));
			__m128 ang2 = _mm_mul_ps(_mm_atan2_ps(b2, a2), _mm_set1_ps(90 / PI));
			__m128 mask1 = _mm_cmplt_ps(ang1, _mm_setzero_ps());
			__m128 mask2 = _mm_cmplt_ps(ang2, _mm_setzero_ps());
			mask1 = _mm_and_ps(_mm_set1_ps(180), mask1);
			mask2 = _mm_and_ps(_mm_set1_ps(180), mask2);
			ang1 = _mm_add_ps(_mm_add_ps(mask1, ang1), _mm_set1_ps(0.25));
			ang2 = _mm_add_ps(_mm_add_ps(mask2, ang2), _mm_set1_ps(0.25));
			__m128i dir1 = _mm_cvttps_epi32(ang1);
			__m128i dir2 = _mm_cvttps_epi32(ang2);
			__m128i result = _mm_packus_epi32(dir1, dir2);
			__m128i sp1 = _mm_cmpeq_epi16(a, _mm_setzero_si128());
			__m128i sp2 = _mm_cmpeq_epi16(b, _mm_setzero_si128());
			__m128i sp = _mm_and_si128(sp1, sp2);
			sp = _mm_and_si128(sp, _mm_set1_epi16(255));
			result = _mm_add_epi16(result, sp);
			result = _mm_packus_epi16(result, _mm_setzero_si128());
			_mm_storeu_si128((__m128i*) & dir[y * cols + x], result);

			// amp = sqrt(a^2+b^2)/4
			if (mode == "sum_sqrt")
			{
				a1 = _mm_mul_ps(a1, a1);
				a2 = _mm_mul_ps(a2, a2);
				b1 = _mm_mul_ps(b1, b1);
				b2 = _mm_mul_ps(b2, b2);
				__m128i result1 = _mm_srli_epi32(_mm_cvttps_epi32(_mm_sqrt_ps(_mm_add_ps(a1, b1))), 2);
				__m128i result2 = _mm_srli_epi32(_mm_cvttps_epi32(_mm_sqrt_ps(_mm_add_ps(a2, b2))), 2);
				__m128i result = _mm_packus_epi32(result1, result2);
				result = _mm_packus_epi16(result, _mm_setzero_si128());
				_mm_storeu_si128((__m128i*) & amp[y * cols + x], result);
			}
			// amp = (|a|+|b|)/4
			else if (mode == "sum_abs")
			{
				__m128i temp = _mm_add_epi16(_mm_abs_epi16(a), _mm_abs_epi16(b));
				__m128i result = _mm_srli_epi16(temp, 2);
				result = _mm_packus_epi16(result, _mm_setzero_si128());
				_mm_storeu_si128((__m128i*) & amp[y * cols + x], result);
			}
		}
		for (int x = cols - 16; x < cols; ++x)
		{
			int a1 = padded[y * padded_cols + x] + 2 * padded[y * padded_cols + x + 1] + padded[y * padded_cols + x + 2];
			int a2 = padded[(y + 2) * padded_cols + x] + 2 * padded[(y + 2) * padded_cols + x + 1] + padded[(y + 2) * padded_cols + x + 2];
			int b1 = padded[y * padded_cols + x] + 2 * padded[(y + 1) * padded_cols + x] + padded[(y + 2) * padded_cols + x];
			int b2 = padded[y * padded_cols + x + 2] + 2 * padded[(y + 1) * padded_cols + x + 2] + padded[(y + 2) * padded_cols + x + 2];
			int a = a1 - a2;
			int b = b1 - b2;

			if (a == 0 && b == 0)
				dir[y * cols + x] = 255;
			else
				dir[y * cols + x] = int(atan2(b, a) * 90 / PI + 180.25) % 180;

			if (mode == "sum_sqrt")
				amp[y * cols + x] = sqrt(a * a + b * b) * 0.25;
			else if (mode == "sum_abs")
				amp[y * cols + x] = (abs(a) + abs(b)) * 0.25;
		}
	}

	delete[] padded;
}