void sobel_dir(unsigned char* img, unsigned char* amp, unsigned char* dir, int rows, int cols, string mode)
{
// padding
const int padded_rows = rows + 2;
const int padded_cols = cols + 2;
unsigned char* padded = new unsigned char[padded_rows * padded_cols];
Reflect101_Padding(img, padded, rows, cols, 1);
#pragma omp parallel for schedule(guided) num_threads(omp_get_max_threads())
for (int y = 0; y < rows; ++y)
{
for (int x = 0; x < cols - 16; x += 8)
{
// 公共顶点部分
__m128i ul = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x]));
__m128i ur = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x + 2]));
__m128i dl = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x]));
__m128i dr = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x + 2]));
// 竖直差分核
__m128i aum = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[y * padded_cols + x + 1]));
__m128i adm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 2) * padded_cols + x + 1]));
// 水平差分核
__m128i blm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 1) * padded_cols + x]));
__m128i brm = _mm_cvtepu8_epi16(_mm_loadu_si128((__m128i*) & padded[(y + 1) * padded_cols + x + 2]));
// au=2aum+ul+ur ad=2adm+dl+dr a=au-ad
__m128i au = _mm_add_epi16(aum, _mm_add_epi16(aum, _mm_add_epi16(ul, ur)));
__m128i ad = _mm_add_epi16(adm, _mm_add_epi16(adm, _mm_add_epi16(dl, dr)));
__m128i a = _mm_sub_epi16(au, ad);
// bl=2blm+ul+dl br=2brm+ur+dr b=bl-br
__m128i bl = _mm_add_epi16(blm, _mm_add_epi16(blm, _mm_add_epi16(ul, dl)));
__m128i br = _mm_add_epi16(brm, _mm_add_epi16(brm, _mm_add_epi16(ur, dr)));
__m128i b = _mm_sub_epi16(bl, br);
__m128 a1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(a));
__m128 a2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(a, 8)));
__m128 b1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(b));
__m128 b2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(b, 8)));
// dir = arctan2(b/a)
__m128 ang1 = _mm_mul_ps(_mm_atan2_ps(b1, a1), _mm_set1_ps(90 / PI));
__m128 ang2 = _mm_mul_ps(_mm_atan2_ps(b2, a2), _mm_set1_ps(90 / PI));
__m128 mask1 = _mm_cmplt_ps(ang1, _mm_setzero_ps());
__m128 mask2 = _mm_cmplt_ps(ang2, _mm_setzero_ps());
mask1 = _mm_and_ps(_mm_set1_ps(180), mask1);
mask2 = _mm_and_ps(_mm_set1_ps(180), mask2);
ang1 = _mm_add_ps(_mm_add_ps(mask1, ang1), _mm_set1_ps(0.25));
ang2 = _mm_add_ps(_mm_add_ps(mask2, ang2), _mm_set1_ps(0.25));
__m128i dir1 = _mm_cvttps_epi32(ang1);
__m128i dir2 = _mm_cvttps_epi32(ang2);
__m128i result = _mm_packus_epi32(dir1, dir2);
__m128i sp1 = _mm_cmpeq_epi16(a, _mm_setzero_si128());
__m128i sp2 = _mm_cmpeq_epi16(b, _mm_setzero_si128());
__m128i sp = _mm_and_si128(sp1, sp2);
sp = _mm_and_si128(sp, _mm_set1_epi16(255));
result = _mm_add_epi16(result, sp);
result = _mm_packus_epi16(result, _mm_setzero_si128());
_mm_storeu_si128((__m128i*) & dir[y * cols + x], result);
// amp = sqrt(a^2+b^2)/4
if (mode == "sum_sqrt")
{
a1 = _mm_mul_ps(a1, a1);
a2 = _mm_mul_ps(a2, a2);
b1 = _mm_mul_ps(b1, b1);
b2 = _mm_mul_ps(b2, b2);
__m128i result1 = _mm_srli_epi32(_mm_cvttps_epi32(_mm_sqrt_ps(_mm_add_ps(a1, b1))), 2);
__m128i result2 = _mm_srli_epi32(_mm_cvttps_epi32(_mm_sqrt_ps(_mm_add_ps(a2, b2))), 2);
__m128i result = _mm_packus_epi32(result1, result2);
result = _mm_packus_epi16(result, _mm_setzero_si128());
_mm_storeu_si128((__m128i*) & amp[y * cols + x], result);
}
// amp = (|a|+|b|)/4
else if (mode == "sum_abs")
{
__m128i temp = _mm_add_epi16(_mm_abs_epi16(a), _mm_abs_epi16(b));
__m128i result = _mm_srli_epi16(temp, 2);
result = _mm_packus_epi16(result, _mm_setzero_si128());
_mm_storeu_si128((__m128i*) & amp[y * cols + x], result);
}
}
for (int x = cols - 16; x < cols; ++x)
{
int a1 = padded[y * padded_cols + x] + 2 * padded[y * padded_cols + x + 1] + padded[y * padded_cols + x + 2];
int a2 = padded[(y + 2) * padded_cols + x] + 2 * padded[(y + 2) * padded_cols + x + 1] + padded[(y + 2) * padded_cols + x + 2];
int b1 = padded[y * padded_cols + x] + 2 * padded[(y + 1) * padded_cols + x] + padded[(y + 2) * padded_cols + x];
int b2 = padded[y * padded_cols + x + 2] + 2 * padded[(y + 1) * padded_cols + x + 2] + padded[(y + 2) * padded_cols + x + 2];
int a = a1 - a2;
int b = b1 - b2;
if (a == 0 && b == 0)
dir[y * cols + x] = 255;
else
dir[y * cols + x] = int(atan2(b, a) * 90 / PI + 180.25) % 180;
if (mode == "sum_sqrt")
amp[y * cols + x] = sqrt(a * a + b * b) * 0.25;
else if (mode == "sum_abs")
amp[y * cols + x] = (abs(a) + abs(b)) * 0.25;
}
}
delete[] padded;
}
【Halcon逆向工程】sobel_dir算子破解 C++实现并加速
于 2025-03-13 13:31:09 首次发布