一 x264中的denoise_dct C语言实现
/*
 * DCT-domain denoise of one coefficient block.
 *
 * For each of the `size` coefficients, in storage order (the original note
 * says this is index 0..15 for a 4x4 block, not zigzag order):
 *   - add the coefficient's magnitude to sum[i]; the running sums are what
 *     the noise-reduction offsets are later recomputed from,
 *   - subtract offset[i] from the magnitude,
 *   - write back 0 if the result went negative, otherwise the reduced
 *     magnitude with the coefficient's original sign.
 */
static void denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
    for( int idx = 0; idx < size; idx++ )
    {
        int coef = dct[idx];
        int negative = coef < 0;                  /* remember the sign */
        int magnitude = negative ? -coef : coef;  /* absolute value */

        /* Accumulate the pre-offset magnitude for the next offset update. */
        sum[idx] += magnitude;

        /* Shrink towards zero by the per-coefficient offset. */
        magnitude -= offset[idx];

        if( magnitude < 0 )
            dct[idx] = 0;                         /* never cross zero */
        else
            dct[idx] = negative ? -magnitude : magnitude;
    }
}
/****************************************************************************
* DCT-domain noise reduction / adaptive deadzone
* from libavcodec
****************************************************************************/
/*
 * Recompute the per-coefficient noise-reduction offsets from the running
 * residual sums and block counts.
 *
 * Categories with an odd index use the 8x8 weight table (64 coefficients),
 * even ones use the 4x4 table (16 coefficients). When a category's count
 * exceeds its threshold, both the sums and the count are halved. Each
 * offset is then
 *     (strength * count + sum/2) / (sum * weight/256 + 1)
 * where strength is param.analyse.i_noise_reduction; the "+ 1" keeps the
 * divisor non-zero.
 */
void s264_noise_reduction_update( s264_t *h )
{
    /* Point the working pointers at the denoise offsets and the first sum/count buffers. */
    h->nr_offset = h->nr_offset_denoise;
    h->nr_residual_sum = h->nr_residual_sum_buf[0];
    h->nr_count = h->nr_count_buf[0];

    for( int cat = 0; cat < 3 + CHROMA444; cat++ )
    {
        const uint32_t *weight;
        int n_coefs;
        int halve_above;

        if( cat & 1 )
        {
            /* 8x8 transform category */
            weight = s264_dct8_weight2_tab;
            n_coefs = 64;
            halve_above = 1 << 16;
        }
        else
        {
            /* 4x4 transform category */
            weight = s264_dct4_weight2_tab;
            n_coefs = 16;
            halve_above = 1 << 18;
        }

        /* Halve the accumulated statistics once the count passes the threshold. */
        if( h->nr_count[cat] > halve_above )
        {
            for( int k = 0; k < n_coefs; k++ )
                h->nr_residual_sum[cat][k] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        /* Derive the new offsets from the accumulated sums. */
        for( int k = 0; k < n_coefs; k++ )
        {
            uint64_t numer = (uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                           + h->nr_residual_sum[cat][k]/2;
            uint64_t denom = (uint64_t)h->nr_residual_sum[cat][k] * weight[k]/256 + 1;
            h->nr_offset[cat][k] = numer / denom;
        }

        /* Don't denoise DC coefficients */
        h->nr_offset[cat][0] = 0;
    }
}
二 x264中的汇编实现
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size );
;-----------------------------------------------------------------------------
; SIMD counterpart of the C denoise_dct: for every 32-bit coefficient, add its
; absolute value to sum[], subtract offset[] from the absolute value, zero the
; result if it is not positive, then restore the coefficient's original sign.
; Per the prototype above: r0 = dct, r1 = sum, r2 = offset, r3 = size.
; The block is walked from its end towards its start, two vector registers
; (2*mmsize bytes, i.e. mmsize/2 coefficients) per iteration.
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,6
    pxor      m5, m5                        ; m5 = 0 (xor of a register with itself)
    movsxdifnidn r3, r3d                    ; sign-extend the 32-bit size into the full-width r3
    ; For reference, the helper macro as quoted in the original notes:
    ;   %macro movsxdifnidn 2
    ;       %ifnidn %1, %2
    ;           movsxd %1, %2   ; move doubleword to quadword with sign-extension
    ;       %endif
    ;   %endmacro
    ; i.e. the 32-bit signed value in r3d is widened to 64 bits, and only when
    ; the two operand names differ.
.loop:
    ; r3 is the number of coefficients still to process and each coefficient
    ; is 4 bytes, so r0+r3*4 points just past the remaining data.
    ; Example (assuming 16-byte registers): for a 4x4 DCT, size = 16, so
    ; r3*4 = 64 bytes; -2*mmsize selects the second-to-last 16-byte row.
    mova      m2, [r0+r3*4-2*mmsize]        ; m2 = second-to-last vector of dct coefficients
    mova      m3, [r0+r3*4-1*mmsize]        ; m3 = last vector of dct coefficients
    ABSD      m0, m2                        ; m0 = |m2|
    ABSD      m1, m3                        ; m1 = |m3|
    paddd     m4, m0, [r1+r3*4-2*mmsize]    ; m4 = |dct| + sum
    psubd     m0, [r2+r3*4-2*mmsize]        ; m0 = |dct| - offset
    mova      [r1+r3*4-2*mmsize], m4        ; store the updated sum back
    ; Same three steps for the second vector.
    paddd     m4, m1, [r1+r3*4-1*mmsize]
    psubd     m1, [r2+r3*4-1*mmsize]
    mova      [r1+r3*4-1*mmsize], m4
    pcmpgtd   m4, m0, m5                    ; m4 = per-lane mask of (m0 > 0)
    pand      m0, m4                        ; lanes that are not positive become 0; result stays in m0
    pcmpgtd   m4, m1, m5
    pand      m1, m4                        ; same for m1
    PSIGND    m0, m2                        ; give m0 the sign of the original coefficients in m2
    PSIGND    m1, m3                        ; give m1 the sign of the original coefficients in m3
    mova      [r0+r3*4-2*mmsize], m0        ; write the denoised coefficients back to dct
    mova      [r0+r3*4-1*mmsize], m1
    sub       r3d, mmsize/2                 ; step back by the mmsize/2 coefficients just processed
    jg .loop                                ; continue while coefficients remain
    RET
%endmacro