webrtc-agc 自动增益控制算法

0x13

已于 2024-04-23 20:22:13 修改

阅读量813

点赞数 2

分类专栏：音视频处理算法文章标签： webrtc agc 自动增益控制算法自动增益控制

于 2023-11-08 22:05:29 首次发布

本文链接：https://blog.csdn.net/qq_34448345/article/details/134299691

版权

音视频处理算法专栏收录该内容

9 篇文章 6 订阅

订阅专栏

最近又开始调 webrtc-agc 算法，这里记录自适应模拟增益模式下音量反馈调节的过程。

AGC算法里面相关的函数：

WebRtcAgc_AddMic：用于将来自麦克风的音频帧输入 AGC 处理流程。这是原始音频帧的输入点。
WebRtcAgc_AddFarend：用于添加来自远端音频的音频帧，以考虑远端声音对 AGC 处理的影响。这通常用于处理回声的情况。
WebRtcAgc_GetAddFarendError：检查给定的帧长（采样点数）对当前采样率是否合法，用于在调用 WebRtcAgc_AddFarend 之前判断添加远端音频是否会出错。
WebRtcAgc_VirtualMic：在没有可调节的模拟麦克风音量时（如自适应数字模式）使用，通过对输入信号施加数字增益来模拟麦克风音量调节，即用“虚拟麦克风”代替模拟麦克风。
WebRtcAgc_UpdateAgcThresholds：更新 AGC 的阈值参数，以根据音频场景的变化来调整 AGC 的行为。
WebRtcAgc_SaturationCtrl：根据信号包络判断输入是否过饱和，供后续降低麦克风音量使用，以防止失真。
WebRtcAgc_ZeroCtrl：检测输入信号是否长时间几乎全为 0；若是且当前音量较低，则适当提高麦克风音量。
WebRtcAgc_SpeakerInactiveCtrl：检测近端说话人是否长时间不活跃，并据此调整 VAD 阈值。
WebRtcAgc_ExpCurve：根据归一化后的音量选择分段线性近似曲线的区间索引，用于计算音量提升时的权重。
WebRtcAgc_ProcessAnalog：执行自适应模拟增益的调节逻辑，根据输入电平计算新的麦克风（模拟）音量。
WebRtcAgc_Process：AGC 的处理入口，每帧先调用 WebRtcAgc_ProcessDigital 做数字增益，模拟增益模式下再调用 WebRtcAgc_ProcessAnalog 调节麦克风音量。
WebRtcAgc_set_config：用于设置 AGC 的配置参数，如目标能量、阈值等。
WebRtcAgc_get_config：用于获取当前 AGC 的配置参数。
WebRtcAgc_Create：创建 AGC 的实例。
WebRtcAgc_Free：释放 AGC 的实例。
WebRtcAgc_Init：初始化 AGC 实例的状态并设置初始参数（内存分配由 WebRtcAgc_Create 完成）。
WebRtcAgc_CalculateGainTable：用于计算 AGC 的增益表，以在处理音频时快速查找所需的增益值。
WebRtcAgc_InitDigital：初始化 AGC 的数字处理部分。
WebRtcAgc_AddFarendToDigital：将远端音频添加到数字 AGC 处理中。
WebRtcAgc_ProcessDigital：执行数字 AGC 处理，处理数字音频信号。
WebRtcAgc_InitVad：初始化 AGC 的语音活动检测（VAD）部分，用于检测语音活动。
WebRtcAgc_ProcessVad：执行 VAD 处理，用于检测语音活动

下面是音量调节流程：

webrtc-agc 算法自适应模拟增益音量调节流程:


0.WebRtcAgc_set_config 设置参数，有几个参数在后续调节音量会用到。

	1.WebRtcAgc_UpdateAgcThresholds 参数设置,自适应参数设置,后面会计算一帧的能量判断处于哪个区间来确定音量增大还是减小
		#define ANALOG_TARGET_LEVEL 11
		#define OFFSET_ENV_TO_RMS 9
			# targetIdx 一直是20
		    stt->targetIdx = ANALOG_TARGET_LEVEL + OFFSET_ENV_TO_RMS;
			
		# 下面是目标电平查找表 kTargetLevelTable，表项 i 对应 -i dBov 的目标能量（计算公式见下方 analogTargetLevel 的注释）
		/* Target-level lookup table: entry i is the target energy for a level of
		 * -i dBov, i.e. round((32767 * 10^(-i/20))^2 * 16 / 2^7), so moving 10
		 * entries further divides the value by 10. */
		static const int32_t kTargetLevelTable[64] = {
        134209536, 106606424, 84680493, 67264106, 53429779, 42440782, 33711911,
        26778323, 21270778, 16895980, 13420954, 10660642, 8468049, 6726411,
        5342978, 4244078, 3371191, 2677832, 2127078, 1689598, 1342095,
        1066064, 846805, 672641, 534298, 424408, 337119, 267783,
        212708, 168960, 134210, 106606, 84680, 67264, 53430,
        42441, 33712, 26778, 21271, 16896, 13421, 10661,
        8468, 6726, 5343, 4244, 3371, 2678, 2127,
        1690, 1342, 1066, 847, 673, 534, 424,
        337, 268, 213, 169, 134, 107, 85,
        67};
		
		#ifdef MIC_LEVEL_FEEDBACK
			stt->targetIdx += stt->targetIdxOffset;
		#endif
			/* kTargetLevelTable[20]=1342095 */
			/* analogTargetLevel = round((32767*10^(-targetIdx/20))^2*16/2^7) */
			stt->analogTargetLevel = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx]; /* ex. -20 dBov */
			stt->startUpperLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx - 1]; /* -19 dBov */
			stt->startLowerLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx + 1]; /* -21 dBov */
			stt->upperPrimaryLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx - 2]; /* -18 dBov */
			stt->lowerPrimaryLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx + 2]; /* -22 dBov */
			stt->upperSecondaryLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx - 5]; /* -15 dBov */
			stt->lowerSecondaryLimit = RXX_BUFFER_LEN * kTargetLevelTable[stt->targetIdx + 5]; /* -25 dBov */
			stt->upperLimit = stt->startUpperLimit;
			stt->lowerLimit = stt->startLowerLimit;
		

1.WebRtcAgc_AddMic	模拟增益需要调用这个函数
	1.当 micVol 已经超过模拟音量上限 maxAnalog（模拟音量调到最大仍达不到目标音量）时,再对输入样本乘以 1~3.16 倍（Q12 增益表,约 0~10 dB）的数字增益,也就是在输入音量基础上做一个放大
			   if (stt->micVol > stt->maxAnalog) {
					...
					// 对输入音频样本应用增益
					for (i = 0; i < samples; i++) {
						size_t j;
						for (j = 0; j < num_bands; ++j) {
							// 对输入音频样本应用增益 并限制范围
							// gain 为 Q12 定点数,右移 12 位后等效增益范围约为 1~3.16 倍.
							sample = (in_mic[j][i] * gain) >> 12;
							if (sample > 32767) {
								in_mic[j][i] = 32767;
							} else if (sample < -32768) {
								in_mic[j][i] = -32768;
							} else {
								in_mic[j][i] = (int16_t) sample;
							}
						}
					}
				}
				else {
					stt->gainTableIdx = 0;
				}
	2.计算当前帧音频信号的包络值,保存在 env[2][10]中,后续会用来判断是否饱和、是否长时间静音
				if (stt->inQueue > 0) {
					ptr = stt->env[1];
				} else {
					ptr = stt->env[0];
				}
				for (i = 0; i < kNumSubframes; i++) {
					/* iterate over samples */
					max_nrg = 0;
					for (n = 0; n < L; n++) {
						nrg = in_mic[0][i * L + n] * in_mic[0][i * L + n];
						if (nrg > max_nrg) {
							max_nrg = nrg;
						}
					}
					ptr[i] = max_nrg;
				}
	3.计算当前帧音频信号的能量值,保存到 Rxx16w32_array 中,后续用来调音
				if (stt->inQueue > 0) {
					ptr = stt->Rxx16w32_array[1];
				} else {
					ptr = stt->Rxx16w32_array[0];
				}
				// 一帧又分为5个子帧
				for (i = 0; i < kNumSubframes / 2; i++) {
					// 16k 采样 1帧为 160 点 5个子帧每个子帧为 32个点 ,也就是每32个点计算一次能量
					if (stt->fs == 16000) {
						downsampleBy2(&in_mic[0][i * 32], 32, tmp_speech,stt->filterState);
					}
					// 8k采样 1帧为 80 点,5个子帧每个子帧为 16个点,也就是每16个点计算一次能量
					else {
						memcpy(tmp_speech, &in_mic[0][i * 16], 16 * sizeof(short));
					}
					/* Compute energy in blocks of 16 samples */
					ptr[i] = DotProductWithScale(tmp_speech, tmp_speech, 16, 4);
				}
	4.WebRtcAgc_ProcessVad 执行一次VAD：把一帧按 1 毫秒划分为 10 个子帧,逐个降采样到 4 kHz、累加能量后计算信噪比,保存 state->meanLongTerm(长期能量均值)、state->varianceLongTerm(长期能量方差)、state->stdLongTerm(长期能量标准差)等参数,后面是通过 logRatio > vadThreshold 认为有语音活动
				for (subfr = 0; subfr < 10; subfr++) {
					// downsample to 4 kHz
					if (nrSamples == 160) {
						for (k = 0; k < 8; k++) {
							tmp32 = (int32_t) in[2 * k] + (int32_t) in[2 * k + 1];
							tmp32 >>= 1;
							buf1[k] = (int16_t) tmp32;
						}
						in += 16;
						downsampleBy2(buf1, 8, buf2, state->downState);
					} else {
						downsampleBy2(in, 8, buf2, state->downState);
						in += 8;
					}
					// 高通滤波器与计算能量
					for (k = 0; k < 4; k++) {
						out = buf2[k] + HPstate;
						tmp32 = 600 * out;
						HPstate = (int16_t) ((tmp32 >> 10) - buf2[k]);

						// Add 'out * out / 2**6' to 'nrg' in a non-overflowing
						// way. Guaranteed to work as long as 'out * out / 2**6' fits in
						// an int32_t.
						nrg += out * (out / (1 << 6));
						nrg += out * (out % (1 << 6)) / (1 << 6);
					}
				}
				// 确定信号级别
				// energy level (range {-32..30}) (Q10)
				dB = (15 - zeros) * (1 << 11);
				if (state->counter < kAvgDecayTime) {
					// decay time = AvgDecTime * 10 ms
					state->counter++;
				}
				// 后面是计算信噪比、以及其他能量参数
				// ...
2.WebRtcAgc_Process
	1.WebRtcAgc_ProcessDigital	每一帧都会先进行数字增益
	2.WebRtcAgc_ProcessAnalog	只有模拟增益才会进入
	     1.首次调用时 51/512=0.099，确保首次音量初始化时设置音量不低于音量范围的 0.099倍。
				if (stt->firstCall == 0) {
					int32_t tmpVol;
					stt->firstCall = 1; // 将 firstCall 标记为已调用过
					// tmp32是整个音量范围的0.099倍
					tmp32 = ((stt->maxLevel - stt->minLevel) * 51) >> 9;
					// (minLevel有可能不等于0所以要加偏移)
					tmpVol = (stt->minLevel + tmp32);
					if ((inMicLevelTmp < tmpVol) && (stt->agcMode == kAgcModeAdaptiveAnalog)) {
						inMicLevelTmp = tmpVol;
					}
					// 确保首次音量初始化时设置音量不低于音量范围的 0.099倍
					stt->micVol = inMicLevelTmp;
				}
		 2.如果前面应用了数字增益,确保不会将模拟麦克风的音量提高到超过数字增益的最大级别。
				if ((inMicLevelTmp == stt->maxAnalog) && (stt->micVol > stt->maxAnalog)) {
					inMicLevelTmp = stt->micVol;
				}
		 3.当麦克风音量 inMicLevelTmp 被外部(手动)改动且低于 minOutput 时,将麦克风音量重新提高到音量范围约 0.1 倍处(minLevel + 范围*51/512)
				if ((inMicLevelTmp != stt->micVol) && (inMicLevelTmp < stt->minOutput)) {
					tmp32 = ((stt->maxLevel - stt->minLevel) * 51) >> 9;
					inMicLevelTmp = (stt->minLevel + tmp32);
					stt->micVol = inMicLevelTmp;
				}
		 4.判断信号是否过饱和 WebRtcAgc_SaturationCtrl(stt, &saturated, stt->env[0]), 计算结果 saturated。
				// 包络数组
				for (i = 0; i < 10; i++) {
					tmpW16 = (int16_t) (env[i] >> 20);
					if (tmpW16 > 875) {
						stt->envSum += tmpW16;	// 将信号包络值压缩后累加
					}
				}
				if (stt->envSum > 25000) {	// 总的超过25000认为是饱和
					*saturated = 1;
					stt->envSum = 0;  // stt->envSum 是一个累积变量，用于跟踪多个帧中的信号过饱和情况。检测到过饱和之后重置
				}
		 5.如果过饱和将音量 micVol 缩减到0.903倍,重置一些阈值参数。zeroCtrlMax保存饱和时的音量值,保证后续长时间静音时增大音量不会超过这个值
				if (saturated == 1) {
					stt->Rxx160_LPw32 = (stt->Rxx160_LPw32 / 8) * 7; // 自相关系数降低0.875倍数,
					stt->zeroCtrlMax = stt->micVol;
					// 29591/32768 = 0.903 将当前音量缩减到 0.903 倍,并确保至少比上一次音量 lastMicVol 低 2
					tmp32 = inMicLevelTmp - stt->minLevel;
					tmpU32 = ((uint32_t) ((uint32_t) (29591) * (uint32_t) (tmp32)));
					stt->micVol = (tmpU32 >> 15) + stt->minLevel;
					if (stt->micVol > lastMicVol - 2) {
						stt->micVol = lastMicVol - 2;
					}
					inMicLevelTmp = stt->micVol;
					if (stt->micVol < stt->minOutput) {
						*saturationWarning = 1; // 过饱和警告
					}
					stt->msTooHigh = -100;
					stt->activeSpeech = 0;
					stt->Rxx16_LPw32Max = 0;
					stt->msecSpeechInnerChange = kMsecSpeechInner;
					stt->msecSpeechOuterChange = kMsecSpeechOuter;
					stt->changeToSlowMode = 0;
					stt->muteGuardMs = 0;
					stt->upperLimit = stt->startUpperLimit;
					stt->lowerLimit = stt->startLowerLimit;
				#ifdef MIC_LEVEL_FEEDBACK
					// stt->numBlocksMicLvlSat = 0;
				#endif
				}
		 6.判断信号是否几乎全为0,也是通过包络数组计算。一帧的包络之和小于500认为该帧近似静音,msZero 累加10ms,否则清零;msZero 大于500表示已连续超过500ms处于静音,此时若当前音量低于中间音量 midVal,则音量增大到约1.1倍(1126/1024),最大不能超过 zeroCtrlMax(饱和或音量过高下调时记录的音量)
				/*
				 * Zero-signal control: tracks how long the input envelope has stayed
				 * near zero and, once that lasts long enough, raises a low microphone
				 * level by roughly 10%.
				 *
				 * stt        - AGC state; msZero, muteGuardMs, micVol, activeSpeech and
				 *              Rxx16_LPw32Max are updated here.
				 * inMicLevel - in/out microphone level; may be increased in place.
				 * env        - the 10 envelope values that are summed for this call.
				 */
				void WebRtcAgc_ZeroCtrl(LegacyAgc *stt, int32_t *inMicLevel, const int32_t *env) {
					int16_t i;
					int64_t tmp = 0;
					int32_t midVal;
					/* Sum the 10 envelope values to decide whether the input is (almost) zero. */
					for (i = 0; i < 10; i++) {
						tmp += env[i];
					}	
					if (tmp < 500) {	// envelope sum is near zero: extend the silence timer (presumably 10 ms per call)
						stt->msZero += 10;
					} else {
						stt->msZero = 0;	// signal present: restart the silence timer
					}
					/* Count the mute guard timer down while it is running. */
					if (stt->muteGuardMs > 0) {
						stt->muteGuardMs -= 10;
					}
					if (stt->msZero > 500) {	// near-zero input has persisted past the 500 limit
						stt->msZero = 0;
						midVal = (stt->maxAnalog + stt->minLevel + 1) / 2; // rounded midpoint between minLevel and maxAnalog
						if (*inMicLevel < midVal) {
							// Only levels below the midpoint are raised: scale by 1126/1024 (about 1.1),
							// capped at zeroCtrlMax.
							*inMicLevel = (1126 * *inMicLevel) >> 10;
							*inMicLevel = MIN(*inMicLevel, stt->zeroCtrlMax);
							stt->micVol = *inMicLevel;
						}
						stt->activeSpeech = 0;	// reset the active-speech counter
						stt->Rxx16_LPw32Max = 0;
						// Restart the mute guard timer; NOTE(review): presumably this holds off
						// upward adaptation for a while after a mute - confirm against callers.
						stt->muteGuardMs = kMuteGuardTimeMs;
					}
			}
		 7.根据当前信号活跃状态调整VAD阈值:stdLongTerm 小于 2500(长时间静音)时阈值直接设为 1500;否则以 kNormalVadThreshold 为基准(stdLongTerm 小于 4500 时再加上 (4500 - stdLongTerm)/2),并与旧阈值按 31:1 做滑动平均之后保存,用于后续的VAD判断
				// stdLongTerm 是长期能量标准差在上一次VAD判决中计算得到,越大表示越可能有语音活动
				if (stt->vadMic.stdLongTerm < 2500) {
					stt->vadThreshold = 1500;
				} else {
					vadThresh = kNormalVadThreshold;
					if (stt->vadMic.stdLongTerm < 4500) {
						/* Scale between min and max threshold */
						vadThresh += (4500 - stt->vadMic.stdLongTerm) / 2;
					}

					/* stt->vadThreshold = (31 * stt->vadThreshold + vadThresh) / 32; */
					tmp32 = vadThresh + 31 * stt->vadThreshold;
					stt->vadThreshold = (int16_t) (tmp32 >> 5);
				}
		 8.下面根据 vadLogRatio 的值，判断是否检测到语音活动。如果检测到语音活动则进行调音，会动态调整 AGC 阈值和麦克风级别。根据全帧能量Rxx160_LPw32 所在4个范围如下：
				如果 stt->Rxx160_LPw32 大于 stt->upperSecondaryLimit 会降低录音级别，以避免饱和。
				如果 stt->Rxx160_LPw32 大于 stt->upperLimit，会降低录音级别，以避免饱和。
				如果 stt->Rxx160_LPw32 小于 stt->lowerSecondaryLimit ，会提高录音级别。
				如果 stt->Rxx160_LPw32 小于 stt->lowerLimit，会提高录音级别。
				如果不在以上4种情况范围内,即 lowerLimit < Rxx160_LP/640 < upperLimit,持续 4000ms 后可以触发慢变模式（changeToSlowMode）
				部分代码如下：
				// 音量缩减为 0.95倍
				if (stt->Rxx160_LPw32 > stt->upperSecondaryLimit) {
					stt->msTooHigh += 2; // 递增,记录音频信号能量过强的时间
					stt->msTooLow = 0; // 音频信号过低清零
					stt->changeToSlowMode = 0; // 停止慢速模式
					if (stt->msTooHigh > stt->msecSpeechOuterChange) { // 音频信号过强持续时间达到上线
						stt->msTooHigh = 0;	// 重新信号能量过强记时
						/* Lower the recording level */
						/* Multiply by 0.828125 which corresponds to decreasing ~0.8dB */
						tmp32 = stt->Rxx160_LPw32 >> 6;
						stt->Rxx160_LPw32 = tmp32 * 53;
						/* Reduce the max gain to avoid excessive oscillation
						 * (but never drop below the maximum analog level).
						 */
						stt->maxLevel = (15 * stt->maxLevel + stt->micVol) / 16;
						stt->maxLevel = MAX(stt->maxLevel, stt->maxAnalog);
						stt->zeroCtrlMax = stt->micVol;
						/* 0.95 in Q15 */
						tmp32 = inMicLevelTmp - stt->minLevel;
						tmpU32 = ((uint32_t) ((uint32_t) (31130) * (uint32_t) (tmp32)));
						stt->micVol = (tmpU32 >> 15) + stt->minLevel;
						if (stt->micVol > lastMicVol - 1) {
							stt->micVol = lastMicVol - 1;
						}
						inMicLevelTmp = stt->micVol;
						stt->activeSpeech = 0;
						stt->Rxx16_LPw32Max = 0;
					}
				}
				// 音量缩减为 0.965 倍(31621/32768)
				else if (stt->Rxx160_LPw32 > stt->upperLimit) {
					stt->msTooHigh += 2;
					stt->msTooLow = 0;
					stt->changeToSlowMode = 0;
					if (stt->msTooHigh > stt->msecSpeechInnerChange) {
						/* Lower the recording level */
						stt->msTooHigh = 0;
						/* Multiply by 0.828125 which corresponds to decreasing ~0.8dB */
						stt->Rxx160_LPw32 = (stt->Rxx160_LPw32 / 64) * 53;

						/* Reduce the max gain to avoid excessive oscillation
						 * (but never drop below the maximum analog level).
						 */
						stt->maxLevel = (15 * stt->maxLevel + stt->micVol) / 16;
						stt->maxLevel = MAX(stt->maxLevel, stt->maxAnalog);

						stt->zeroCtrlMax = stt->micVol;

						/* 0.965 in Q15 */   // 音量缩减为 0.965 倍数
						//tmp32 = inMicLevelTmp - stt->minLevel;
						tmpU32 = ((uint32_t) ((uint32_t) (31621) * (uint32_t) ((inMicLevelTmp - stt->minLevel))));
						stt->micVol = (tmpU32 >> 15) + stt->minLevel;
						if (stt->micVol > lastMicVol - 1) {
							stt->micVol = lastMicVol - 1;
						}
						inMicLevelTmp = stt->micVol;
					}
				}
				// 音量按曲线权重 weightFIX(Q14,由 kOffset1/kSlope1 分段线性近似得到)放大,且至少比上一次增大 2;能量估计 Rxx160_LPw32 同步放大为 1.047 倍(67/64)
				 else if (stt->Rxx160_LPw32 < stt->lowerSecondaryLimit) {
						stt->msTooHigh = 0;		// 重置强音量持续时长
						stt->changeToSlowMode = 0; 
						stt->msTooLow += 2;  // 低音量持续时长递增
						if (stt->msTooLow > stt->msecSpeechOuterChange) {  // 低音量持续时长达到一段时间则进行音量放大
							/* Raise the recording level */
							int16_t index, weightFIX;
							int16_t volNormFIX = 16384;  // =1 in Q14.
							stt->msTooLow = 0;
							/* Normalize the volume level */
							tmp32 = (inMicLevelTmp - stt->minLevel) << 14;
							if (stt->maxInit != stt->minLevel) {
								volNormFIX = tmp32 / (stt->maxInit - stt->minLevel);
							}
							/* Find correct curve */
							WebRtcAgc_ExpCurve(volNormFIX, &index);
							weightFIX = kOffset1[index] - (int16_t) ((kSlope1[index] * volNormFIX) >> 13);
							/* 增大为 1.047 倍数 */
							stt->Rxx160_LPw32 = (stt->Rxx160_LPw32 / 64) * 67;
							//tmp32 = inMicLevelTmp - stt->minLevel;
							tmpU32 =((uint32_t) weightFIX * (uint32_t) (inMicLevelTmp - stt->minLevel));
							stt->micVol = (tmpU32 >> 14) + stt->minLevel;
							if (stt->micVol < lastMicVol + 2) {
								stt->micVol = lastMicVol + 2;
							}
							inMicLevelTmp = stt->micVol;
						}
				}
				// 音量按曲线权重 weightFIX(Q14,由 kOffset2/kSlope2 分段线性近似得到)放大,且至少比上一次增大 1;能量估计 Rxx160_LPw32 同步放大为 1.047 倍(67/64)
				else if (stt->Rxx160_LPw32 < stt->lowerLimit) {
					stt->msTooHigh = 0;
					stt->changeToSlowMode = 0;
					stt->msTooLow += 2;
					if (stt->msTooLow > stt->msecSpeechInnerChange) {
						int16_t index, weightFIX;
						int16_t volNormFIX = 16384;  // =1 in Q14.
						stt->msTooLow = 0;
						tmp32 = (inMicLevelTmp - stt->minLevel) << 14;
						if (stt->maxInit != stt->minLevel) {
							volNormFIX = tmp32 / (stt->maxInit - stt->minLevel);
						}
						WebRtcAgc_ExpCurve(volNormFIX, &index);
						weightFIX = kOffset2[index] - (int16_t) ((kSlope2[index] * volNormFIX) >> 13);
						stt->Rxx160_LPw32 = (stt->Rxx160_LPw32 / 64) * 67;
						tmpU32 = ((uint32_t) weightFIX * (uint32_t) (inMicLevelTmp - stt->minLevel));
						stt->micVol = (tmpU32 >> 14) + stt->minLevel;
						if (stt->micVol < lastMicVol + 1) {
							stt->micVol = lastMicVol + 1;
						}
						inMicLevelTmp = stt->micVol;
					}
				}
				// 慢速模式
				else {
					if (stt->changeToSlowMode > 4000) {
						stt->msecSpeechInnerChange = 1000;
						stt->msecSpeechOuterChange = 500;
						stt->upperLimit = stt->upperPrimaryLimit;
						stt->lowerLimit = stt->lowerPrimaryLimit;
					} else {
						stt->changeToSlowMode += 2;  // in milliseconds
					}
					stt->msTooLow = 0;
					stt->msTooHigh = 0;

					stt->micVol = inMicLevelTmp;
				}