通过计算音高的平均值和标准差来分析说话人的性别和情感等信息,利用能量过滤背景人声。
class computer:
    """Frame-level audio feature extraction.

    Provides per-frame log energy (Kaldi-style framing: frame_length /
    frame_shift given in milliseconds) and pitch (F0) extraction via
    parselmouth (Praat).
    """

    def __init__(self):
        pass

    # ---- log-energy feature ----

    def sliding_window(self, x, window_size, window_shift):
        """Return overlapping frames of 1-D `x` as a strided view (no copy).

        NOTE: `as_strided` returns a view into `x`; callers must not rely on
        the frames after `x` is freed or mutated.
        """
        shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
        strides = x.strides + (x.strides[-1],)
        return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift]

    def func_num_frames(self, num_samples, window_size, window_shift):
        """Number of complete windows that fit into `num_samples` samples."""
        if num_samples < window_size:
            return 0
        return 1 + ((num_samples - window_size) // window_shift)

    def func_remove_dc_offset(self, waveform):
        """Subtract the window mean (DC offset) from the samples."""
        return waveform - np.mean(waveform)

    def func_log_energy(self, waveform):
        """Return log(sum of squares), floored at dtype eps to avoid log(0)."""
        return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))

    def process_window(self, window):
        """Compute the log energy of one DC-removed window."""
        window = self.func_remove_dc_offset(window)
        return self.func_log_energy(window)

    def extract_window(self, waveform, window_size, window_shift, dtype):
        """Frame `waveform` and return per-frame log energy as a 1-D array."""
        num_samples = len(waveform)
        num_frames = self.func_num_frames(num_samples, window_size, window_shift)
        # Fix: with zero frames the original computed a negative sample count
        # and crashed inside as_strided; return an empty result instead.
        if num_frames == 0:
            return np.empty(0, dtype=dtype)
        # Drop trailing samples that do not fill a complete window.
        num_samples_ = (num_frames - 1) * window_shift + window_size
        waveform = waveform[:num_samples_]
        frames = self.sliding_window(waveform, window_size=window_size, window_shift=window_shift)
        frames = frames.astype(dtype)
        log_energy = np.empty(frames.shape[0], dtype=dtype)  # fixed typo: log_enery
        for i in range(frames.shape[0]):
            log_energy[i] = self.process_window(window=frames[i])
        return log_energy

    def compute_log_energy(self,
                           waveform,
                           frame_length=25,
                           frame_shift=10,
                           sample_frequency=8000,
                           dtype=np.float32):
        """Per-frame log energy; frame_length / frame_shift are in milliseconds."""
        window_size = int(frame_length * sample_frequency * 0.001)
        window_shift = int(frame_shift * sample_frequency * 0.001)
        log_energy = self.extract_window(
            waveform=waveform,
            window_size=window_size,
            window_shift=window_shift,
            dtype=dtype
        )
        return log_energy

    # ---- pitch feature ----

    def parselmouth_pitch(self, wav_data, sample_rate):
        """Extract an F0 contour with Praat's autocorrelation pitch tracker.

        Unvoiced frames (NaN from Praat) are reported as 0.
        """
        f0min = 50
        f0max = 400
        time_step = 0.01  # s
        snd = parselmouth.Sound(values=wav_data, sampling_frequency=sample_rate)
        pitch = snd.to_pitch_ac(
            time_step=time_step, voicing_threshold=0.6, pitch_floor=f0min, pitch_ceiling=f0max)
        timestamps = np.arange(0, snd.duration, time_step)
        f0 = []
        for t in timestamps:
            # Fix: query Praat once per timestamp instead of twice.
            value = pitch.get_value_at_time(t)
            f0.append(0 if math.isnan(value) else value)
        return np.array(f0, dtype=np.float32)
提取声音特征(使用 parselmouth 分析说话人的音高):
# Analyze the speaker's pitch from the waveform `y` (defined earlier).
sound = parselmouth.Sound(y)
pitch = sound.to_pitch()
# Fix: the quote characters around 'frequency' and the print strings were
# typographic quotes, which are Python syntax errors.
pitch_values = pitch.selected_array['frequency']
mean_pitch = np.nanmean(pitch_values)
std_pitch = np.nanstd(pitch_values)
# Print the results.
print('Mean pitch:', mean_pitch)
print('Standard deviation of pitch:', std_pitch)
# NOTE: beyond the audio signal, text data also carries rich speaker
# information. Common speaker-analysis techniques include language
# identification, sentiment analysis, and discourse segmentation; in Python
# these can be implemented with NLP libraries such as NLTK, TextBlob, and SpaCy.