从video中抽取audio
from moviepy.editor import VideoFileClip
from pydub import AudioSegment

video_path = '/opt/audio/audios/video1.mp4'
audio_path = '/opt/audio/audios/video1.wav'  # where the extracted audio is saved

# Extract the audio track from the video.
# Close the clip when done: VideoFileClip keeps an ffmpeg reader process and
# file handles open until .close() is called, so the original code leaked them.
video = VideoFileClip(video_path)
try:
    video.audio.write_audiofile(audio_path)
finally:
    video.close()

# Re-open the extracted WAV and normalize it for speech processing:
# single channel, 16 kHz sample rate (the format most ASR/VAD tools expect).
sound = AudioSegment.from_wav(audio_path)
sound = sound.set_channels(1)
sound = sound.set_frame_rate(16000)
# Overwrite the extraction with the normalized (mono) version.
sound.export(audio_path, format="wav")
截取audio
from pydub import AudioSegment

# Load the source recording.
source = AudioSegment.from_file(
    "xxx/1.wav")

# Slice boundaries — pydub indexes AudioSegment objects in milliseconds.
clip_start_ms = 3000
clip_end_ms = 28550

# AudioSegment supports ordinary Python slice syntax for cutting.
segment = source[clip_start_ms:clip_end_ms]

# Write the excerpt out as a WAV file.
segment.export(
    "xxx/3.wav", format="wav")
拼接audio
import os
from pydub import AudioSegment

base_path = f'{os.getcwd()}/audios/reduce_noise/video2/'

# Paths of the short clips to concatenate, in playback order.
# The original code also called AudioSegment.from_file on every path here
# just to read duration_seconds, which was never used — that decoded each
# file twice. Only the paths are needed at this point.
short_audio_files = [base_path + str(i) + ".wav" for i in range(100, 105)]

# Start from an empty segment and append each clip with no crossfade,
# so the result is a straight end-to-end concatenation.
merged_audio = AudioSegment.empty()
for audio_file in short_audio_files:
    # Load the short clip (the only decode per file now).
    short_audio = AudioSegment.from_file(audio_file)
    merged_audio = merged_audio.append(short_audio, crossfade=0)

# Persist the concatenated result as one long WAV file.
merged_audio.export(f"{base_path}merged_audio.wav", format="wav")
python绘制梅尔谱图像
import os

import matplotlib
matplotlib.use('Agg')  # headless backend: render straight to file, no display
import pylab

import librosa
import librosa.display
import numpy as np

# Load the waveform that sits next to this script. librosa.load resamples
# to its default 22050 Hz since no sr argument is given.
wav_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "video1.wav")
sig, fs = librosa.load(wav_file)

save_path = 'test.jpg'  # output image path

# Mel power spectrogram, displayed in dB relative to the peak, capped at 8 kHz.
mel = librosa.feature.melspectrogram(y=sig, sr=fs)
mel_db = librosa.power_to_db(mel, ref=np.max)
librosa.display.specshow(mel_db, y_axis="mel", fmax=8000, x_axis="time")

pylab.savefig(save_path, bbox_inches=None, pad_inches=0)
pylab.close()
参考文章:
说话人分离:
主要使用 whisper-diarization(https://github.com/MahmoudAshraf97/whisper-diarization):Automatic Speech Recognition with Speaker Diarization based on OpenAI Whisper
使用场景:一整个wav,分离出不同人对应的wav,具体效果如下:
语音活性检测(VAD)
使用webrtc自带的VAD(贼好用)