Streaming Speech Recognition Based on PaddlePaddle

Environment Setup

Only the WebSocket protocol is supported; the HTTP protocol is not supported.
Set up the environment:

```
git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech
pip install pytest-runner
pip install .
```
Install paddlepaddle

CPU version:

```
pip install paddlepaddle==2.5.1 -i https://mirror.baidu.com/pypi/simple
```
GPU version: install the build that matches your CUDA version. Colab ships CUDA 11.8, so install paddlepaddle-gpu==2.5.1:

```
python3 -m pip install paddlepaddle-gpu==2.5.1 -i https://mirror.baidu.com/pypi/simple
```

If the recognition service and the punctuation service are both deployed on GPU, they must not share the same card. Set the device for each service in its config file (see the sketch below):
- Recognition: demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml
- Punctuation: demos/streaming_asr_server/conf/punc_application.yaml
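A minimal sketch of the relevant settings, assuming the `device` field in these configs follows the usual PaddleSpeech server convention of `cpu` or `gpu:<id>` (the exact engine section names may differ in your version of the configs):

```
# in ws_conformer_wenetspeech_application.yaml, recognition engine section
device: 'gpu:0'

# in punc_application.yaml, punctuation engine section
device: 'gpu:1'
```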
Install the dependencies for splitting audio:

```
pip install auditok==0.2.0 -i https://mirror.baidu.com/pypi/simple
pip install soundfile==0.12.1 -i https://mirror.baidu.com/pypi/simple
```
Installation errors

If installing paddlepaddle is followed by "ModuleNotFoundError: No module named 'paddle.nn.layer.layers'":

Cause: installing paddlepaddle 2.3.1 with pip automatically pulls in the latest paddlenlp (currently 2.6.1), which is incompatible with paddlepaddle 2.3.1 and triggers this error.

Fix: manually install paddlenlp 2.5.2 with pip.
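For example:

```
pip install paddlenlp==2.5.2
```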
The server start script and the test script live in the PaddleSpeech/demos/streaming_asr_server directory. After downloading PaddleSpeech, change into PaddleSpeech/demos/streaming_asr_server. See conf/ws_application.yaml and conf/ws_conformer_wenetspeech_application.yaml in that directory for the configuration files.
The service currently integrates two models, DeepSpeech2 and Conformer, with the following config files:
- DeepSpeech2: conf/ws_application.yaml
- Conformer: conf/ws_conformer_wenetspeech_application.yaml
mp4_add_subtitles.py takes a single .mp4 file as input; note that the audio track of the mp4 must be mono, not left/right stereo.
mp4_txt_meeting.py takes a meeting folder as input.
Usage

```
cd PaddleSpeech/demos/streaming_asr_server
paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
paddlespeech_server start --config_file conf/punc_application.yaml
python local/mp4_add_subtitles.py
python local/mp4_txt_meeting.py
```
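Optionally, before running the subtitling scripts, you can check that the ASR server is up with the PaddleSpeech streaming client (this assumes the default port 8090 from the config and any 16 kHz mono wav as input):

```
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
```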
Main workflow for adding subtitles to an MP4

- Convert the MP4 to a WAV file
```
import os
import subprocess

# step 1: extract a wav audio file from the mp4
def gen_wav_from_mp4(mp4_path):
    # mp4_path is the path to the mp4 file
    wav_path = mp4_path.replace('.mp4', '.wav')
    if os.path.exists(wav_path):
        os.remove(wav_path)
    command = ['ffmpeg',
               '-i', mp4_path,
               '-ac', '1',      # downmix to a single (mono) channel
               '-ar', '16000',  # resample to 16 kHz, as expected by the ASR model
               wav_path
               ]
    completed = subprocess.run(command)
    # a non-zero return code means ffmpeg failed, so return None
    if completed.returncode != 0:
        return None
    return wav_path
```
- Split long WAV files

```
import logging
import os
import shutil

import auditok
import soundfile

logger = logging.getLogger(__name__)  # the scripts log through a logger; any configured logger works

# split the audio into voiced segments
def qiefen(path, ty='audio', mmin_dur=1, mmax_dur=100000, mmax_silence=1, menergy_threshold=55):
    audio_file = path
    # read the audio once to check that it is a valid wav file (the samples are not used further)
    audio, audio_sample_rate = soundfile.read(
        audio_file, dtype="int16", always_2d=True)
    audio_regions = auditok.split(
        audio_file,
        min_dur=mmin_dur,                    # minimum duration of a valid audio event in seconds
        max_dur=mmax_dur,                    # maximum duration of an event
        max_silence=mmax_silence,            # maximum tolerated continuous silence within an event
        energy_threshold=menergy_threshold   # detection threshold
    )
    # audio file name without extension
    filename = os.path.basename(path).split('.')[0]
    # directory that will hold the split segments
    mk = os.path.join(os.path.split(path)[0], 'split')
    split_file_path = os.path.join(mk, ty, filename)
    try:
        logger.info('removing any existing segment directory before processing')
        shutil.rmtree(split_file_path)
    except Exception:
        logger.info('directory does not exist, skipping removal')
    os.makedirs(split_file_path)
    for i, r in enumerate(audio_regions):
        # regions returned by `split` carry 'start' and 'end' metadata fields
        logger.info(
            "Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s".format(i=i, r=r))
        # zero-pad the index to three digits so the segment files sort correctly
        s = '{:03d}'.format(i)
        # save the segment; auditok fills in the {meta.start}/{meta.end} placeholders
        file_save = os.path.join(
            split_file_path, s + '-' + '{meta.start:.3f}-{meta.end:.3f}' + '.wav')
        file_name = r.save(file_save)
        logger.info("region saved as: {}".format(file_name))
    return split_file_path
```
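A quick usage sketch (the input path is hypothetical); the segments land in a split/<ty>/<basename>/ directory next to the source wav:

```
# hypothetical wav produced by gen_wav_from_mp4
split_dir = qiefen('/data/meeting/record.wav', ty='audio')
# -> /data/meeting/split/audio/record/000-0.000-12.340.wav, 001-..., ...
```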
- Generate the SRT file

```
import os
import re

# generate the subtitle file
def results2srt(results, start_time, filename, wavPath):
    """Convert results from paddlespeech to srt format for subtitles.

    Args:
        results (dict): results from paddlespeech
        start_time: start time of this audio segment within the full recording
        filename: base name used for the generated .srt file
        wavPath: path of the wav segment
    """
    # times: start and end time of each character
    times = results['times']
    # result: the full transcript, punctuation included
    result = results['result']
    # split the transcript into sentences on ',', '。' and '?'
    sentences = re.split(',|。|?', result)[:-1]
    logger.info(f'sentences: {sentences}')
    # relative start/end time of each sentence
    relative_times = []
    # shift every per-character time by this segment's offset in the full recording
    for item in times:
        item['bg'] = item['bg'] + float(start_time)
        item['ed'] = item['ed'] + float(start_time)
    word_i = 0
    for sentence in sentences:
        relative_times.append([])
        for word in sentence:
            if relative_times[-1] == []:
                # first character of the sentence: record the start time
                relative_times[-1].append(times[word_i]['bg'])
            if len(relative_times[-1]) == 1:
                relative_times[-1].append(times[word_i]['ed'])
            else:
                # keep extending the sentence end time to the current character
                relative_times[-1][1] = times[word_i]['ed']
            word_i += 1
    # write the srt file from relative_times and sentences
    srt_directoryPath = os.path.join(str(wavPath.split('.')[0].split('split')[0]), 'srt')
    if not os.path.exists(srt_directoryPath):
        os.mkdir(srt_directoryPath)
    # ':' is not allowed in file names, so replace it
    file_name = filename.replace(':', '_')
    srt_file = os.path.join(srt_directoryPath, f'{file_name}.srt')
    # append this segment's subtitles to the srt file
    with open(srt_file, 'a+') as f:
        count = 1
        for i in range(len(sentences)):
            # subtitle index
            f.write(str(count) + '\n')
            # format_time converts seconds to an SRT timestamp (helper sketched below)
            start = format_time(relative_times[i][0])
            end = format_time(relative_times[i][1])
            # time range
            f.write(start + ' --> ' + end + '\n')
            # subtitle text
            f.write(sentences[i] + '\n\n')
            count += 1
    return times, result, srt_file
```
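format_time is a helper from the same script that is not shown here; a minimal sketch that turns a time in seconds into the SRT HH:MM:SS,mmm timestamp format could look like this:

```
def format_time(seconds):
    """Format seconds as an SRT timestamp, e.g. 75.5 -> '00:01:15,500'."""
    ms = int(round(seconds * 1000))
    h, ms = divmod(ms, 3600 * 1000)
    m, ms = divmod(ms, 60 * 1000)
    s, ms = divmod(ms, 1000)
    return f'{h:02d}:{m:02d}:{s:02d},{ms:03d}'
```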
- Burn the subtitles into the MP4 file

```
import os
import subprocess

# step 3: merge the srt file into the mp4
def compose_mp4_srt(mp4_path, srt_path, delete_source_mp4=False):
    final_mp4_path = mp4_path.replace('.mp4', '-srt.mp4')
    if os.path.exists(final_mp4_path):
        os.remove(final_mp4_path)
    # escape ':' so the ffmpeg subtitles filter does not read it as an option separator
    srtPath = srt_path.replace(':', '\\:')
    command = ['ffmpeg',
               '-i', mp4_path,
               '-vf', f"subtitles='{srtPath}'",  # burn the subtitles into the video stream
               '-crf', '28',                     # re-encode with constant rate factor 28
               final_mp4_path
               ]
    completed = subprocess.run(command)
    # a non-zero return code means ffmpeg failed, so return None
    if completed.returncode != 0:
        return None
    # optionally delete the source mp4 (a module-level flag in the original script,
    # made an explicit parameter here)
    if delete_source_mp4:
        try:
            logger.info('deleting the source mp4 file')
            os.remove(mp4_path)
        except Exception:
            logger.info('file does not exist, skipping')
    return final_mp4_path
```
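Putting the steps together, a minimal driver sketch. The paths are hypothetical, and transcribe() is a placeholder for the call that streams a segment to the ASR websocket server and returns the dict with 'result' (text) and 'times' (per-character timestamps) that results2srt consumes:

```
import os

mp4_path = '/data/meeting/record.mp4'     # hypothetical input
wav_path = gen_wav_from_mp4(mp4_path)     # step 1: extract a mono 16 kHz wav
split_dir = qiefen(wav_path, ty='audio')  # step 2: split into voiced segments

srt_file = None
for seg in sorted(os.listdir(split_dir)):
    seg_path = os.path.join(split_dir, seg)
    # segment names look like '<idx>-<start>-<end>.wav'; the start offset places
    # this segment's subtitles on the full-recording timeline
    start_time = seg.split('-')[1]
    results = transcribe(seg_path)  # placeholder for the ASR server call
    _, _, srt_file = results2srt(results, start_time,
                                 os.path.basename(wav_path).split('.')[0], seg_path)

# step 3: burn the accumulated subtitles into the video
if srt_file is not None:
    compose_mp4_srt(mp4_path, srt_file)
```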