文档结构说明
# 初始目录结构
├── data # 存放数据
│ ├── vad_vedio # 存放切割后的wav数据
│ └── vedio # 存放mp4需要添加字幕的文件
│ └── wav_vedio # 转换wav格式的数据
├── decoder# 识别模块
├── model # 模型
│ ├── 20210618_u2pp_conformer_exp # 识别模型
│ │ ├── final.pt
│ │ ├── global_cmvn
│ │ ├── train.yaml
│ │ └── words.txt
│ └── vad_model # 切割模型
├── result # 字幕文件位置
└── run.sh # 整体脚本
└── vad # vad模块
├── __init__.py
├── utils
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ └── utils_vad.cpython-38.pyc
│ └── utils_vad.py
└── vad.py
# 结束文档结构
├── Readme.md
├── data
│ ├── vad_vedio
│ │ ├── cs_106187.5_ms_112031.25_ms_.wav
│ │ ├── cs_114062.5_ms_127125.0_ms_.wav
│ │ ...
│ ├── vedio
│ │ └── cs.mp4
│ └── wav_vedio
│ └── cs.wav
├── decoder
│ ├── data_list
│ │ └── cs_data.list
│ ├── data_list.sh
│ ├── decoder.sh
│ ├── mkass.py
│ ├── recognize.py
│ ├── text
│ │ ├── cs_ass_text
│ │ └── cs_text
│ └── wenet -> /mnt/f/data/wenet/wenet/
├── model
│ ├── 20210618_u2pp_conformer_exp
│ │ ├── final.pt
│ │ ├── global_cmvn
│ │ ├── train.yaml
│ │ └── words.txt
│ └── vad_model
│ └── model.jit
├── result
│ └── cs.ass
├── run.sh
└── vad
├── __init__.py
├── utils
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ └── utils_vad.cpython-38.pyc
│ └── utils_vad.py
└── vad.py
wenet环境配置
获取视频并转音频
安装处理视频工具
安装ffmpeg:https://blog.csdn.net/zhouyj6516/article/details/107416209
ffmpeg参数:https://www.cnblogs.com/mwl523/p/10856633.html
mp4转wav脚本
ffmpeg -i cs.mp4 -vn -ar 16000 -ac 1 -ab 192 -f wav cs.wav
脚本
#!/usr/bin/bash
# Batch-convert every mp4 under $mp4_dir to 16 kHz mono wav, then run VAD on each.
# Must be run from the conda "base" environment (activated below).

# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/root/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
  eval "$__conda_setup"
else
  if [ -f "/root/miniconda3/etc/profile.d/conda.sh" ]; then
    . "/root/miniconda3/etc/profile.d/conda.sh"
  else
    export PATH="/root/miniconda3/bin:$PATH"
  fi
fi
unset __conda_setup
# <<< conda initialize <<<
conda activate base

mp4_dir=/root/data/aizm/result/mp4
step=0
# Glob instead of parsing `ls`; the -e guard skips the literal pattern when nothing matches.
for item_path in "$mp4_dir"/*; do
  [ -e "$item_path" ] || continue
  item=${item_path##*/}
  # Strip the extension to get the bare file name.
  filename=${item%.*}
  echo "$filename"
  ffmpeg -i "$mp4_dir/$item" -vn -ar 16000 -ac 1 -ab 192 -f wav "./wavdata/$filename.wav"
  echo "$item ok"
  file_wav=$filename.wav
  # Hand the converted wav plus its base name to the VAD splitter.
  python3 vad.py "./wavdata/$file_wav" "$filename"
  step=$((step + 1))
done
echo "success all!,总共处理${step}个视频"
注意脚本运行必须在base环境下
通过vad切割音频
使用的github上的silero-vad
需要文件
model.jit
utils_vad.py
代码
from utils.utils_vad import *
import torchaudio
import sys
import os

# Cut one wav file into speech-only segments with the silero-vad jit model and
# save each as <name>_<start-ms>_ms_<end-ms>_ms_.wav under ./data/vad_vedio/<name>/.
# Usage: python3 vad.py <input.wav> [segment-dir-name]
if len(sys.argv) == 1:
    print("Usage: python3 {} 0001.wav 0001".format(sys.argv[0]))
    print("requires at least 1 argument, got 0")
    # Exit with a status code instead of `assert False` (asserts vanish under -O).
    sys.exit(1)

model_path = "./model/vad_model/model.jit"
wav_path = sys.argv[1]
file_pre_name = ""
if len(sys.argv) == 3:
    file_pre_name = sys.argv[2]
    print("处理文件", file_pre_name)

# Load the audio (expected to be 16 kHz mono, produced by the ffmpeg step).
wav, sr = torchaudio.load(wav_path)
model = init_jit_model(model_path=model_path)
# Fixed-threshold VAD; trig_sum / neg_trig_sum were tuned empirically.
speeches = get_speech_ts(wav[0], model, trig_sum=0.1, neg_trig_sum=0.07,
                         min_speech_samples=500, min_silence_samples=400)

save_path = "./data/vad_vedio/%s" % (file_pre_name)
# run.sh normally pre-creates this directory; create it ourselves when run standalone.
os.makedirs(save_path, exist_ok=True)
for item in speeches:
    start = item['start']
    end = item['end']
    # Sample index -> milliseconds, assuming 16 kHz audio (16 samples per ms).
    start_time = start / 16
    end_time = end / 16
    torchaudio.save(save_path + "/" + file_pre_name + "_" + str(start_time) + "_ms_" + str(end_time) + "_ms_" + ".wav",
                    wav[0][start:end].unsqueeze(0), sr)
采用自适应VAD
from utils.utils_vad import *
import torchaudio
import sys
import os

# Adaptive-threshold variant of the VAD splitter: cut one wav into speech
# segments and save each as <name>_<start-ms>_ms_<end-ms>_ms_.wav.
# Usage: python3 vad.py <input.wav> [segment-dir-name]
if len(sys.argv) == 1:
    print("Usage: python3 {} 0001.wav 0001".format(sys.argv[0]))
    print("requires at least 1 argument, got 0")
    # Exit with a status code instead of `assert False` (asserts vanish under -O).
    sys.exit(1)

model_path = "./model/vad_model/model.jit"
wav_path = sys.argv[1]
file_pre_name = ""
if len(sys.argv) == 3:
    file_pre_name = sys.argv[2]
    print("处理文件", file_pre_name)

# Load the audio (expected to be 16 kHz mono, produced by the ffmpeg step).
wav, sr = torchaudio.load(wav_path)
model = init_jit_model(model_path=model_path)
# Adaptive VAD: thresholds are derived from the audio itself; only the window
# and minimum-duration parameters (in samples) are fixed here.
speeches = get_speech_ts_adaptive(wav[0], model,
                                  batch_size=200,
                                  step=500,
                                  num_samples_per_window=4000,
                                  min_speech_samples=10000,
                                  min_silence_samples=2000)

save_path = "./data/vad_vedio/%s" % (file_pre_name)
# run.sh normally pre-creates this directory; create it ourselves when run standalone.
os.makedirs(save_path, exist_ok=True)
for item in speeches:
    start = item['start']
    end = item['end']
    # Sample index -> milliseconds, assuming 16 kHz audio (16 samples per ms).
    start_time = start / 16
    end_time = end / 16
    torchaudio.save(save_path + "/" + file_pre_name + "_" + str(start_time) + "_ms_" + str(end_time) + "_ms_" + ".wav",
                    wav[0][start:end].unsqueeze(0), sr)
参数调整
from utils.utils_vad import *
import torchaudio
import sys
import os

# Final parameter tuning of the adaptive VAD splitter: shorter windows and
# smaller minimum durations for finer-grained subtitle segments.
# Usage: python3 vad.py <input.wav> [segment-dir-name]
if len(sys.argv) == 1:
    print("Usage: python3 {} 0001.wav 0001".format(sys.argv[0]))
    print("requires at least 1 argument, got 0")
    # Exit with a status code instead of `assert False` (asserts vanish under -O).
    sys.exit(1)

model_path = "./model/vad_model/model.jit"
wav_path = sys.argv[1]
file_pre_name = ""
if len(sys.argv) == 3:
    file_pre_name = sys.argv[2]
    print("处理文件", file_pre_name)

# Load the audio (expected to be 16 kHz mono, produced by the ffmpeg step).
wav, sr = torchaudio.load(wav_path)
model = init_jit_model(model_path=model_path)
# Tuned adaptive VAD. Earlier settings kept for reference:
#   step=500, num_samples_per_window=4000, min_speech_samples=10000, min_silence_samples=2000
speeches = get_speech_ts_adaptive(wav[0], model,
                                  batch_size=200,
                                  step=200,
                                  num_samples_per_window=2000,
                                  min_speech_samples=500,
                                  min_silence_samples=400)

save_path = "./data/vad_vedio/%s" % (file_pre_name)
# run.sh normally pre-creates this directory; create it ourselves when run standalone.
os.makedirs(save_path, exist_ok=True)
for item in speeches:
    start = item['start']
    end = item['end']
    # Sample index -> milliseconds, assuming 16 kHz audio (16 samples per ms).
    start_time = start / 16
    end_time = end / 16
    torchaudio.save(save_path + "/" + file_pre_name + "_" + str(start_time) + "_ms_" + str(end_time) + "_ms_" + ".wav",
                    wav[0][start:end].unsqueeze(0), sr)
保存视频格式:start-time_ms_end-time_ms.wav
再次处理音频
ffmpeg -i x.wav -vn -ar 16000 -ac 1 -ab 192 -f wav text.wav
通过wenet解码
├── data_list
│ └── cs_data.list
├── data_list.sh
├── decoder.sh
├── mkass.py
├── recognize.py
├── text
│ └── cs_ass_text
└── wenet -> /home/asr/data/wenet/wenet # wenet工具的软连接
制作data.list
{"key": "BAC009S0764W0121", "wav": "/BAC009S0764W0121.wav", "txt": "甚至出现交的情况"}
# Build a wenet data.list (one JSON object per line) for all VAD segments of one video.
# Usage: sh data_list.sh <video-name>
# Output line shape: {"key":"<segment basename>","wav":"<wav path>","txt":""}
filename=$1
wav=/mnt/f/data/AIzimu/data/vad_vedio/$filename
# Sort segments numerically by start time (field 2 of the "_"-separated file name).
# Word-splitting of $wav_list below is intentional: segment names contain no spaces.
wav_list=$(ls "$wav" | sort -t "_" -k 2 -n)
for item in $wav_list; do
  echo "{\"key\":\"${item%.*}\",\"wav\":\"${wav}/${item}\",\"txt\":\"\"}"
done
通过预训练模型解码
需要文件
# words.txt
<blank> 0
<unk> 1
' 2
( 3
) 4
A 5
𫖯 6
...
# train.yaml
训练时候的配置文件
# global_cmvn
均值方差归一化
# 文件夹
tools -> /root/data/aizm/wenet/tools
#
wenet -> /root/data/aizm/wenet/wenet
train.yaml
accum_grad: 1
cmvn_file: ../model/20210815_unified_conformer_exp/global_cmvn # global_cmvn 路径
dataset_conf:
filter_conf:
得到解码结果
CS_781_ms_6906_ms_ 好的那么我们就开始录制这个视频
CS_7406_ms_11156_ms_ 首先的话按照上捷克的所说
CS_11093_ms_15187_ms_ 我就念这一段的话九K
CS_15968_ms_17031_ms_ 好吧
CS_18218_ms_24656_ms_ 住房和城乡建设部会配合有关部门
CS_25375_ms_28906_ms_ 强化地方调控主体责任
CS_30062_ms_36875_ms_ 及时调整了限购差别化信贷等方面的政策
CS_36968_ms_47281_ms_ 支持居民合理的住房消费总体上保持了房地产市场平稳运行
CS_49218_ms_56031_ms_ 住建部新闻发言人近日表示
...
制作ASS字幕文件
mkass.py
import os
import sys
# print(sys.argv[1])
def get_time(time):
    """Convert a millisecond timestamp (str or number) to an ASS time string.

    ASS Dialogue lines expect H:MM:SS.cc — minutes and seconds zero-padded to
    two digits. The previous "%d:%d:%.2f" format emitted e.g. "0:1:5.25",
    which many renderers reject or misparse.
    """
    time_cost = float(time) / 1000  # ms -> seconds
    h = time_cost // 3600
    m = time_cost % 3600 // 60
    s = time_cost % 60
    # %05.2f pads seconds to "SS.cc" (5 chars total including the dot).
    return "%d:%02d:%05.2f" % (h, m, s)
# Path to the "<start-ms> <end-ms>  <text>" file produced by the awk step in run.sh.
path = sys.argv[1]
# Static ASS header: script info, a single "Original" style, and the Events format line.
# NOTE(review): "&Hntrepansra" is not a valid ASS colour value — it looks like a
# garbled OutlineColour; confirm against the original style definition.
header = """
[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
WrapStyle: 2
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Original,FandolHei,93,&H00F8F8FF,&Hffffffff,&Hntrepansra,&H000000,0,0,0,0,100,100,0.0,0,0,0,1,2,10,10,10,0
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
print(header)
with open(file=path,mode='r',encoding='utf-8') as file:
    for line in file.readlines():
        # The upstream awk command prints "start end <text>" where the text field
        # itself begins with a space left over from the segment file name, so
        # splitting on " " yields ['start', 'end', '', 'text\n'] — hence the
        # subtitle text lives at index 3, with index 2 being an empty string.
        line = line.split(" ")
        start,end,content = line[0],line[1],line[3].strip("\n")
        start = str(get_time(start))
        end = str(get_time(end))
        # Skip segments the recognizer transcribed as empty.
        if content == "":
            continue
        # Layer 3 dialogue using the "Original" style; 78px margins on all sides.
        print("Dialogue: 3,%s,%s,Original,,78,78,78,,%s"%(start,end,content))
获取字幕内容
`cat text | awk -F '_' '{ print $2,$4,$6 }' > text1 `
python3 mkass.py text1 > cs.ass
echo "字幕生成完成"
生成字幕文件。
代码汇总
#!/usr/bin/bash
# End-to-end pipeline: mp4 -> wav -> VAD segments -> wenet decoding -> ASS subtitles.
# Note: the original text had every ${filename} reference garbled into "$(unknown)";
# they are restored here.

# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/asr/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
  eval "$__conda_setup"
else
  if [ -f "/home/asr/anaconda3/etc/profile.d/conda.sh" ]; then
    . "/home/asr/anaconda3/etc/profile.d/conda.sh"
  else
    export PATH="/home/asr/anaconda3/bin:$PATH"
  fi
fi
unset __conda_setup
# <<< conda initialize <<<
conda activate base

# Project root = directory the script is launched from.
root=$(pwd)
# Input mp4 directory.
mp4_dir=$root/data/vedio
# Converted wav output directory.
mp42wav_dir=$root/data/wav_vedio
# Decoder working directory and its sub-folders.
decode=$root/decoder
data_list_dir=data_list
text=text

echo "初始化"
# Clean previous outputs (the stray backticks that wrapped these lines are removed:
# they executed the command output as a command).
rm -rf data/vad_vedio/*
rm -rf decoder/text
rm -rf decoder/data_list
rm -rf result
echo "初始化完成"

ass=$root/result
mkdir -p "$ass"
mkdir -p "$mp42wav_dir" && echo "create mp42wav flord"
mkdir -p "$decode/$data_list_dir" && echo "create decode/data_list flord"
mkdir -p "$decode/$text" && echo "create text flord"

step=0
# Glob instead of parsing `ls`; the -e guard skips the literal pattern on no match.
for item_path in "$mp4_dir"/*; do
  [ -e "$item_path" ] || continue
  item=${item_path##*/}
  echo "===========================================${step}==========================================="
  _filename=${item%.*}
  echo "正在处理${_filename}文件"
  # Underscores in the name would collide with the "_"-delimited segment naming; strip them.
  filename=$(echo "${_filename}" | sed s/'_'//g)
  echo "文件重命名${filename}"
  ffmpeg -i "${mp4_dir}/$item" -vn -ar 16000 -ac 1 -ab 192 -f wav "$mp42wav_dir/${filename}.wav" -y 1>/dev/null 2>&1
  echo "$item 转wav ok"
  file_wav=${filename}.wav
  # Switch to the wenet environment for VAD + decoding.
  conda activate wenet
  # Pass the wav file and its base name to the VAD splitter.
  mkdir -p "./data/vad_vedio/$filename"
  python3 ./vad/vad.py "./data/wav_vedio/${file_wav}" "$filename"
  echo "vad切割${filename}完成"
  cd "$decode" || exit 1
  # Build the data.list the decoder consumes.
  sh data_list.sh "$filename" > "$decode/${data_list_dir}/${filename}_data.list"
  sh decoder.sh "${data_list_dir}/${filename}_data.list" "${text}/${filename}_text"
  echo "==============解码完成============"
  echo "${text}/${filename}_text"
  # Keep only the start-ms, end-ms and text columns for subtitle building.
  awk -F '_' '{ print $2,$4,$6 }' "$text/${filename}_text" > "${text}/${filename}_ass_text"
  echo "${text}/${filename}_ass_text"
  python3 mkass.py "${text}/${filename}_ass_text" > "${ass}/${filename}.ass" || exit 1
  echo "${filename}字幕生成完成"
  step=$((step + 1))
  cd .. || exit 1
  echo "===========================================${step}==========================================="
  conda activate base
done
echo "success all!,总共处理${step}个视频,生成了${step}个字幕文件"
decoder.sh
# Decode every wav listed in a data.list with the pretrained wenet model.
# Usage: sh decoder.sh <data.list path> <result text file>
# Project root (this script is run from the decoder/ directory).
root=..
data_type=raw
# Pretrained model directory — matches the model/ tree in this project
# (was "20210815_unified_conformer_exp", which does not exist in the tree).
model=${root}/model/20210618_u2pp_conformer_exp
dict=${model}/words.txt
train_set=train
decode_checkpoint=${model}/final.pt
# Empty means full-utterance (non-streaming) decoding.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
# $1: data.list with {key,wav,txt} JSON lines; $2: output transcript file.
data_list_dir=$1
text=$root/decoder
python recognize.py \
  --mode "attention_rescoring" \
  --config "$model/train.yaml" \
  --data_type "$data_type" \
  --test_data "${data_list_dir}" \
  --checkpoint "$decode_checkpoint" \
  --beam_size 10 \
  --batch_size 1 \
  --penalty 0.0 \
  --dict "$dict" \
  --ctc_weight "$ctc_weight" \
  --reverse_weight "$reverse_weight" \
  --result_file "$2" \
  ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}