FunASR
Installation
Just install from the requirements.txt in the SenseVoice repo and you're set.
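A minimal sketch of that, assuming the upstream SenseVoice repository URL:
git clone https://github.com/FunAudioLLM/SenseVoice.git
cd SenseVoice
pip install -r requirements.txt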
When running it, if you want the models downloaded to a specific directory, point the ModelScope cache there first:
export MODELSCOPE_CACHE=XXX
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc")
res = model.generate(input=fwav, batch_size_s=300)  # fwav: path to the input wav
print(res[0]["text"])
# text = rich_transcription_postprocess(res[0]["text"])
# print(text)
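For reference, the commented-out rich_transcription_postprocess is meant for SenseVoice-style output rather than paraformer-zh. A minimal sketch, assuming the SenseVoiceSmall model id and the generate flags from the SenseVoice README:
# SenseVoice embeds tags like <|zh|><|NEUTRAL|> in the raw text; the
# postprocess call strips and renders them.
sv_model = AutoModel(model="iic/SenseVoiceSmall", vad_model="fsmn-vad")
sv_res = sv_model.generate(input=fwav, language="auto", use_itn=True, batch_size_s=60)
print(rich_transcription_postprocess(sv_res[0]["text"]))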
Preliminary alignment post-processing
paraformer-zh returns one [start, end] timestamp per Chinese character / English word in res[0]["timestamp"]; this helper walks the text and maps each token (and trailing punctuation) back to a timestamp.
import re

def post_hd(result):
    # Extract the text and the per-token timestamps
    text = result[0]['text']
    timestamps = result[0]['timestamp']
    word_timestamps = []
    index = 0       # current timestamp index
    last_index = 0  # index of the last token that consumed a timestamp
    # Match a Chinese character, an English word, or a punctuation mark
    pattern = re.compile(r'([\u4e00-\u9fa5])|([a-zA-Z]+)|([,。!?;:])')
    # Walk through all matches and assign timestamps
    for match in pattern.finditer(text):
        if match.group(1) or match.group(2):  # Chinese character or English word
            assert index < len(timestamps)
            word = match.group(1) if match.group(1) else match.group(2)
            word_timestamps.append([word, timestamps[index]])
            last_index = index  # remember the last index that got a timestamp
            index += 1
        elif match.group(3):  # punctuation: reuse the preceding token's timestamp
            word_timestamps.append([match.group(3), timestamps[last_index]])
    # Print the result
    for word, ts in word_timestamps:
        print(f"{word}: {ts}")
whisper
I forget how I installed it, so I'm just recording the class I put together here. Note that this recognizer is prone to hallucination and its alignment is not very accurate: large-v3, for example, does give some sentence segmentation, but the last character of each sentence gets far too short a duration, which itself shows the alignment is off (a small check for this follows the class below).
from chj.comm.pic import *
import json
import torch
import whisper
import whisperx
import gc

class Warp_whisper:
    # compute_type is unused with openai-whisper; kept for API compatibility
    def __init__(self, language="zh", device="cuda", compute_type="float32", model="large-v2"):
        torch.backends.cudnn.enabled = False
        if not torch.cuda.is_available():
            device = "cpu"
        dmodel = "XXXXX/models/torch/whisper"
        self.asr_model = whisper.load_model(model, device, download_root=dmodel)
        self.txt_converter = None
        if model == "large-v2" and language == "zh":
            # large-v2 tends to emit traditional characters; convert to simplified
            from opencc import OpenCC
            self.txt_converter = OpenCC('t2s')
            self.prompt = None
        else:
            if language == "zh":
                self.prompt = '以下是普通话的句子'  # "The following are Mandarin sentences"
            else:
                self.prompt = None
        self.prompt = None  # NOTE: overrides the branch above, leaving the prompt disabled
        self.language = language
        self.device = device
        self.align_model, self.align_metadata = whisperx.load_align_model(language_code=language, device=device)

    def do_asr_align(self, fjson, fwav):
        audio = whisper.load_audio(fwav)
        result = self.asr_model.transcribe(audio, language=self.language, initial_prompt=self.prompt)
        #assert result["language"] == self.language
        result_segments = result["segments"]
        if self.txt_converter:
            for e in result_segments:
                e['text'] = self.txt_converter.convert(e['text'])
        result = whisperx.align(result_segments, self.align_model, self.align_metadata, audio, self.device, return_char_alignments=False)
        result = result["segments"]
        with open(fjson, "w") as fout:
            json.dump(result, fout, indent=4, ensure_ascii=False)
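A hypothetical sanity check for the "last word too short" symptom mentioned above: whisperx's aligned segments carry per-word "start"/"end" fields, and min_dur is an assumed threshold, not a calibrated one.
import json

def check_tail_durations(fjson, min_dur=0.08):
    # min_dur: assumed cutoff in seconds for flagging suspiciously short tails
    with open(fjson) as f:
        segments = json.load(f)
    for seg in segments:
        words = seg.get("words", [])
        # whisperx omits start/end for words it failed to align
        if not words or "start" not in words[-1]:
            continue
        dur = words[-1]["end"] - words[-1]["start"]
        if dur < min_dur:
            print(f"short tail {words[-1]['word']!r} ({dur:.3f}s) in: {seg['text']}")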
def f2_invoke():
    print("Doing... whisper align")
    basedir = sys.argv[1]
    din = f"{basedir}/audio_feats/wav16k"
    if not os.path.exists(din):
        print("no such dir", din)
        exit(1)
    dout = f"{basedir}/audio_feats/whisper_align"
    # model = "large-v3"
    cls_asr = Warp_whisper()
    chj_file.mkdir(dout)
    for fwav in tqdm(glob.glob(f"{din}/*.wav")):
        nm = chj_file.get_nm(fwav)
        fnm = f"{dout}/{nm}.json"
        if os.path.isfile(fnm):
            continue  # skip files that were already aligned
        cls_asr.do_asr_align(fnm, fwav)
    print("Finished whisper align")