目录
背景:最近要做一个基于linux的语音交互,windows也可以跑通
一、模型选择
sherpa-ncnn
测试了四五个模型,只有这个模型比较好用,中文识别效果较好
这个模型好用
./build/bin/sherpa-ncnn-alsa ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin "hw:1,0"
二、流程
Python API — sherpa 1.3 documentation (k2-fsa.github.io)
按照这个流程安装python API,安装完记得测试是否能跑的通
三、核心代码展示
import sys
try:
import sounddevice as sd
except ImportError as e:
print("Please install sounddevice first. You can use")
print()
print(" pip install sounddevice")
print()
print("to install it")
sys.exit(-1)
import sherpa_ncnn
def create_recognizer():
# Please replace the model files if needed.
# See https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
# for download links.
recognizer = sherpa_ncnn.Recognizer(
tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt",
encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param",
encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin",
decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param",
decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin",
joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param",
joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin",
num_threads=4,
)
return recognizer
def main():
print("Started! Please speak")
recognizer = create_recognizer()
sample_rate = recognizer.sample_rate
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
last_result = ""
with sd.InputStream(
channels=1, dtype="float32", samplerate=sample_rate
) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
samples = samples.reshape(-1)
recognizer.accept_waveform(sample_rate, samples)
result = recognizer.text
if last_result != result:
last_result = result
print(result)
if __name__ == "__main__":
devices = sd.query_devices()
print(devices)
default_input_device_idx = sd.default.device[0]
print(f'Use default device: {devices[default_input_device_idx]["name"]}')
try:
main()