# 使用 Whisper 将语音转换为文字的 demo
import torch
from transformers import pipeline
# Build the automatic-speech-recognition pipeline from the Whisper "small" checkpoint.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)
# 从麦克风录制音频并保存为文件(可选)
import soundfile as sf
import numpy as np
import pyaudio
def record_audio(filename, duration=5):
    """Record mono 16 kHz audio from the microphone and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write (opened in ``'w'`` mode).
        duration: Recording length in seconds.

    The input stream, the PyAudio session and the output file are released
    even when reading from the device or writing the file raises.
    """
    fs = 16000  # sample rate in Hz
    channels = 1  # mono
    chunk = 1024  # frames fetched per read call

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=channels,
                        rate=fs, input=True, frames_per_buffer=chunk)
        try:
            print("开始录音...")
            # Each read returns `chunk` frames as raw bytes.
            frames = [stream.read(chunk)
                      for _ in range(int(fs / chunk * duration))]
            print("录音结束.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The stream was opened as paInt16, so the bytes are 16-bit samples.
    samples = np.frombuffer(b''.join(frames), dtype=np.int16)
    with sf.SoundFile(filename, 'w', fs, channels) as wf:
        wf.write(samples)
# Capture five seconds of microphone audio into audio.wav.
record_audio("audio.wav", duration=5)

# Transcribe the recording with the Whisper pipeline and show the text.
result = asr("audio.wav")
recognized_text = result['text']
print("识别结果:", recognized_text)
# 完整 AI 语音助手的 demo;目前尚未解决的问题:助手无法被打断。
import os
from dotenv import load_dotenv
import openai
import soundfile as sf
import torch
from transformers import pipeline
import pyttsx3
import speech_recognition as sr
import pyaudio
import numpy as np
# Load environment variables from the project's .env file
# (expected to define OPENAI_API_KEY).
load_dotenv("VoiceAssistant/.env")

# Read the OpenAI API key from the environment rather than a literal in the
# source, so the check below reflects whether the key is really configured.
api_key = os.getenv("OPENAI_API_KEY", "")
if not api_key:
    print("Error: OPENAI_API_KEY not found in environment variables.")
    # SystemExit does not rely on the `exit` helper injected by the site module.
    raise SystemExit(1)
openai.api_key = api_key
# Text-to-speech engine shared by every spoken reply.
tts_engine = pyttsx3.init()

# Whisper "small" checkpoint wrapped in a speech-recognition pipeline.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# System prompt placed in front of the conversation history on each request.
sysmesg = dict(
    role="system",
    content="你是一名人工智能助手,请帮助我们寻找需要的信息.",
)

# Conversation history; entries are chat messages with "role" and "content".
messages = []
def speech_to_text():
    """Record five seconds of microphone audio and return its transcription.

    The audio is captured as mono 16 kHz 16-bit PCM, written to
    ``audio.wav``, and that file is passed to the module-level ``asr``
    pipeline.

    Returns:
        The ``'text'`` field of the pipeline result.

    The input stream, the PyAudio session and the WAV file are released even
    when recording or writing fails.
    """
    fs = 16000  # sample rate in Hz
    channels = 1  # mono
    chunk = 1024  # frames fetched per read call
    filename = "audio.wav"
    duration = 5  # seconds

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=channels,
                        rate=fs, input=True, frames_per_buffer=chunk)
        try:
            print("开始录音...")
            frames = [stream.read(chunk)
                      for _ in range(int(fs / chunk * duration))]
            print("录音结束.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The stream was opened as paInt16, so the bytes are 16-bit samples.
    samples = np.frombuffer(b''.join(frames), dtype=np.int16)
    with sf.SoundFile(filename, 'w', fs, channels) as wf:
        wf.write(samples)

    # Transcribe the file that was just written (not a separate literal path).
    result = asr(filename)
    text = result['text']
    print(f"Recognized Text: {text}")
    return text
def text_to_speech(text):
    """Log *text*, queue it on the shared TTS engine, and run the engine.

    Args:
        text: The sentence to be spoken.
    """
    print(f"合成语音: {text}")
    engine = tts_engine
    engine.say(text)
    # runAndWait() processes the queued utterance before returning.
    engine.runAndWait()
def generate_text(prompt):
    """Send *prompt* to the chat model and return the reply text.

    The prompt is appended to the module-level ``messages`` history, and the
    request carries the system message plus the last ten history entries.
    On success the assistant reply is appended to the history as well, so
    later requests include both sides of the conversation.

    Args:
        prompt: The user's utterance.

    Returns:
        The model's reply, or an error description string if the request
        failed (the caller speaks whatever is returned).
    """
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface —
    # confirm the installed openai version still provides it.
    messages.append({"role": "user", "content": prompt})
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[sysmesg] + messages[-10:],
            temperature=0.7,
            max_tokens=200,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None
        )
        cont = completion["choices"][0]["message"]["content"]
    except Exception as e:
        # Best-effort boundary: report the failure instead of crashing the loop.
        error_text = f"生成文本时出错: {e}"
        print(error_text)
        return error_text

    # Record the assistant turn; without it the history holds only user turns.
    messages.append({"role": "assistant", "content": cont})
    print(f"Generated Text: {cont}")
    return cont
# Main loop: prompt the user, listen, then either exit or answer.
while True:
    text_to_speech("你好,很高兴为您服务。我在听请讲。")
    user_input = speech_to_text()
    print(f"You said: {user_input}")

    # Transcripts may carry surrounding whitespace or punctuation
    # (e.g. " 再见。"), so normalise before matching the exit commands.
    command = user_input.strip().strip("。.!!??,,、").strip().lower()
    if command in ("退出", "再见"):
        text_to_speech("再见!")
        break
    if not command:
        # Nothing usable was recognised; listen again instead of sending
        # an empty prompt to the model.
        continue

    response = generate_text(user_input)
    print(f"AI says: {response}")
    text_to_speech(response)