用 OpenAI 和 Hugging Face 的 Whisper 模型实现 AI 语音助手

fc&&fl

已于 2024-09-27 11:42:14 修改

阅读量98

点赞数 1

分类专栏：大模型实战应用 文章标签：whisper 人工智能 xcode

于 2024-09-27 11:28:33 首次发布

本文链接：https://blog.csdn.net/m0_57057282/article/details/142589520

版权

大模型实战应用专栏收录该内容

38 篇文章 0 订阅

订阅专栏

使用 Whisper 进行语音转文字的 demo

import torch
from transformers import pipeline

# Build the automatic-speech-recognition pipeline backed by Whisper small.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# 从麦克风录制音频并保存为文件（可选）
import soundfile as sf
import numpy as np
import pyaudio

def record_audio(filename, duration=5, fs=16000, chunk=1024):
    """Record mono 16-bit audio from a PyAudio input stream into a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz.
        chunk: Maximum number of frames requested from the stream per read.
    """
    channels = 1  # mono
    # Capture exactly fs * duration frames; counting only whole chunks would
    # silently drop the final partial chunk of the requested duration.
    remaining = int(fs * duration)
    frames = []

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=channels,
                        rate=fs, input=True, frames_per_buffer=chunk)
        try:
            print("开始录音...")
            while remaining > 0:
                count = min(chunk, remaining)
                frames.append(stream.read(count))
                remaining -= count
            print("录音结束.")
            stream.stop_stream()
        finally:
            # Close the stream even if a read failed part-way through.
            stream.close()
    finally:
        # Release the PyAudio instance even if opening the stream failed.
        p.terminate()

    # Write the captured 16-bit samples; the context manager closes the file
    # even if the write raises.
    samples = np.frombuffer(b"".join(frames), dtype=np.int16)
    with sf.SoundFile(filename, 'w', fs, channels) as wf:
        wf.write(samples)

# Record five seconds of audio into a WAV file.
AUDIO_PATH = "audio.wav"
record_audio(AUDIO_PATH, duration=5)

# Transcribe the recording with the Whisper pipeline and print the text.
result = asr(AUDIO_PATH)
print("识别结果:", result["text"])

完整的 AI 语音助手 demo，但目前尚未解决的问题是：它不可以被打断。

import os
from dotenv import load_dotenv  
import openai
import soundfile as sf
import torch
from transformers import pipeline
import pyttsx3
import speech_recognition as sr
import pyaudio
import numpy as np
# Load environment variables from the project's .env file.
load_dotenv("VoiceAssistant/.env")

# Set OpenAI API key.
# Read it from the environment: a hard-coded empty string would always trip
# the check below and make the script exit before doing anything.
api_key = os.getenv("OPENAI_API_KEY", "")
if not api_key:
    print("Error: OPENAI_API_KEY not found in environment variables.")
    # SystemExit works even where the interactive `exit` helper is absent.
    raise SystemExit(1)
openai.api_key = api_key

# Text-to-speech engine shared by every call to text_to_speech().
tts_engine = pyttsx3.init()

# Whisper speech-recognition pipeline used to transcribe recordings.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# System prompt placed ahead of the conversation history in each request.
sysmesg = {
    "role": "system",
    "content": "你是一名人工智能助手，请帮助我们寻找需要的信息.",
}
# Conversation history; starts empty.
messages = []
def speech_to_text(duration=5, filename="audio.wav"):
    """Record audio from a PyAudio input stream and return its transcription.

    The recording is written to ``filename`` as a mono 16-bit WAV file and
    that same file is passed to the module-level ``asr`` pipeline.

    Args:
        duration: Length of the recording in seconds.
        filename: Path of the temporary WAV file to write and transcribe.

    Returns:
        The ``'text'`` field of the pipeline result.
    """
    fs = 16000  # sample rate in Hz
    channels = 1  # mono
    chunk = 1024  # frames requested per read
    # Capture exactly fs * duration frames; counting only whole chunks would
    # silently drop the final partial chunk of the requested duration.
    remaining = int(fs * duration)
    frames = []

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=channels,
                        rate=fs, input=True, frames_per_buffer=chunk)
        try:
            print("开始录音...")
            while remaining > 0:
                count = min(chunk, remaining)
                frames.append(stream.read(count))
                remaining -= count
            print("录音结束.")
            stream.stop_stream()
        finally:
            # Close the stream even if a read failed part-way through.
            stream.close()
    finally:
        # Release the PyAudio instance even if opening the stream failed.
        p.terminate()

    # Save as a WAV file; the context manager closes it even if writing fails.
    samples = np.frombuffer(b"".join(frames), dtype=np.int16)
    with sf.SoundFile(filename, 'w', fs, channels) as wf:
        wf.write(samples)

    # Transcribe the file that was just written (not a separate hard-coded path).
    result = asr(filename)
    text = result['text']
    print(f"Recognized Text: {text}")
    return text

def text_to_speech(text):
    """Print ``text`` to the console, then speak it with the shared TTS engine."""
    announcement = f"合成语音: {text}"
    print(announcement)
    # Queue the utterance, then run the engine so it is actually spoken.
    tts_engine.say(text)
    tts_engine.runAndWait()

def generate_text(prompt):
    """Send ``prompt`` to the chat model and return its reply.

    The prompt is appended to the module-level ``messages`` history, and the
    system message plus the last ten history entries are sent to the model.
    On success the reply is appended to the history as well, so later turns
    include the assistant's earlier answers.

    Args:
        prompt: The user's utterance as text.

    Returns:
        The model's reply, or an error description string if the request
        failed (the error is also printed).
    """
    messages.append({"role": "user", "content": prompt})
    request = {
        "model": "gpt-4o-mini",
        "messages": [sysmesg] + messages[-10:],
        "temperature": 0.7,
        "max_tokens": 200,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed ChatCompletion; use the module-level client,
            # which picks up openai.api_key.
            completion = openai.chat.completions.create(**request)
        else:
            completion = openai.ChatCompletion.create(**request)
        # Attribute access works on the response objects of both API generations.
        cont = completion.choices[0].message.content
    except Exception as e:
        # Top-level boundary: report the failure and hand the message back so
        # the assistant can speak it instead of crashing the loop.
        print(f"生成文本时出错: {e}")
        return f"生成文本时出错: {e}"

    # Record the reply so the next request carries both sides of the dialogue.
    messages.append({"role": "assistant", "content": cont})
    print(f"Generated Text: {cont}")
    return cont

# Main loop: prompt, listen, answer; stop when the user says an exit phrase.
_EXIT_PHRASES = {"退出", "再见"}
# Transcriptions may carry surrounding whitespace or punctuation (e.g. " 再见。"),
# which would defeat an exact comparison against the exit phrases.
_TRIM_CHARS = " \t\r\n。．.!！?？,，、~～"

while True:
    text_to_speech("你好，很高兴为您服务。我在听请讲。")

    user_input = speech_to_text()
    print(f"You said: {user_input}")

    command = user_input.strip(_TRIM_CHARS).lower()
    if command in _EXIT_PHRASES:
        text_to_speech("再见！")
        break

    if not command:
        # Nothing usable was transcribed; listen again rather than sending
        # an empty prompt to the model.
        continue

    response = generate_text(user_input)
    print(f"AI says: {response}")

    text_to_speech(response)