GLM4V + ChatTTS AI 示例
本文将指导你如何一步一步搭建属于自己的《Her》,把文本、声音、图像等模态全部囊括到一起,给大模型加上眼睛和声音。
给自己电脑装一个《Her》,成功复现GPT-4o,流式多模态交互大模型,效果炸裂!
1、环境搭建
(1) 创建环境
# 创建assistant环境
conda create -n assistant python=3.10
# 激活assistant环境
conda activate assistant
(2) 需要一个 GLM_API_KEY,在当前项目根目录下创建一个 .env 文件,文件内容如下:
GLM_API_BASE="https://open.bigmodel.cn/api/paas/v4"
GLM_API_KEY=""
(3) pip install安装以下包
# 下载ChatTTS到当前项目目录下
git clone https://github.com/2noise/ChatTTS.git
# 安装pynini=2.1.5、WeTextProcessing包
conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing
# 升级pip
pip install -U pip
接着使用 pip install 安装以下依赖包:
opencv-python
langchain
langchain-openai
langchain-community
python-dotenv
pyaudio
soundfile
SpeechRecognition
git+https://github.com/openai/whisper.git
omegaconf~=2.3.0
torch~=2.1.0
tqdm
einops
vector_quantize_pytorch
transformers~=4.41.1
vocos
IPython
Pillow
cos-python-sdk-v5
pydub
socksio
2、assistant.py代码
import base64
import os
from datetime import datetime
from threading import Lock, Thread
from time import sleep
from PIL import Image
import io
import cv2
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from speech_recognition import Microphone, Recognizer, UnknownValueError
import torch
import ChatTTS.ChatTTS as ChatTTS
from pydub import AudioSegment
from pydub.playback import play
import numpy as np
import logging
# Set the log level to WARNING so INFO-level messages are suppressed.
logging.basicConfig(level=logging.WARNING)

# TorchDynamo / compile tuning used by ChatTTS inference.
torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision('high')

# Load the ChatTTS model once at startup; reused by Assistant._tts.
chat = ChatTTS.Chat()
chat.load()  # older ChatTTS versions used chat.load_models()

# Read GLM API credentials from the .env file in the project root.
load_dotenv()
GLM_API_BASE = os.getenv("GLM_API_BASE")
GLM_API_KEY = os.getenv("GLM_API_KEY")

# Debug: confirm the environment variables loaded.
# Never echo the raw API key to the console — it is a secret.
print(f"GLM_API_BASE: {GLM_API_BASE}")
print(f"GLM_API_KEY set: {bool(GLM_API_KEY)}")
class WebcamStream:
    """Continuously captures frames from the default webcam on a background
    thread.

    ``read()`` always returns the most recent frame; a lock protects the
    shared frame between the capture thread and callers.
    """

    def __init__(self):
        self.stream = VideoCapture(index=0)
        # Prime self.frame so read() has data before the thread starts.
        _, self.frame = self.stream.read()
        self.running = False
        self.lock = Lock()
        self.thread = None  # created lazily in start(); guards stop()

    def start(self):
        """Start the background capture thread (idempotent). Returns self."""
        if self.running:
            return self
        self.running = True
        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        """Capture loop: keep self.frame fresh until stop() clears the flag."""
        while self.running:
            _, frame = self.stream.read()
            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        With ``encode=True``, return the frame JPEG-encoded and then
        base64-encoded (bytes) instead of the raw BGR ndarray.
        """
        with self.lock:
            frame = self.frame.copy()
        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)
        return frame

    def stop(self):
        """Stop the capture thread and wait for it to finish."""
        self.running = False
        # Guard against stop() being called before start() ever ran.
        if self.thread is not None and self.thread.is_alive():
            self.thread.join()

    def __enter__(self):
        # Context-manager support; pairs with __exit__ below.
        return self.start()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Stop the thread before releasing the camera handle.
        self.stop()
        self.stream.release()
class Assistant:
    """Multimodal assistant: answers a spoken prompt about a webcam image
    via the GLM-4V chat model, then speaks the reply with ChatTTS."""

    def __init__(self, model):
        # Build the langchain inference pipeline once; reused per turn.
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Run one turn.

        prompt: transcribed user speech (str); empty/None turns are skipped.
        image: base64-encoded JPEG bytes (as produced by WebcamStream.read(encode=True)).
        """
        if not prompt:
            return
        print("Prompt:", prompt)
        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            config={"configurable": {"session_id": "unused"}},
        ).strip()
        print("Response:", response)
        if response:
            self._tts(response)

    @staticmethod
    def _tts(response):
        """Synthesize *response* with ChatTTS, save it under ./output, play it."""
        wavs = chat.infer(response)
        # ChatTTS returns float audio nominally in [-1, 1]; clip before
        # scaling so out-of-range samples don't wrap around (overflow)
        # when cast to int16.
        audio_data = (np.clip(wavs[0], -1.0, 1.0) * 32767).astype(np.int16)
        # exist_ok avoids a check-then-create race on the output directory.
        os.makedirs("./output", exist_ok=True)
        file_name = f"./output/audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        audio_segment = AudioSegment(
            audio_data.tobytes(),
            frame_rate=24000,  # ChatTTS output sample rate
            sample_width=audio_data.dtype.itemsize,
            channels=1,
        )
        # Persist to a timestamped WAV file, then play it back.
        audio_segment.export(file_name, format="wav")
        play(audio_segment)

    def _save_image(self, image):
        """Save a BGR ndarray *image* as a timestamped PNG and return its path."""
        os.makedirs("./images", exist_ok=True)
        file_name = f"./images/image_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        # cv2 frames are BGR; convert to RGB before handing to Pillow.
        img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        img.save(file_name)
        return file_name

    @staticmethod
    def _create_inference_chain(model):
        """Build the prompt | model | parser chain with in-memory chat history."""
        SYSTEM_PROMPT = """
        你是一个有眼睛的助手,我会发送图片给你,让你看到周围的景象,将使用用户提供的聊天历史和图片来回答其问题。
        不要提到“图片”这个单词,直接描述图片的内容,不要使用emojis,不要问用户问题。
        保持友好的态度。展示一些个性。不要太正式。
        用中文回复
        """
        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )
        chain = prompt_template | model | StrOutputParser()
        # Single in-memory history shared across the (single) session.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
# --- Wiring: webcam stream, GLM-4V model, assistant -----------------------
webcam_stream = WebcamStream().start()
model = ChatOpenAI(model="glm-4v", base_url=GLM_API_BASE, api_key=GLM_API_KEY)
assistant = Assistant(model)


def audio_callback(recognizer, audio):
    """Background-listener callback: transcribe speech with Whisper and
    answer it using the current webcam frame."""
    try:
        prompt = recognizer.recognize_whisper(audio, model="small", language="chinese")
        assistant.answer(prompt, webcam_stream.read(encode=True))
    except UnknownValueError:
        print("There was an error processing the audio.")


recognizer = Recognizer()
microphone = Microphone()
with microphone as source:
    # Calibrate the energy threshold against 2 seconds of ambient noise.
    recognizer.adjust_for_ambient_noise(source, duration=2)

stop_listening = recognizer.listen_in_background(microphone, audio_callback)

try:
    # Show the live feed until ESC (27) or 'q' is pressed.
    while True:
        cv2.imshow("webcam", webcam_stream.read())
        if cv2.waitKey(1) in [27, ord("q")]:
            break
finally:
    # Always release the camera, windows and background listener,
    # even if the display loop raises.
    webcam_stream.stop()
    cv2.destroyAllWindows()
    stop_listening(wait_for_stop=False)
3、运行程序
python assistant.py