高级RAG评估
本教程介绍了如何对高级RAG模型进行评估。
这可以非常有助于确定适用于您应用的最佳RAG方法。
# Install the required libraries
! pip install -U langchain openai chromadb langchain-experimental # latest version required for multimodal support
# Pin to 0.10.19 due to an ongoing bug in more recent versions
! pip install "unstructured[all-docs]==0.10.19" pillow pydantic lxml pillow matplotlib tiktoken open_clip_torch torch
数据加载
让我们看一个关于消费者价格指数(CPI)的示例白皮书,它提供了表格、文本和图像的混合。
选项1:加载文本
# 设置路径
path = "/Users/rlm/Desktop/cpi/"
# 加载PDF
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(path + "cpi.pdf")
pdf_pages = loader.load()
# 分割文本
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits_pypdf = text_splitter.split_documents(pdf_pages)
all_splits_pypdf_texts = [d.page_content for d in all_splits_pypdf]
选项2:加载文本、表格、图像
from unstructured.partition.pdf import partition_pdf

# Extract images and tables, and chunk the remaining text by section titles
raw_pdf_elements = partition_pdf(
    filename=path + "cpi.pdf",
    extract_images_in_pdf=True,          # write detected images to image_output_dir_path
    infer_table_structure=True,          # keep table structure rather than flat text
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

# Categorize extracted elements by type; class-name string check avoids
# importing the unstructured element classes directly.
tables = []
texts = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tables.append(str(element))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        texts.append(str(element))
存储
选项1:嵌入并存储文本块
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Option 1 baseline: embed the raw PyPDF text chunks directly
baseline = Chroma.from_texts(
    texts=all_splits_pypdf_texts,
    collection_name="baseline",
    embedding=OpenAIEmbeddings(),
)
retriever_baseline = baseline.as_retriever()
选项2:多向量检索器
文本摘要
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Summarization prompt (runtime string — kept verbatim)
prompt_text = """您是一位助手,任务是对表格和文本进行摘要,以便于检索。
这些摘要将被嵌入并用于检索原始文本或表格元素。
请给出一个简洁的摘要,该摘要经过优化,便于检索。表格或文本:{element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: pass the element through -> prompt -> GPT-4 -> plain string
model = ChatOpenAI(temperature=0, model="gpt-4")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Apply to texts (up to 5 concurrent API calls)
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
# Apply to tables
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
图像摘要
# 图像摘要链
import base64
import io
import os
from io import BytesIO
from langchain_core.messages import HumanMessage
from PIL import Image
def encode_image(image_path):
    """Read the file at *image_path* and return its contents as a base64 string."""
    with open(image_path, "rb") as image_fh:
        raw_bytes = image_fh.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
def image_summarize(img_base64, prompt):
    """Summarize one image with a multimodal chat model.

    Args:
        img_base64: base64-encoded JPEG bytes of the image.
        prompt: instruction text telling the model how to summarize.

    Returns:
        The model's text summary of the image.

    NOTE(review): the original body was truncated — it constructed the chat
    model but never invoked it and returned nothing, so the loop below would
    have collected only None values. Reconstructed per the standard LangChain
    multimodal message format (text part + base64 image_url part).
    """
    chat = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024)
    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content
# base64-encoded images, parallel to image_summaries below
img_base64_list = []
# Text summaries of each image, used later for retrieval indexing
image_summaries = []

# Image-summarization prompt (runtime string — kept verbatim)
prompt = """您是一位助手,任务是对图像进行摘要,以便于检索。
这些摘要将被嵌入并用于检索原始图像。
请给出一个简洁的摘要,该摘要经过优化,便于检索。"""

# Summarize every .jpg that partition_pdf wrote into `path`
# (sorted so list order is deterministic across runs)
for img_file in sorted(os.listdir(path)):
    if img_file.endswith(".jpg"):
        img_path = os.path.join(path, img_file)
        base64_image = encode_image(img_path)
        img_base64_list.append(base64_image)
        image_summaries.append(image_summarize(base64_image, prompt))
选项2a:带原始图像的多向量检索器
将图像返回给LLM以进行答案合成
import uuid
from base64 import b64decode
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """Build a MultiVectorRetriever that indexes summaries but returns raw content.

    Summaries are embedded into *vectorstore* for similarity search, while the
    corresponding raw content (text chunks, table strings, or base64 images)
    lives in an in-memory docstore keyed by a shared UUID, so retrieval
    returns the originals rather than the summaries.

    Args:
        vectorstore: an initialized vectorstore (e.g. Chroma) for the summaries.
        text_summaries / texts: summaries and raw text chunks.
        table_summaries / tables: summaries and raw table strings.
        image_summaries / images: summaries and raw content for images.

    Returns:
        The populated MultiVectorRetriever.

    NOTE(review): the original body was truncated after initializing the
    store — it never created the retriever, never indexed anything, and
    returned None (breaking both call sites). Reconstructed accordingly.
    """
    # Storage layer for the raw content
    store = InMemoryStore()
    id_key = "doc_id"

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    def _add_documents(doc_summaries, doc_contents):
        # One shared UUID links each summary (vectorstore) to its raw content (docstore).
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Skip any modality that produced no summaries
    if text_summaries:
        _add_documents(text_summaries, texts)
    if table_summaries:
        _add_documents(table_summaries, tables)
    if image_summaries:
        _add_documents(image_summaries, images)

    return retriever
# Vectorstore used to index the summaries
multi_vector_img = Chroma(
    collection_name="multi_vector_img", embedding_function=OpenAIEmbeddings()
)

# Option 2a retriever: raw base64 images are stored, so retrieval hands
# actual images back to the LLM for answer synthesis
retriever_multi_vector_img = create_multi_vector_retriever(
    multi_vector_img,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    img_base64_list,
)
测试检索
# Retrieval sanity check: a CPI question whose answer lives in a chart/table
query = "CPI有多少百分比用于住房,与医疗护理、服装和其他商品及服务的总百分比相比如何?"
# Appended to queries so retrieval is biased toward image/table content
suffix_for_images = "包括任何饼图、图表或表格。"
docs = retriever_multi_vector_img.invoke(query + suffix_for_images)
from IPython.display import HTML, display


def plt_img_base64(img_base64):
    """Render a base64-encoded image inline in the notebook via an HTML <img> tag.

    Args:
        img_base64: base64-encoded JPEG bytes of the image.

    NOTE(review): the original f-string was empty (f'') and nothing was ever
    displayed — the markup was lost in extraction. Restored the <img> tag and
    the display call.
    """
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    display(HTML(image_html))


# Show the second retrieved document (expected to be an image for this query)
plt_img_base64(docs[1])
选项2b:带图像摘要的多向量检索器
将图像摘要的文本返回给LLM以进行答案合成
# Vectorstore used to index the summaries
multi_vector_text = Chroma(
    collection_name="multi_vector_text", embedding_function=OpenAIEmbeddings()
)

# Option 2b retriever: image_summaries is passed as BOTH the indexed summaries
# and the stored content, so retrieval returns summary text instead of images
retriever_multi_vector_img_summary = create_multi_vector_retriever(
    multi_vector_text,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    image_summaries,
)
选项3:多模态嵌入
from langchain_experimental.open_clip import OpenCLIPEmbeddings

# Option 3: Chroma collection backed by multimodal (OpenCLIP) embeddings,
# which embed images and text into the same vector space
multimodal_embd = Chroma(
    collection_name="multimodal_embd", embedding_function=OpenCLIPEmbeddings()
)

# Collect image file URIs from the working directory (sorted for determinism)
image_uris = sorted(
    [
        os.path.join(path, image_name)
        for image_name in os.listdir(path)
        if image_name.endswith(".jpg")
    ]
)

# Add images and documents, guarding against empty inputs
if image_uris:
    multimodal_embd.add_images(uris=image_uris)
if texts:
    multimodal_embd.add_texts(texts=texts)
if tables:
    multimodal_embd.add_texts(texts=tables)

# Retriever over the shared image/text embedding space
retriever_multimodal_embd = multimodal_embd.as_retriever()
RAG
文本管道
from operator import itemgetter

from langchain_core.runnables import RunnablePassthrough

# Text-RAG prompt template (runtime string — kept verbatim)
template = """根据以下上下文回答问题,上下文可以包括文本和表格:
{context}
问题:{question}
"""
rag_prompt_text = ChatPromptTemplate.from_template(template)
# 构建
def text_rag_chain(retriever):
    """Build a text-only RAG chain over the given retriever.

    The chain retrieves context for the incoming question, fills the
    `rag_prompt_text` template, calls GPT-4, and parses the reply to a string.

    Args:
        retriever: any LangChain retriever producing the {context} documents.

    Returns:
        A runnable chain mapping a question string to an answer string.

    NOTE(review): the original body was truncated — only the docstring
    survived, so the function returned None and every text chain below would
    have failed. Reconstructed as the standard LCEL retrieval chain.
    """
    model = ChatOpenAI(temperature=0, model="gpt-4")
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | rag_prompt_text
        | model
        | StrOutputParser()
    )
    return chain
多模态管道
import re
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
def looks_like_base64(sb):
    """Heuristically check whether *sb* is shaped like a base64-encoded string."""
    # One or more base64 alphabet characters, optionally padded with up to two '='.
    b64_shape = "^[A-Za-z0-9+/]+[=]{0,2}$"
    return re.match(b64_shape, sb) is not None
def is_image_data(b64data):
    """Return True when *b64data* decodes to bytes starting with a known image magic number."""
    # Magic-byte prefixes: jpg, png, gif, webp (RIFF container)
    signatures = (
        b"\xff\xd8\xff",
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
        b"\x47\x49\x46\x38",
        b"\x52\x49\x46\x46",
    )
    try:
        # Only the first 8 bytes are needed to match the longest signature.
        header = base64.b64decode(b64data)[:8]
    except Exception:
        # Not decodable as base64 — definitely not an image payload.
        return False
    return header.startswith(signatures)
def split_image_text_types(docs):
    """Partition retrieved docs into base64 image payloads and plain text strings."""
    images = []
    plain_texts = []
    for item in docs:
        # Unwrap Document objects down to their raw page content.
        content = item.page_content if isinstance(item, Document) else item
        if looks_like_base64(content) and is_image_data(content):
            images.append(content)
        else:
            plain_texts.append(content)
    return {"images": images, "texts": plain_texts}
def img_prompt_func(data_dict):
    """Assemble a single multimodal HumanMessage from retrieved images and texts.

    Args:
        data_dict: {"context": {"images": [...base64...], "texts": [...]},
                    "question": str} as produced by split_image_text_types.

    Returns:
        A one-element list containing a HumanMessage whose content mixes
        image_url parts (one per image) and a final text part.

    NOTE(review): the original body was truncated after `messages = []`;
    reconstructed using the standard multimodal message format.
    """
    # Join the context texts into a single string
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []
    # One image_url part per retrieved base64 image
    for image in data_dict["context"]["images"]:
        messages.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
        )
    # Text part carries the question plus the concatenated text/table context
    messages.append(
        {
            "type": "text",
            "text": (
                "Answer the question based only on the provided context, "
                "which can include text, tables, and image(s).\n"
                f"Question: {data_dict['question']}\n\n"
                "Text and / or tables:\n"
                f"{formatted_texts}"
            ),
        }
    )
    return [HumanMessage(content=messages)]


def multi_modal_rag_chain(retriever):
    """Multimodal RAG chain: retrieve, split images from text, prompt GPT-4V.

    NOTE(review): this function is called below (chain_multimodal_mv_img /
    chain_multimodal_embd) but its definition was missing from the source;
    reconstructed here so the file runs.
    """
    model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=1024)
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )
    return chain
构建RAG管道
# Text-only RAG chains (baseline chunks vs. multi-vector with image summaries)
chain_baseline = text_rag_chain(retriever_baseline)
chain_mv_text = text_rag_chain(retriever_multi_vector_img_summary)
# Multimodal RAG chains (raw images vs. CLIP multimodal embeddings)
chain_multimodal_mv_img = multi_modal_rag_chain(retriever_multi_vector_img)
chain_multimodal_embd = multi_modal_rag_chain(retriever_multimodal_embd)
评估集
# Read the evaluation set — expects "Question" and "Answer" columns
# (consumed below when populating the LangSmith dataset)
import pandas as pd

eval_set = pd.read_csv(path + "cpi_eval.csv")
eval_set.head(3)
住房在CPI中占多少百分比?
住房占CPI的42%。
图1
医疗护理和交通占…
(此处省略了部分原文内容)
构建评估
from langsmith import Client

# LangSmith dataset; UUID suffix keeps each run's dataset name unique
client = Client()
dataset_name = f"CPI Eval {str(uuid.uuid4())}"
dataset = client.create_dataset(dataset_name=dataset_name)

# Populate the dataset with one example per Q/A row of the eval CSV
for _, row in eval_set.iterrows():
    # Pull the question and reference answer
    q = row["Question"]
    a = row["Answer"]
    # Register the example against the dataset
    client.create_example(
        inputs={"question": q}, outputs={"answer": a}, dataset_id=dataset.id
    )
from langchain.smith import RunEvalConfig

# Use the built-in "qa" evaluator (LLM-judged answer correctness)
eval_config = RunEvalConfig(
    evaluators=["qa"],
)
def run_eval(chain, run_name, dataset_name):
    """Evaluate *chain* against the named LangSmith dataset under *run_name*.

    Each dataset example's question gets the image-hint suffix appended
    before being handed to the chain, matching the interactive query above.
    """
    def _chain_factory():
        return (lambda example: example["question"] + suffix_for_images) | chain

    client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=_chain_factory,
        evaluation=eval_config,
        project_name=run_name,
    )
# Run the QA evaluation for every chain variant, each under its own
# LangSmith project name derived from the dataset name
for chain, run in zip(
    [chain_baseline, chain_mv_text, chain_multimodal_mv_img, chain_multimodal_embd],
    ["baseline", "mv_text", "mv_img", "mm_embd"],
):
    run_eval(chain, dataset_name + "-" + run, dataset_name)