▌前言
检索增强生成(Retrieval-Augmented Generation, RAG)是一种将信息检索与语言模型生成相结合的技术。它旨在通过引入外部知识来提高大模型的响应质量和准确性。RAG系统的有效性不仅依赖于其生成的答案质量,还与其检索信息的相关性和效率密切相关。因此,
对RAG系统的评估和测试变得尤为重要,故而我们需要一套有效的测试方法及工具来评估系统的各方面表现。
▌RAG系统测试需求
- 准确性:确保生成的答案是基于正确的检索内容。
- 相关性:答案应直接回应用户提问,同时检索到的内容也需高度相关。
- 效率:减少从提问到获取答案所需的时间。
- 用户体验:考虑用户的感受,比如答案满意度、界面友好度等。
▌RAG系统测试方法
- 核心评估指标
- 准确性:通过准确率(正确答案占比)、召回率(找到的正确答案比例)及F1值综合衡量。
- 效率:以响应时间(毫秒级)评估系统速度,并可关注生成结果的多样性。
- 可解释性:通过人工评分或语义分析,判断答案清晰度与用户理解难度。
- 真实性:检测生成内容是否严格基于检索到的上下文,避免“无中生有”(幻觉)。
- 测试方法分类
- 人工标注:专家对比系统回答与标准答案,评估主观指标(如可解释性)。
- 自动化测试:通过脚本批量执行问答,统计客观指标(如准确率、响应时间)。
- 多轮交互模拟:模拟连续对话场景,验证系统在上下文依赖问题中的连贯性(如“之前提到的XX具体指什么?”)。
▌RAG系统测试工具
工具的使用需要NLTK支持,NLTK是一个自然语言处理工具包,它可以完成词频统计,分词,词性标注等常见任务。
下载地址:https://github.com/nltk/nltk_data/tree/gh-pages
Github加速:https://hosts.gitcdn.top/
▌TruLens多维评估
TruLens是一款旨在评估和改进LLM应用的软件工具,它可以集成LangChain或LlamaIndex等LLM开发框架。重点关注三大核心指标。
- 事实一致性(Groundedness):验证回答是否严格基于上下文
- 答案相关性(Answer Relevance):评估回答与问题的匹配度
- 上下文相关性(Context Relevance):检测召回内容与问题的关联性
使用指南
pip install trulens chromadb trulens-providers-openai openai
import chromadb
import requests
from chromadb.utils.embedding_functions import EmbeddingFunction
from trulens.apps.app import instrument
from trulens.core import TruSession, Feedback, Select
from openai import OpenAI
import numpy as np
from trulens.providers.openai import OpenAI as TruOpenAI
from trulens.apps.app import TruApp
from trulens.dashboard import run_dashboard
# Campus/department reference documents used as the RAG knowledge base.
# Fix: "被电子科学与技术学院" was a typo for "微电子科学与技术学院"
# (School of Microelectronics Science and Technology, Zhuhai campus).
gz = "中山大学广州校区,院系设置包括中国语言文学系、历史学系、哲学系、社会学与人类学学院、博雅学院(通识教育部)、岭南学院、外国语学院、法学院、政治与公共事务管理学院、管理学院、马克思主义学院、心理学系、新闻传播学院、信息管理学院、艺术学院、数学学院、物理学院、化学学院、地理科学与规划学院、生命科学学院、材料科学与工程学院、电子与信息工程学院、计算机学院、国家保密学院、网络安全学院、环境科学与工程学院、中山医学院、光华口腔医学院、公共卫生学院、药学院、护理学院、体育部、继续教育学院"
sz = "中山大学深圳校区,院系设置包括医学院、公共卫生学院、药学院、材料学院、生物医学工程学院、电子与通信工程学院、智能工程学院、航空航天学院、农业与生物技术学院、生态学院、集成电路学院、先进制造学院、先进能源学院、网络安全学院、商学院、理学院、柔性电子学院"
zh = "中山大学珠海校区,院系设置包括中国语言文学系、历史学系、哲学系、国际金融学院、国际翻译学院、国际关系学院、旅游学院、数学学院、物理与天文学院、大气科学学院、海洋科学学院、地球科学与工程学院、化学工程与技术学院、海洋工程与技术学院、中法核工程与技术学院、土木工程学院、微电子科学与技术学院、测绘科学与技术学院、人工智能学院、软件工程学院"
# Custom embedding function backed by a locally hosted Jina model.
class JinaEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds texts via a local HTTP endpoint."""

    def __init__(self):
        super().__init__()
        # Local OpenAI-compatible embeddings endpoint and model identifier.
        self.api_url = "http://localhost:1234/v1/embeddings/"
        self.model_id = "jina-embeddings-v2-base-zh"

    def __call__(self, texts):
        """Embed each text with one request; raise on any non-200 response."""
        vectors = []
        for item in texts:
            resp = requests.post(
                self.api_url,
                json={"model": self.model_id, "input": item},
            )
            if resp.status_code != 200:
                raise Exception(f"Embedding请求失败:{resp.text}")
            vectors.append(resp.json()['data'][0]['embedding'])
        return vectors
# In-memory Chroma collection using the custom Jina embedding function.
chroma_client = chromadb.Client()
vector_store = chroma_client.get_or_create_collection(
    name="sysu", embedding_function=JinaEmbeddingFunction()
)
# Chroma's Collection.add is list-oriented: pass ids/documents explicitly
# as lists so each document is stored under its campus id.
vector_store.add(ids=["gz"], documents=[gz])
vector_store.add(ids=["zh"], documents=[zh])
vector_store.add(ids=["sz"], documents=[sz])
# Fresh TruLens session; reset_database clears any previous records.
session = TruSession()
session.reset_database()
model_id = "deepseek/DeepSeek-R1-Distill-Qwen-7B-Q3_K_L"
# Bug fix: the local model endpoint serves plain HTTP (as used elsewhere
# in this article), not HTTPS.
llm_base_url = "http://localhost:1234/v1/"
api_key = "123456"
oai_client = OpenAI(
    base_url=llm_base_url,  # local model endpoint
    api_key=api_key  # local server ignores the key; any value works
)
class RAG:
    """RAG pipeline instrumented for TruLens: retrieve from Chroma, then
    answer with the local LLM."""

    @instrument
    def retrieve(self, query: str) -> list:
        """
        Retrieve relevant text from vector store.
        """
        # Chroma's query_texts is list-oriented; pass the query as a
        # one-element list so results["documents"] is one list per query.
        results = vector_store.query(query_texts=[query], n_results=4)
        # Flatten the list of lists into a single list
        return [doc for sublist in results["documents"] for doc in sublist]

    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Generate answer from context.

        Returns a fallback message when retrieval found nothing or the
        model returned an empty completion.
        """
        # Empty retrieval -> bail out instead of querying the LLM with no context.
        if not context_str:
            return "Sorry, I couldn't find an answer to your question."
        completion = (
            oai_client.chat.completions.create(
                model=model_id,
                temperature=0,  # deterministic output for reproducible evaluation
                messages=[
                    {
                        "role": "user",
                        "content": f"We have provided context information below. \n"
                        f"---------------------\n"
                        f"{context_str}"
                        f"\n---------------------\n"
                        f"First, say hello and that you're happy to help. \n"
                        f"\n---------------------\n"
                        f"Then, given this information, please answer the question: {query}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion if completion else "Did not find an answer."

    @instrument
    def query(self, query: str) -> str:
        """Full pipeline: retrieve context, then generate the final answer."""
        context_str = self.retrieve(query=query)
        return self.generate_completion(query=query, context_str=context_str)
rag = RAG()
# Feedback provider: the same local OpenAI-compatible endpoint scores the answers.
provider = TruOpenAI(
    model_engine=model_id,
    base_url=llm_base_url,
    api_key=api_key
)
# Groundedness: chain-of-thought hallucination check — for each sentence of
# the response, look for supporting evidence in the retrieved context.
f_groundedness = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(Select.RecordCalls.retrieve.rets.collect())
    # .on(Select.RecordCalls.generate_completion.args["context_str"])
    .on_output()
)
# Answer relevance: chain-of-thought scoring of how well the final answer
# addresses the original question (question/answer pair).
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input()
    .on_output()
)
# Context relevance: scores each retrieved chunk against the question, then
# aggregates the per-chunk scores with np.mean.
f_context_relevance = (
    Feedback(
        provider.context_relevance_with_cot_reasons, name="Context Relevance"
    )
    .on_input()
    .on(Select.RecordCalls.retrieve.rets[:])
    .aggregate(np.mean)  # choose a different aggregation method if you wish
)
# Wrap the RAG app so every query is recorded and scored by the feedbacks.
tru_rag = TruApp(
    rag,
    app_name="RAG",
    app_version="base",
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)
with tru_rag as recording:
    rag.query(
        "医学类专业主要分布在哪些校区?"
    )
    rag.query(
        "中山大学是一所综合性大学吗?"
    )
    rag.query("中山大学软件工程学院在哪个校区?")
# NOTE(review): get_records_and_feedback appears to return a
# (records DataFrame, feedback-name list) pair — the hasattr guard below
# skips the plain list element; confirm against the TruLens version in use.
for r in session.get_records_and_feedback():
    if hasattr(r, 'values') and hasattr(r, 'columns'):
        for i in range(0, len(r.values)):
            v = r.values[i]
            for j in range(0, len(v)):
                c = r.columns[j]
                print(f"{c}: {v[j]}", end='\n')
▌RAGAS评估框架
RAGAS(Retrieval-Augmented Generation Assessment)是无监督评估工具,框架通过大语言模型自动评估指标,无需人工标注标准答案即可完成测试。
核心能力:
- 指标全面性:支持无监督评估,主要关注忠实度(Faithfulness)、答案相关性(Answer Relevance)、上下文相关性(Context Relevance)等指标,并支持生成测试数据集。
- 自动化测试:通过LLM自动生成涵盖不同场景的测试数据(如简单问题、推理问题、多上下文问题),降低人工标注成本。
- 灵活集成:与LangChain等框架兼容,支持直接输入RAG三元组(问题、上下文、回答)进行评估。
使用指南
pip install ragas langchain_xinference xinference sacrebleu
摘要生成准确性评估
from langchain_xinference.chat_models import ChatXinference
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic
import asyncio
async def main():
    """Score a single summarisation sample with the RAGAS AspectCritic metric."""
    # Local OpenAI-compatible chat model acting as the evaluator LLM.
    chat_model = ChatOpenAI(
        base_url="http://localhost:1234/v1",
        api_key="123456",
        model="deepseek/DeepSeek-R1-Distill-Qwen-7B-Q3_K_L",
        stream_usage=True,
    )
    # One evaluation sample: the text to summarise and the produced summary.
    sample = SingleTurnSample(
        user_input="summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
        response="The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
    )
    judge = LangchainLLMWrapper(langchain_llm=chat_model)
    critic = AspectCritic(name="summary_accuracy", llm=judge, definition="Verify if the summary is accurate.")
    print(await critic.single_turn_ascore(sample))


if __name__ == "__main__":
    asyncio.run(main())
事实准确性、忠实性、召回率评估
from chromadb.utils.embedding_functions import EmbeddingFunction
import numpy as np
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, AnswerRelevancy
from langchain_xinference.chat_models import ChatXinference
from langchain_community.embeddings import XinferenceEmbeddings
from ragas.llms import LangchainLLMWrapper
# Chat LLM served by a remote Xinference instance (streaming enabled).
llm_client = ChatXinference(server_url="http://10.2.20.98:39996/", model_uid="deepseek-r1-distill-qwen", api_key="123456", stream=True)
# Embedding model from the same Xinference server, used for retrieval below.
embeddings = XinferenceEmbeddings(server_url="http://10.2.20.98:39996/", model_uid="jina-embeddings-v2-base-zh")
class RAG:
    """Toy retrieval pipeline: cosine-similarity top-1 lookup plus an LLM answer."""

    def __init__(self, llm, embeddings):
        self.llm = llm
        self.embeddings = embeddings
        self.doc_embeddings = None  # populated by load_documents
        self.docs = None

    def load_documents(self, documents):
        """Remember the documents and precompute their embedding vectors."""
        self.docs = documents
        self.doc_embeddings = self.embeddings.embed_documents(documents)

    def get_most_relevant_docs(self, query):
        """Return a one-element list holding the document most similar to the query."""
        if not self.docs or not self.doc_embeddings:
            raise ValueError("Documents and their embeddings are not loaded.")
        q_vec = self.embeddings.embed_query(query)
        q_norm = np.linalg.norm(q_vec)
        # Cosine similarity between the query and every stored document.
        scores = []
        for d_vec in self.doc_embeddings:
            scores.append(np.dot(q_vec, d_vec) / (q_norm * np.linalg.norm(d_vec)))
        best = int(np.argmax(scores))
        return [self.docs[best]]

    def generate_answer(self, query, relevant_doc):
        """Ask the LLM to answer the query using only the given document(s)."""
        prompt = f"question: {query}\n\nDocuments: {relevant_doc}"
        messages = [
            ("system", "You are a helpful assistant that answers questions based on given documents only."),
            ("human", prompt),
        ]
        return self.llm.invoke(messages).content
# Small fixed corpus for the evaluation demo.
sample_docs = [
    "Albert Einstein proposed the theory of relativity, which transformed our understanding of time, space, and gravity.",
    "Marie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes.",
    "Isaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.",
    "Charles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species'.",
    "Ada Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine."
]
# Initialize RAG instance
rag = RAG(llm_client, embeddings)
# Load documents
rag.load_documents(sample_docs)
# Each query pairs with its reference (gold) answer at the same index.
sample_queries = [
    "Who introduced the theory of relativity?",
    "Who was the first computer programmer?",
    "What did Isaac Newton contribute to science?",
    "Who won two Nobel Prizes for research on radioactivity?",
    "What is the theory of evolution by natural selection?"
]
expected_responses = [
    "Albert Einstein proposed the theory of relativity, which transformed our understanding of time, space, and gravity.",
    "Ada Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine.",
    "Isaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.",
    "Marie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes.",
    "Charles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species'."
]
# Build one RAGAS record per query: question, retrieved contexts, model
# answer, and the reference answer.
dataset = []
for query,reference in zip(sample_queries,expected_responses):
    relevant_docs = rag.get_most_relevant_docs(query)
    response = rag.generate_answer(query, relevant_docs)
    dataset.append(
        {
            "user_input":query,
            "retrieved_contexts":relevant_docs,
            "response":response,
            "reference":reference
        }
    )
evaluation_dataset = EvaluationDataset.from_list(dataset)
evaluator_llm = LangchainLLMWrapper(llm_client)
# 1. Faithfulness: is the generated answer strictly grounded in the retrieved
#    context (guards against hallucination / fabricated content)?
# 2. Context Recall: do the retrieved documents cover all the information
#    needed to answer the question?
# 3. FactualCorrectness: overall factual accuracy of the answer — grounded,
#    self-consistent, and free of factual errors.
# NOTE(review): AnswerRelevancy is imported above but not used in this metric
# list — add it here or drop the import.
result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()], llm=evaluator_llm)
print(result)
测试集生成
rag-tutorial.md是RAG指南,下面的代码将通过这个文件生成针对文件内容的问答测试集。
from langchain_xinference.chat_models import ChatXinference
from langchain_community.chat_models import ChatTongyi
from langchain_community.embeddings import XinferenceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from ragas.testset import TestsetGenerator
from ragas.llms import LangchainLLMWrapper
import sys
# Generator LLM and embedding model, both served by a local Xinference endpoint.
llm_client = ChatXinference(server_url="http://localhost:1234/", model_uid="Qwen2.5-Coder-7B", api_key="123456", stream=True)
embeddings = XinferenceEmbeddings(server_url="http://localhost:1234/", model_uid="jina-embeddings-v2-base-zh")
# Pick the corpus directory for the current OS (WSL mount vs. Windows drive).
path = "/mnt/d/code/ai/Sample_Docs_Markdown/" if sys.platform == 'linux' else 'd:\\code\\ai\\Sample_Docs_Markdown'
# Load only the tutorial markdown file from the corpus directory.
loader = DirectoryLoader(path, glob="**/rag-tutorial.md")
docs = loader.load()
generator_llm = LangchainLLMWrapper(llm_client)
generator = TestsetGenerator(llm=generator_llm, embedding_model=embeddings)
# Synthesize a 10-question test set from the document and export it to Excel.
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
dataset.to_pandas().to_excel("testset_output.xlsx", index=False)
▌总结
对于希望打造高质量知识问答产品的团队来说,理解并掌握合适的测评方法论和技术至关重要。只有通过不断试验和迭代,我们才能让AI更好地服务于用户。