Having two large language models test or evaluate each other is an emerging evaluation approach, commonly called "model-vs-model evaluation" or "AI-vs-AI evaluation". The approach offers several potential advantages along with some challenges.
Method overview
- Two (or more) AI models ask and answer questions of each other.
- One model plays the questioner or judge role while the other plays the responder.
- The models can take turns in these roles to obtain a more comprehensive evaluation.
Advantages
- Scalability: large numbers of test cases can be generated without the limits of manually built test sets.
- Diversity: an AI may raise questions or scenarios that humans would overlook.
- Continuity: evaluation can run at any time, with no sustained human effort required.
- Depth: an AI can probe a specific topic in depth, beyond the scope of a predefined test set.
Challenges
- Bias propagation: if one model is biased, the bias can skew the evaluation process.
- Hard to verify: human review is still needed to ensure the quality and relevance of the evaluations.
- Interpretability: it can be difficult to understand why an AI made a particular judgment.
- "Echo chamber" effect: the models may reinforce each other's errors or biases.
Implementation approaches
- Q&A evaluation: model A asks a question, model B answers, and model A scores the answer; the roles are then swapped and the process repeated.
- Debate-style evaluation: two models debate a topic while a third model acts as judge, scoring the quality and persuasiveness of the arguments (see the sketch after this list).
- Task-completion evaluation: one model sets a task, the other tries to complete it, and the task setter grades the result.
- Error detection: one model generates text that deliberately contains errors; the other model tries to find and correct them.
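The Q&A pattern is implemented in full in the code example at the end of this article. For the debate pattern, the sketch below shows the overall shape. It assumes only three plain callables that map a prompt string to a reply string (for example, thin wrappers around the LLM classes defined later); debate_evaluation, debater_a, debater_b and judge_fn are illustrative names, not part of any library.

def debate_evaluation(debater_a, debater_b, judge_fn, topic: str, turns: int = 2) -> str:
    """Two models argue opposite sides of a topic; a third model judges the transcript."""
    transcript = []
    for _ in range(turns):
        # Each debater sees the debate so far and adds one argument for its side
        a_says = debater_a(f"Topic: {topic}\nArgue FOR the proposition.\nDebate so far:\n" + "\n".join(transcript))
        transcript.append(f"A: {a_says}")
        b_says = debater_b(f"Topic: {topic}\nArgue AGAINST the proposition.\nDebate so far:\n" + "\n".join(transcript))
        transcript.append(f"B: {b_says}")
    # The judge only sees the transcript and returns a verdict
    return judge_fn("Judge the following debate. State which side argued more convincingly and why.\n\n" + "\n".join(transcript))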
Real-world examples
- OpenAI's InstructGPT used a related idea, having GPT-3 generate data for human-preference training.
- DeepMind's "AI-vs-AI" debate setup has been used to test the reasoning abilities of language models.
Caveats
- Human oversight is still needed to verify the quality and relevance of the evaluations.
- It should be combined with other evaluation methods (human evaluation, benchmark tests, etc.) for a complete picture.
- Evaluation criteria and procedures must be designed carefully to ensure fairness and validity.
Code example
The example below uses Tencent's Hunyuan model and the Yi-34B-Chat model accessed through Baidu's Qianfan (wenxinworkshop) API.
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from datetime import datetime
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms.base import LLM
# Import the custom wrappers for the Hunyuan and Yi models (defined further below; replace xxx with their actual module path)
from xxx import HunyuanAI
from xxx import BaiduYiLLM
def evaluate_response(evaluator: LLM, question: str, response: str) -> str:
evaluation_prompt = PromptTemplate(
input_variables=["question", "response"],
template="请评估以下问题的回答质量。给出1-10的评分,并简要解释原因。\n\n问题:{question}\n\n回答:{response}\n\n评估:"
)
evaluation_chain = LLMChain(llm=evaluator, prompt=evaluation_prompt)
evaluation = evaluation_chain.invoke({"question": question, "response": response})
return evaluation["text"]
def generate_question(questioner: LLM) -> str:
question_prompt = PromptTemplate(
input_variables=[],
template="请生成一个有趣且具有挑战性的问题,这个问题应该能测试AI模型的知识广度和深度。"
)
question_chain = LLMChain(llm=questioner, prompt=question_prompt)
question = question_chain.invoke({})
return question["text"]
def model_vs_model_evaluation(model1: HunyuanAI, model2: BaiduYiLLM, num_rounds: int = 5):
results = []
for i in range(num_rounds):
print(f"\n--- Round {i + 1} ---")
# Model 1 asks, Model 2 answers
question1 = generate_question(model1)
print(f"{model1._llm_type}'s question: {question1}")
answer2 = model2(question1)
print(f"{model2._llm_type}'s answer: {answer2}")
evaluation1 = evaluate_response(model1, question1, answer2)
print(f"{model1._llm_type}'s evaluation of {model2._llm_type}: {evaluation1}")
results.append({
"round": i + 1,
"questioner": model1._llm_type,
"answerer": model2._llm_type,
"question": question1,
"answer": answer2,
"evaluation": evaluation1
})
# Model 2 asks, Model 1 answers
question2 = generate_question(model2)
print(f"{model2._llm_type}'s question: {question2}")
answer1 = model1(question2)
print(f"{model1._llm_type}'s answer: {answer1}")
evaluation2 = evaluate_response(model2, question2, answer1)
print(f"{model2._llm_type}'s evaluation of {model1._llm_type}: {evaluation2}")
results.append({
"round": i + 1,
"questioner": model2._llm_type,
"answerer": model1._llm_type,
"question": question2,
"answer": answer1,
"evaluation": evaluation2
})
return results
def save_results_to_excel(results, filename):
wb = Workbook()
ws = wb.active
ws.title = "Model Evaluation Results"
    # Define cell styles
header_font = Font(name='Arial', size=12, bold=True, color='FFFFFF')
header_fill = PatternFill(start_color='4472C4', end_color='4472C4', fill_type='solid')
centered_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'),
bottom=Side(style='thin'))
    # Write the header row
headers = ["Round", "Questioner", "Answerer", "Question", "Answer", "Evaluation"]
ws.append(headers)
    # Style the header row
for cell in ws[1]:
cell.font = header_font
cell.fill = header_fill
cell.alignment = centered_alignment
cell.border = border
    # Write the data rows and apply styles
for result in results:
row = [
result["round"],
result["questioner"],
result["answerer"],
result["question"],
result["answer"],
result["evaluation"]
]
ws.append(row)
for cell in ws[ws.max_row]:
cell.alignment = Alignment(vertical='center', wrap_text=True)
cell.border = border
    # Auto-size column widths based on the longest value in each column
    for column in ws.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            if cell.value is not None:
                max_length = max(max_length, len(str(cell.value)))
        adjusted_width = min(max_length + 2, 50)  # Cap the width at 50
        ws.column_dimensions[column_letter].width = adjusted_width
    # Set a fixed height for data rows
for row in ws.iter_rows(min_row=2):
ws.row_dimensions[row[0].row].height = 60
    # Freeze the header row
ws.freeze_panes = 'A2'
    # Enable auto-filter on the full data range
ws.auto_filter.ref = ws.dimensions
wb.save(filename)
# Main program: run the evaluation and export the results
if __name__ == "__main__":
    # Create the two LLM instances
model1 = HunyuanAI()
model2 = BaiduYiLLM()
    # Run the model-vs-model evaluation
results = model_vs_model_evaluation(model1, model2, num_rounds=5)
    # Build a filename from the current date and time
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
excel_filename = f"model_evaluation_results_{current_time}.xlsx"
    # Save the results to an Excel file
save_results_to_excel(results, excel_filename)
print(f"\nResults have been saved to {excel_filename}")
    # Print a summary of the results
print("\n--- Evaluation Summary ---")
for result in results:
print(f"Round {result['round']}: {result['questioner']} asked, {result['answerer']} answered")
print(f"Question: {result['question']}")
print(f"Evaluation: {result['evaluation']}")
print()
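The evaluation prompt asks the judge for a score from 1 to 10, but that score comes back embedded in free text. A small post-processing step makes the saved results easier to aggregate. The sketch below is not part of the program above: extract_score and summarize_scores are hypothetical helpers, and the regex simply takes the first number in the 1-10 range from the judge's reply, which assumes the judge states the score plainly.

import re
from statistics import mean
from typing import Dict, List, Optional

def extract_score(evaluation_text: str) -> Optional[float]:
    # Naive parse: take the first integer in the 1-10 range that appears in the reply
    match = re.search(r"(?<!\d)(10|[1-9])(?!\d)", evaluation_text)
    return float(match.group(1)) if match else None

def summarize_scores(results: List[dict]) -> Dict[str, float]:
    # Average the parsable scores per answering model; replies with no score are skipped
    per_model: Dict[str, List[float]] = {}
    for r in results:
        score = extract_score(r["evaluation"])
        if score is not None:
            per_model.setdefault(r["answerer"], []).append(score)
    return {model: mean(scores) for model, scores in per_model.items()}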
HunyuanAI (custom LangChain LLM wrapper for Tencent Hunyuan):
import json
from langchain.llms.base import LLM
from typing import List, Optional, Dict
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import hunyuan_client, models
SecretId = "REPLACE_WITH_YOUR_SECRET_ID"  # Fill in a valid Tencent Cloud SecretId
SecretKey = "REPLACE_WITH_YOUR_SECRET_KEY"  # Fill in a valid Tencent Cloud SecretKey
class HunyuanAI(LLM):
@property
def _llm_type(self) -> str:
return "hunyuan"
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # The prompt could be split into a multi-turn message list with _parse_prompt;
        # here it is sent as a single user message instead.
        # messages = self._parse_prompt(prompt)
try:
messages = [{"Role": "user", "Content": prompt}]
cred = credential.Credential(SecretId, SecretKey)
httpProfile = HttpProfile()
httpProfile.endpoint = "hunyuan.tencentcloudapi.com"
clientProfile = ClientProfile()
clientProfile.httpProfile = httpProfile
client = hunyuan_client.HunyuanClient(cred, "", clientProfile)
req = models.ChatCompletionsRequest()
params = {
"TopP": 1,
"Temperature": 1,
"Model": "hunyuan-pro",
"Messages": messages
}
req.from_json_string(json.dumps(params))
resp = client.ChatCompletions(req)
return resp.Choices[0].Message.Content
except TencentCloudSDKException as err:
raise ValueError(f"Error calling Hunyuan AI: {err}")
def _parse_prompt(self, prompt: str) -> List[Dict[str, str]]:
"""将 LangChain 格式的 prompt 解析为 Hunyuan API 所需的消息格式"""
messages = []
for message in prompt.split('Human: '):
if message.startswith('System: '):
messages.append({"Role": "system", "Content": message[8:]})
elif message:
messages.append({"Role": "user", "Content": message})
return messages
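With real credentials in place, the Hunyuan wrapper can be smoke-tested on its own before being wired into the evaluation loop. A minimal sketch (the prompt text is arbitrary):

if __name__ == "__main__":
    # Assumes the SecretId/SecretKey placeholders above have been replaced with valid values
    llm = HunyuanAI()
    print(llm("Introduce yourself in one sentence."))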
BaiduYiLLM (custom LangChain LLM wrapper for Yi-34B-Chat served via Baidu Qianfan):
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from typing import Any, List, Mapping, Optional
import requests
import json
API_KEY = "REPLACE_WITH_YOUR_API_KEY"  # Fill in a valid Baidu Qianfan API key
SECRET_KEY = "REPLACE_WITH_YOUR_SECRET_KEY"  # Fill in a valid Baidu Qianfan secret key
class BaiduYiLLM(LLM):
@property
def _llm_type(self) -> str:
return "baidu-yi"
def get_access_token(self):
"""
使用 AK,SK 生成鉴权签名(Access Token)
:return: access_token,或是None(如果错误)
"""
url = "https://aip.baidubce.com/oauth/2.0/token"
params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
return str(requests.post(url, params=params).json().get("access_token"))
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> str:
url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/yi_34b_chat?access_token=" + self.get_access_token()
        # The leading "hi"/"Hello" turns are a seed exchange carried over from Baidu's API example
        payload = json.dumps({
"messages": [
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today? If you have any questions or need information on a specific topic, feel free to ask."
},
{
"role": "user",
"content": prompt
}
]
})
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
try:
response_text = response.text
json_response = json.loads(response_text)
result = json_response["result"]
except json.JSONDecodeError:
print("无法解析JSON响应:", response_text)
return ""
except KeyError:
print("JSON响应中没有'result'键:", response_text)
return ""
except Exception as e:
print("发生未知错误:", e)
return ""
else:
return result
@property
def _identifying_params(self) -> Mapping[str, Any]:
"""Get the identifying parameters."""
return {"model": "baidu-yi"}