Having two large language models test or evaluate each other is an emerging evaluation approach, commonly called "model-vs-model evaluation" or "AI-vs-AI evaluation". The approach offers several potential advantages along with some challenges.
Method overview
- Two (or more) AI models ask and answer questions of each other.
- One model plays the questioner or judge role while the other plays the responder.
- The models can take turns in these roles to obtain a more comprehensive evaluation.
Advantages
- Scalability: large numbers of test cases can be generated without the limits of manually built test sets.
- Diversity: an AI may raise questions or scenarios that humans would overlook.
- Continuity: evaluation can run at any time, with no sustained human effort required.
- Depth: an AI can probe a specific topic in depth, beyond the scope of a predefined test set.
Challenges
- Bias propagation: if one model is biased, the bias can skew the evaluation process.
- Hard to verify: human review is still needed to ensure the quality and relevance of the evaluations.
- Interpretability: it can be difficult to understand why an AI made a particular judgment.
- "Echo chamber" effect: the models may reinforce each other's errors or biases.
Implementation approaches
- Q&A evaluation: model A asks a question, model B answers, and model A scores the answer; the roles are then swapped and the process repeated.
- Debate-style evaluation: two models debate a topic while a third model acts as judge, scoring the quality and persuasiveness of the arguments (see the sketch after this list).
- Task-completion evaluation: one model sets a task, the other tries to complete it, and the task setter grades the result.
- Error detection: one model generates text that deliberately contains errors; the other model tries to find and correct them.
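The Q&A pattern is implemented in full in the code example at the end of this article. For the debate pattern, the sketch below shows the overall shape. It assumes only three plain callables that map a prompt string to a reply string (for example, thin wrappers around the LLM classes defined later); debate_evaluation, debater_a, debater_b and judge_fn are illustrative names, not part of any library.

def debate_evaluation(debater_a, debater_b, judge_fn, topic: str, turns: int = 2) -> str:
    """Two models argue opposite sides of a topic; a third model judges the transcript."""
    transcript = []
    for _ in range(turns):
        # Each debater sees the debate so far and adds one argument for its side
        a_says = debater_a(f"Topic: {topic}\nArgue FOR the proposition.\nDebate so far:\n" + "\n".join(transcript))
        transcript.append(f"A: {a_says}")
        b_says = debater_b(f"Topic: {topic}\nArgue AGAINST the proposition.\nDebate so far:\n" + "\n".join(transcript))
        transcript.append(f"B: {b_says}")
    # The judge only sees the transcript and returns a verdict
    return judge_fn("Judge the following debate. State which side argued more convincingly and why.\n\n" + "\n".join(transcript))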
Real-world examples
- OpenAI's InstructGPT used a related idea, having GPT-3 generate data for human-preference training.
- DeepMind's "AI-vs-AI" debate setup has been used to test the reasoning abilities of language models.
Caveats
- Human oversight is still needed to verify the quality and relevance of the evaluations.
- It should be combined with other evaluation methods (human evaluation, benchmark tests, etc.) for a complete picture.
- Evaluation criteria and procedures must be designed carefully to ensure fairness and validity.
Code example
The example below uses Tencent's Hunyuan model and the Yi-34B-Chat model accessed through Baidu's Qianfan (wenxinworkshop) API.
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from datetime import datetime
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms.base import LLM
# Import the custom wrappers for the Hunyuan and Yi models (defined further below; replace xxx with their actual module path)
from xxx import HunyuanAI
from xxx import BaiduYiLLM
def evaluate_response(evaluator: LLM, question: str, response: str) -> str:
evaluation_prompt = PromptTemplate(
input_variables=["question", "response"],
template="请评估以下问题的回答质量。给出1-10的评分,并简要解释原因。\n\n问题:{question}\n\n回答:{response}\n\n评估:"
)
evaluation_chain = LLMChain(llm=evaluator, prompt=evaluation_prompt)
evaluation = evaluation_chain.invoke({"question": question, "response": response})
return evaluation["text"]
def generate_question(questioner: LLM) -> str:
question_prompt = PromptTemplate(
input_variables=[],
template="请生成一个有趣且具有挑战性的问题,这个问题应该能测试AI模型的知识广度和深度。"
)
question_chain = LLMChain(llm=questioner, prompt=question_prompt)
question = question_chain.invoke({})
return question["text"]
def model_vs_model_evaluation(model1: HunyuanAI, model2: BaiduYiLLM, num_rounds: int = 5):
results = []
for i in range(num_rounds):
print(f"\n--- Round {i + 1} ---")
# Model 1 asks, Model 2 answers
question1 = generate_question(model1)
print(f"{model1._llm_type}'s question: {question1}")
answer2 = model2(question1)
print(f"{model2._llm_type}'s answer: {answer2}")
evaluation1 = evaluate_response(model1, question1, answer2)
print(f"{model1._llm_type}'s evaluation of {model2._llm_type}: {evaluation1}")
results.append({
"round": i + 1,
"questioner": model1._llm_type,
"answerer": model2._llm_type,
"question": question1,
"answer": answer2,
"evaluation": evaluation1
})
# Model 2 asks, Model 1 answers
question2 = generate_question(model2)
print(f"{model2._llm_type}'s question: {question2}")
answer1 = model1(question2)
print(f"{model1._llm_type}'s answer: {answer1}")
evaluation2 = evaluate_response(model2, question2, answer1)
print(f"{model2._llm_type}'s evaluation of {model1._llm_type}: {evaluation2}")
results.append({
"round": i + 1,
"questioner": model2._llm_type,
"answerer": model1._llm_type,
"question": question2,
"answer": answer1,
"evaluation": evaluation2
})
return results
def save_results_to_excel(results, filename):
wb = Workbook()
ws = wb.active
ws.title = "Model Evaluation Results"
    # Define cell styles
header_font = Font(name='Arial', size=12, bold=True, color='FFFFFF')
header_fill = PatternFill(start_color='4472C4', end_color='4472C4', fill_type='solid')
centered_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'),
bottom=Side(style='thin'))
    # Write the header row
headers = ["Round", "Questioner", "Answerer", "Question", "Answer", "Evaluation"]
ws.append(headers)
    # Style the header row
for cell in ws[1]:
cell.font = header_font
cell.fill = header_fill
cell.alignment = centered_alignment
cell.border = border
    # Write the data rows and apply styles
for result in results:
row = [
result["round"],
result["questioner"],
result["answerer"],
result["question"],
result["answer"],
result["evaluation"]
]
ws.append(row)
for cell in ws[ws.max_row]:
cell.alignment = Alignment(vertical='center', wrap_text=True)
cell.border = border
    # Auto-size column widths based on the longest value in each column
    for column in ws.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            if cell.value is not None:
                max_length = max(max_length, len(str(cell.value)))
        adjusted_width = min(max_length + 2, 50)  # Cap the width at 50
        ws.column_dimensions[column_letter].width = adjusted_width
    # Set a fixed height for data rows
for row in ws.iter_rows(min_row=2):
ws.row_dimensions[row[0].row].height = 60
    # Freeze the header row
ws.freeze_panes = 'A2'
    # Enable auto-filter on the full data range
ws.auto_filter.ref = ws.dimensions
wb.save(filename)
# Main program: run the evaluation and export the results
if __name__ == "__main__":
    # Create the two LLM instances
model1 = HunyuanAI()
model2 = BaiduYiLLM()
    # Run the model-vs-model evaluation
results = model_vs_model_evaluation(model1, model2, num_rounds=5)
    # Build a filename from the current date and time
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
excel_filename = f"model_evaluation_results_{current_time}.xlsx"
    # Save the results to an Excel file
save_results_to_excel(results, excel_filename)
print(f"\nResults have been saved to {excel_filename}")
    # Print a summary of the results
print("\n--- Evaluation Summary ---")
for result in results:
print(f"Round {result['round']}: {result['questioner']} asked, {result['answerer']} answered")
print(f"Question: {result['question']}")
print(f"Evaluation: {result['evaluation']}")
print()
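The evaluation prompt asks the judge for a score from 1 to 10, but that score comes back embedded in free text. A small post-processing step makes the saved results easier to aggregate. The sketch below is not part of the program above: extract_score and summarize_scores are hypothetical helpers, and the regex simply takes the first number in the 1-10 range from the judge's reply, which assumes the judge states the score plainly.

import re
from statistics import mean
from typing import Dict, List, Optional

def extract_score(evaluation_text: str) -> Optional[float]:
    # Naive parse: take the first integer in the 1-10 range that appears in the reply
    match = re.search(r"(?<!\d)(10|[1-9])(?!\d)", evaluation_text)
    return float(match.group(1)) if match else None

def summarize_scores(results: List[dict]) -> Dict[str, float]:
    # Average the parsable scores per answering model; replies with no score are skipped
    per_model: Dict[str, List[float]] = {}
    for r in results:
        score = extract_score(r["evaluation"])
        if score is not None:
            per_model.setdefault(r["answerer"], []).append(score)
    return {model: mean(scores) for model, scores in per_model.items()}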
HunyuanAI (custom LangChain LLM wrapper for Tencent Hunyuan):
import json
from langchain.llms.base import LLM
from typing import List, Optional, Dict
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import hunyuan_client, models
SecretId = "REPLACE_WITH_YOUR_SECRET_ID"  # Fill in a valid Tencent Cloud SecretId
SecretKey = "REPLACE_WITH_YOUR_SECRET_KEY"  # Fill in a valid Tencent Cloud SecretKey
class HunyuanAI(LLM):
@property
def _llm_type(self) -> str:
return "hunyuan"
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # The prompt could be split into a multi-turn message list with _parse_prompt;
        # here it is sent as a single user message instead.
        # messages = self._parse_prompt(prompt)
try:
messages = [{"Role": "user", "Content": prompt}]
cred = credential.Credential(SecretId, SecretKey)
httpProfile = HttpProfile()
httpProfile.endpoint = "hunyuan.tencentcloudapi.com"
clientProfile = ClientProfile()
clientProfile.httpProfile = httpProfile
client = hunyuan_client.HunyuanClient(cred, "", clientProfile)
req = models.ChatCompletionsRequest()
params = {
"TopP": 1,
"Temperature": 1,
"Model": "hunyuan-pro",
"Messages": messages
}
req.from_json_string(json.dumps(params))
resp = client.ChatCompletions(req)
return resp.Choices[0].Message.Content
except TencentCloudSDKException as err:
raise ValueError(f"Error calling Hunyuan AI: {err}")
def _parse_prompt(self, prompt: str) -> List[Dict[str, str]]:
"""将 LangChain 格式的 prompt 解析为 Hunyuan API 所需的消息格式"""
messages = []
for message in prompt.split('Human: '):
if message.startswith('System: '):
messages.append({"Role": "system", "Content": message[8:]})
elif message:
messages.append({"Role": "user", "Content": message})
return messages
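With real credentials in place, the Hunyuan wrapper can be smoke-tested on its own before being wired into the evaluation loop. A minimal sketch (the prompt text is arbitrary):

if __name__ == "__main__":
    # Assumes the SecretId/SecretKey placeholders above have been replaced with valid values
    llm = HunyuanAI()
    print(llm("Introduce yourself in one sentence."))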
BaiduYiLLM (custom LangChain LLM wrapper for Yi-34B-Chat served via Baidu Qianfan):
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from typing import Any, List, Mapping, Optional
import requests
import json
API_KEY = "REPLACE_WITH_YOUR_API_KEY"  # Fill in a valid Baidu Qianfan API key
SECRET_KEY = "REPLACE_WITH_YOUR_SECRET_KEY"  # Fill in a valid Baidu Qianfan secret key
class BaiduYiLLM(LLM):
@property
def _llm_type(self) -> str:
return "baidu-yi"
def get_access_token(self):
"""
使用 AK,SK 生成鉴权签名(Access Token)
:return: access_token,或是None(如果错误)
"""
url = "https://aip.baidubce.com/oauth/2.0/token"
params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
return str(requests.post(url, params=params).json().get("access_token"))
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> str:
url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/yi_34b_chat?access_token=" + self.get_access_token()
        # The leading "hi"/"Hello" turns are a seed exchange carried over from Baidu's API example
        payload = json.dumps({
"messages": [
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today? If you have any questions or need information on a specific topic, feel free to ask."
},
{
"role": "user",
"content": prompt
}
]
})
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
try:
response_text = response.text
json_response = json.loads(response_text)
result = json_response["result"]
except json.JSONDecodeError:
print("无法解析JSON响应:", response_text)
return ""
except KeyError:
print("JSON响应中没有'result'键:", response_text)
return ""
except Exception as e:
print("发生未知错误:", e)
return ""
else:
return result
@property
def _identifying_params(self) -> Mapping[str, Any]:
"""Get the identifying parameters."""
return {"model": "baidu-yi"}