基于大模型 Prompt + LangChain 架构的 AI 翻译系统全流程解析

本文链接：https://blog.csdn.net/python122_/article/details/141319205

在全球化日益加深的今天，文档翻译需求愈发强烈。传统的人工翻译不仅成本高，而且效率低。借助于人工智能技术，特别是大型语言模型（如GPT），我们可以实现高效、准确的文档翻译。本文将详细介绍一种基于大型语言模型（LLMs）和LangChain的文档翻译技术方案与架构设计，帮助读者了解其工作原理和实现方式。

一、总体架构设计

文档翻译系统主要由以下几个模块组成：文档解析模块（PDFParser）、翻译模板生成模块（PromptTemplate）、翻译执行模块（GPT）、结果输出模块（Writer）和日志记录模块（Logger）。各模块之间通过明确的接口和数据流进行交互，保证系统的稳定性和扩展性。

1.1 待翻译文件的加载

首先，用户通过 ArgumentParser 模块指定待翻译的PDF文件。ArgumentParser 负责解析用户输入的命令行参数，例如配置文件路径、模型名称、输入文件路径、输出文件格式以及源语言和目标语言等。此模块的设计确保了系统的灵活性和易用性。

import argparse

class ArgumentParser:
    """Define and parse the translator's command-line options.

    Attributes:
        parser: The underlying ``argparse.ArgumentParser`` instance that
            holds the option definitions.

    Methods:
        __init__(): Build the parser and register every option.
        parse_arguments(): Parse the command line and return the result.
    """

    def __init__(self):
        """Create the underlying parser and register each supported option.

        Every option is a string; only ``--config_file`` has a default.
        """
        self.parser = argparse.ArgumentParser(
            description='A translation tool that supports translations in any language pair.'
        )

        # (flag, extra keyword arguments) pairs, kept in registration order
        # so the generated help output lists them in the same sequence.
        option_specs = (
            ('--config_file', {
                'default': 'langchain/openai-translator/config.yaml',
                'help': 'Configuration file with model and API settings.',
            }),
            ('--model_name', {
                'help': 'Name of the Large Language Model.',
            }),
            ('--input_file', {
                'help': 'PDF file to translate.',
            }),
            ('--output_file_format', {
                'help': 'The file format of translated book. Now supporting PDF and Markdown',
            }),
            ('--source_language', {
                'help': 'The language of the original book to be translated.',
            }),
            ('--target_language', {
                'help': 'The target language for translating the original book.',
            }),
        )
        for flag, extra in option_specs:
            self.parser.add_argument(flag, type=str, **extra)

    def parse_arguments(self):
        """Parse the command-line arguments.

        Returns:
            The namespace object holding the value of every defined option.
        """
        return self.parser.parse_args()

（此处原有配图，图片已缺失。）下面的代码定义了文档内容的数据结构：内容类型枚举 ContentType、通用内容类 Content，以及继承自 Content 的表格内容类 TableContent。

import pandas as pd

from enum import Enum, auto
from PIL import Image as PILImage
from io import StringIO
from utils import LOG

# Enumeration of the supported content types.
class ContentType(Enum):
    """Content categories: text, table and image."""

    TEXT = 1   # text content
    TABLE = 2  # table content
    IMAGE = 3  # image content
    
# Content class: stores text, table or image content together with its translation.
class Content:
    """Hold one piece of original content and, once available, its translation.

    Instance attributes (all assigned in ``__init__``): ``content_type``,
    ``original``, ``translation`` and ``status``.
    """

    def __init__(self, content_type, original, translation=None):
        """Initialize the content object.

        :param content_type: Kind of content (a ContentType member).
        :param original: The original content.
        :param translation: The translated content (defaults to None).
        """
        self.content_type = content_type
        self.original = original
        self.translation = translation
        self.status = False  # translation status flag, updated by set_translation

    def set_translation(self, translation, status):
        """Store the translated content and update the status flag.

        :param translation: The translated content.
        :param status: Translation status (True or False).
        :raises ValueError: If the translation's type does not match the
            type expected for this object's content type.
        """
        if not self.check_translation_type(translation):
            raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}")
        self.translation = translation
        self.status = status

    def check_translation_type(self, translation):
        """Check whether a translation has the type required by the content type.

        Text expects ``str``, tables expect ``list`` and images expect a
        PIL image.

        :param translation: The translation to check.
        :return: True when the type matches, otherwise False.
        """
        if self.content_type == ContentType.TEXT:
            return isinstance(translation, str)
        if self.content_type == ContentType.TABLE:
            return isinstance(translation, list)
        if self.content_type == ContentType.IMAGE:
            return isinstance(translation, PILImage.Image)
        # Unknown content type: always report a mismatch explicitly rather
        # than falling off the end of the method and returning None.
        return False

    def __str__(self):
        """Return the string representation of the original content."""
        # __str__ must return a str; the original may be a non-string object
        # (for example a table stored as a DataFrame), so convert explicitly.
        return str(self.original)
        
# 表格内容类，继承自Content类，提供特定于表格内容的操作
class TableContent(Content):
    def __init__(self, data, translation=None):
        """Initialize the table content object from extracted table data.

        :param data: Table data as a two-dimensional list (a list of rows).
        :param translation: Translated table data (defaults to None); passed
            through to the base class unchanged.
        :raises ValueError: If the number of rows or columns in ``data``
            differs from that of the DataFrame built from it.
        """
        df = pd.DataFrame(data)

        # The expected column count comes from the first row; an empty table
        # has no first row, so treat it as having zero columns instead of
        # failing with IndexError.
        expected_columns = len(data[0]) if len(data) > 0 else 0

        # Verify that the DataFrame has the same shape as the input data.
        if len(data) != len(df) or expected_columns != len(df.columns):
            raise ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.")

        # Forward the caller-supplied translation rather than silently
        # discarding it.
        super().__init__(ContentType.TABLE, df, translation)
        
    def set_translation(self, translation, status):    
        """        
        设置翻译后的表格内容并更新状态。   
                     
        :param translation: 翻译后的表格内容，字符串形式。        
        :param status: 翻译状态（True或False）。        
        :raises ValueError: 当翻译格式不正确或类型不匹配时抛出。        
        """        
        try:       
            if not isinstance(translation, str):            
                raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}")
                
            LOG.debug(f"[translation]\n{translation}")            
            # 从字符串解析表格头和数据            
            header = translation.split(']')[0][1:].split(', ')            
            data_rows = translation.split('] ')[1:]            
            data_rows = [row[1:-1].split(', ') for row in data_rows]            
            translated_df = pd.DataFrame(data_rows, columns=header)            
            LOG.debug(f"[translated_df]\n{translated_df}")