Datawhale AI 夏令营-大模型技术（微调）-代码精读

最新推荐文章于 2024-08-18 21:31:01 发布

疯人院院长CLEVER

最新推荐文章于 2024-08-18 21:31:01 发布

阅读量293

点赞数 5

文章标签：人工智能语言模型

本文链接：https://blog.csdn.net/m0_52620723/article/details/141197683

版权

1. 环境准备

!pip install pandas openpyxl

2. 语文数据处理

2.1 数据加载

# 导入pandas库，用于数据处理和分析
import pandas as pd

# 导入re库，支持正则表达式操作
import re

# Load the Chinese training set and normalise full-width punctuation in every
# cell: '．' becomes '.' and '（' becomes '('. The patterns are applied as
# regular expressions, so they are also replaced inside longer strings.
df = pd.read_excel('训练集-语文.xlsx').replace('．', '.', regex=True).replace('（', '(', regex=True)

# Peek at the "选项" cell of the row labelled 2 (the third row when the index
# is the default 0-based one) to inspect the raw format.
second_row_option_content = df.loc[2, '选项']
print(second_row_option_content)

2.2 抽取问题

抽取题目及选项，并且过滤简答题

def chinese_multiple_choice_questions(questions_with_answers):
    """Extract the multiple-choice questions from a raw question/option text.

    The text is split into numbered questions ("1.", "2.", ...). A question
    counts as multiple-choice when it contains any of the letters A-D; every
    other question is treated as a short-answer question and dropped.

    Args:
        questions_with_answers: Raw text holding the numbered questions
            together with their lettered options.

    Returns:
        A list with one dict per multiple-choice question:
        ``'question'`` is the stem (cut at the first ``(`` or line break)
        renumbered consecutively from 1, and ``'choices'`` is a list of
        ``(letter, text)`` tuples.
    """
    # A question runs from "<digits>." up to the next "<digits>." or the end
    # of the text; DOTALL lets it span several lines.
    question_pattern = re.compile(r'\d+\..*?(?=\d+\.|$)', re.DOTALL)
    # An option is a letter A-D followed by the text up to the next A-D
    # letter, a line break or the end of the text.
    choice_pattern = re.compile(r'([A-D])\s*(.*?)(?=[A-D]|$|\n)', re.DOTALL)

    multiple_choice_questions = []
    for question in question_pattern.findall(questions_with_answers):
        if not re.search(r'[A-D]', question):
            # Short-answer question: filtered out.
            continue

        choices = choice_pattern.findall(question)

        # Keep only the stem: drop the "(3分)"-style suffix and anything
        # after the first line break.
        stem_line = question.split('(')[0].split('\n')[0]
        # Every question starts with "<digits>.", so the text after the first
        # dot is the stem without its original number.
        stem = stem_line.partition('.')[2]

        # Number by position among the kept questions so the numbering stays
        # consecutive even when a short-answer question was skipped earlier.
        number = len(multiple_choice_questions) + 1
        multiple_choice_questions.append({
            'question': f'{number}.{stem}',
            'choices': choices,
        })

    return multiple_choice_questions

2.3 抽取问题的答案

def chinese_multiple_choice_answers(questions_with_answers):
    """Extract the multiple-choice answers from a raw answer text.

    Spaces and line breaks are removed first because the answer field is
    formatted inconsistently. Every "<number>.<UPPERCASE LETTERS>" pair is
    then taken as one answer; anything else (for example short-answer text)
    is ignored.

    Args:
        questions_with_answers: Raw answer text.

    Returns:
        A list of strings of the form ``"<n>. <letters>"``, ordered by the
        original question number and renumbered consecutively from 1.
    """
    compact = questions_with_answers.replace(" ", "").replace("\n", "")

    # Question number, a dot, then one or more capital letters.
    choice_pattern = re.compile(r'(\d+)\.([A-Z]+)')

    # Keyed by question number; if a number occurs twice the last one wins.
    answers_by_number = {
        int(number): letters
        for number, letters in choice_pattern.findall(compact)
    }

    ordered_letters = [letters for _, letters in sorted(answers_by_number.items())]
    return [
        f"{position}. {letters}"
        for position, letters in enumerate(ordered_letters, start=1)
    ]

2.4 prompt设计

使用要求+阅读材料组成prompt作为input部分

def get_prompt_cn(text):
    """Build the question-writing prompt for a Chinese reading passage.

    Args:
        text: The reading passage; it is inserted under the last heading of
            the template.

    Returns:
        The prompt string: fixed instructions and requirements followed by
        the passage.
    """
    return f'''
    你是⼀个⾼考选择题出题专家，你出的题有⼀定深度，你将根据阅读文本，出4道单项选择题，包含题目选项，以及对应的答案，注意：不⽤给出原文，每道题由1个问题和4个选项组成，仅存在1个正确答案，请严格按照要求执行。 阅读文本主要是中文，你出的题目需要满足以下要点，紧扣文章内容且题干和答案为中文：
    
    ### 回答要求
    (1)理解文中重要概念的含义
    (2)理解文中重要句子的含意
    (3)分析论点、论据和论证方法
    
    ### 阅读文本
    {text}
    '''

2.5 中文数据处理主函数

将input与output组合，按照列表序号一一对应

def process_cn(df): 
    """Turn the Chinese training DataFrame into prompt/target pairs.

    For every row the "选项" cell is parsed into multiple-choice questions,
    the "答案" cell into answers and the "阅读文本" cell into a prompt. Rows
    where the number of parsed questions differs from the number of parsed
    answers are skipped.

    Args:
        df: DataFrame with the columns "选项", "答案" and "阅读文本".

    Returns:
        A tuple ``(res_input, res_output)`` of two lists of equal length:
        the prompts and the matching question/option/answer texts.
    """
    res_input = []
    res_output = []

    # Walk the three columns positionally instead of `df.loc[i]` with
    # `range(len(df))`, which only works when the index is exactly 0..n-1.
    rows = zip(df['选项'], df['答案'], df['阅读文本'])
    for raw_options, raw_answers, reading_text in rows:
        data_options = chinese_multiple_choice_questions(raw_options)
        data_answers = chinese_multiple_choice_answers(raw_answers)
        data_prompt = get_prompt_cn(reading_text)

        # A count mismatch means the parsing went wrong for this row.
        if len(data_answers) == len(data_options):
            res = ''

            for question, answer in zip(data_options, data_answers):
                # Question stem; the literal's layout (leading line break,
                # trailing spaces) is kept so the output text is unchanged.
                res += f'''
{question['question']}?
                ''' + '\n'

                # One line per option: letter immediately followed by text.
                for choice in question['choices']:
                    res = res + choice[0] + choice[1] + '\n'

                # `answer` looks like "1. B"; keep only the part after the dot.
                res = res + '答案:' + str(answer.split('.')[-1])  + '\n'

            res_output.append(res)
            res_input.append(data_prompt)

    return res_input, res_output

3. 英文数据处理

3.1 数据加载

与中文数据的加载方式相同

# coding~

# 导入 pandas 库，用于数据处理和分析
import pandas as pd

# Load the English training set from the Excel file into `df`.
df = pd.read_excel('训练集-英语.xlsx')

# Normalise option markers that were damaged by OCR:
#   - full-width '．' becomes '.'
#   - Cyrillic 'А.', 'В.', 'С.' become Latin 'A.', 'B.', 'C.'
# The patterns are regular expressions, so the dot must be escaped. Unescaped
# it matches any character: 'D.' -> 'D.' would then overwrite the character
# after every capital D (e.g. "Do" -> "D."), and the Cyrillic patterns would
# clobber whatever follows the letter. With a literal dot the 'D.' -> 'D.'
# replacement is a no-op, so it is dropped.
df = (
    df.replace('．', '.', regex=True)
    .replace('\u0410\\.', 'A.', regex=True)  # Cyrillic А (U+0410)
    .replace('\u0412\\.', 'B.', regex=True)  # Cyrillic В (U+0412)
    .replace('\u0421\\.', 'C.', regex=True)  # Cyrillic С (U+0421)
)

# Peek at the "选项" cell of the row labelled 0 to inspect the raw format.
second_row_option_content = df.loc[0, '选项']
print(second_row_option_content)

3.2 抽取问题

英文问题数据相对标准，但是ABCD的顺序有的可能是ACBD，需要进行数据处理

import re  # 导入正则表达式库，用于模式匹配和文本处理

# Sample text: reuse the "选项" cell stored in `second_row_option_content`
text = second_row_option_content

def get_questions(text):
    """Parse English questions and their four options out of a raw text.

    Line breaks are turned into two spaces (and two spaces are appended) so
    that every option ends with a double-space delimiter. Each question must
    have an "A." option first and a "D." option last; the two options in
    between may appear in either order and are put back in A-D order.

    Args:
        text: Raw text holding numbered questions with lettered options.

    Returns:
        A list of dicts with the keys ``'question'`` (the text after the
        question number and dot) and ``'options'`` (a dict mapping 'A'..'D'
        to the stripped option text, or '' when a letter is missing).
    """
    flattened = text.replace('\n', '  ') + '  '

    # Stem, then "A.", two options from B-D, and finally "D."; each option
    # stops at the first run of two whitespace characters.
    item_re = re.compile(r'(\d+\..*?)(A\..*?\s{2})([B-D]\..*?\s{2})([B-D]\..*?\s{2})(D\..*?\s{2})', re.DOTALL)
    stem_re = re.compile(r'(\d+)\.(.*)')

    parsed = []
    for found in item_re.finditer(flattened):
        stem, *raw_options = found.groups()

        # Index the options by their leading letter; a repeated letter keeps
        # the later option.
        by_letter = {raw[0]: raw for raw in raw_options}

        parsed.append({
            'question': stem_re.findall(stem.strip())[0][1],
            'options': {
                letter: by_letter.get(letter, '').strip()
                for letter in 'ABCD'
            },
        })

    return parsed

# Parse the sample `text` with `get_questions` and keep the result in `questions`
questions = get_questions(text)

# Print every parsed question dict
for q in questions:
    print(q)

3.3 抽取问题的答案

# Cleaning helper: strips spaces, line breaks and dots from a string
def remove_whitespace_and_newlines(input_string):
    """Return `input_string` with every space, line break and dot removed."""
    cleaned = input_string
    for unwanted in (" ", "\n", "."):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

import re  

# Sample answer text
text = """
32. B. The underlying logic of the effect.                                                   33.D. estimates were not fully independent.
34.C. The discussion process.            35.D. Approving.
"""

def get_answers(text):
    """Extract the answer letters from a raw English answer text.

    The text is first compacted with `remove_whitespace_and_newlines`
    (spaces, line breaks and dots removed). Every digit that is followed,
    after optional whitespace, by a letter A-D then yields one answer.

    Args:
        text: Raw answer text such as "32. B. ... 33.D. ...".

    Returns:
        The list of answer letters in order of appearance.
    """
    compact = remove_whitespace_and_newlines(text)

    # One digit (the last digit of the question number), optional whitespace,
    # then the answer letter.
    answer_re = re.compile(r'(\d)\s*([A-D])')

    return [letter for _digit, letter in answer_re.findall(compact)]

3.4 prompts设计

def get_prompt_en(text):
    """Build the question-writing prompt for an English reading passage.

    Args:
        text: The reading passage; it is inserted under the last heading of
            the template.

    Returns:
        The prompt string: fixed instructions and requirements followed by
        the passage.
    """
    return f'''
    你是⼀个⾼考选择题出题专家，你出的题有⼀定深度，你将根据阅读文本，出4道单项选择题，包含题目选项，以及对应的答案，注意：不⽤给出原文，每道题由1个问题和4个选项组成，仅存在1个正确答案，请严格按照要求执行。
The reading text is mainly in English. The questions and answers you raised need to be completed in English for at least the following points:
    
    ### 回答要求
    (1)Understanding the main idea of the main idea.
    (2)Understand the specific information in the text.
    (3)infering the meaning of words and phrases from the context
    
    ### 阅读文本
    {text}
    '''

3.5 英文数据处理主函数

def process_en(df): 
    """Turn the English training DataFrame into prompt/target pairs.

    For every row the "选项" cell is parsed into questions with options, the
    "答案" cell into answer letters and the "阅读文本" cell into a prompt.
    Rows where the number of parsed questions differs from the number of
    parsed answers are skipped.

    Args:
        df: DataFrame with the columns "选项", "答案" and "阅读文本".

    Returns:
        A tuple ``(res_input, res_output)`` of two lists of equal length:
        the prompts and the matching question/option/answer texts.
    """
    res_input = []
    res_output = []

    # Walk the three columns positionally instead of `df.loc[i]` with
    # `range(len(df))`, which only works when the index is exactly 0..n-1.
    rows = zip(df['选项'], df['答案'], df['阅读文本'])
    for raw_options, raw_answers, reading_text in rows:
        data_options = get_questions(raw_options)
        data_answers = get_answers(raw_answers)
        data_prompt = get_prompt_en(reading_text)

        # A count mismatch means the parsing went wrong for this row.
        if len(data_answers) == len(data_options):
            res = ''

            # Questions are numbered from 1 within the row. The literal's
            # indentation is part of the emitted text and is kept as is.
            for number, (question, answer) in enumerate(zip(data_options, data_answers), start=1):
                res += f'''
                {number}.{question['question']}
                {question['options']['A']}
                {question['options']['B']}
                {question['options']['C']}
                {question['options']['D']}
                answer:{answer}
                ''' + '\n'

            res_output.append(res)
            res_input.append(data_prompt)

    return res_input, res_output

4. 数据合并

# Combine the Chinese and English samples into one DataFrame with the columns
# 'input' and 'output'. The first 30 Chinese and the first 20 English samples
# are appended a second time.
# NOTE(review): cn_input/cn_output/en_input/en_output are not defined in this
# excerpt — presumably the results of process_cn / process_en; confirm.

df_new = pd.DataFrame({'input': cn_input+cn_input[:30]+en_input+en_input[:20], 'output': cn_output+cn_output[:30]+en_output+en_output[:20]})

疯人院院长CLEVER

关注

5
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
Datawhale AI 夏令营-大模型技术（微调）-代码精读

英文问题数据相对标准，但是ABCD的顺序有的可能是ACBD，需要进行数据处理。使用要求+阅读材料组成prompt作为input部分。将input与output组合，按照列表序号一一对应。抽取题目及答案，并且过滤简答题。
复制链接

扫一扫