Datawhale AI 夏令营大模型技术微调——task2

qq_26249811

已于 2024-08-14 22:33:03 修改

阅读量245

点赞数 2

分类专栏：大模型入门　文章标签：python、语言模型、人工智能

于 2024-08-14 22:11:32 首次发布

本文链接：https://blog.csdn.net/qq_26249811/article/details/141200564

版权

大模型入门专栏收录该内容

6 篇文章 0 订阅

订阅专栏

尝试了更改回答的要求

def get_prompt_cn(text):
    """Build the Chinese question-writing prompt for one reading passage.

    The prompt tells the model to act as a Gaokao multiple-choice question
    writer producing four single-answer questions, lists three answer
    requirements, and ends with the passage itself.

    Args:
        text: The reading passage; it is interpolated with f-string
            formatting, so non-string values are rendered via ``format``.

    Returns:
        The prompt string. It starts with a newline, and every template line
        (blank ones included) is prefixed with four spaces.
    """
    # The instruction sentence is kept character-for-character as authored.
    instruction = '你是⼀个⾼考选择题出题专家，你出的题有⼀定深度，你将根据阅读文本，出4道单项选择题，包含题目选项，以及对应的答案，注意：不⽤给出原文，每道题由1个问题和4个选项组成，仅存在1个正确答案，请严格按照要求执行。 阅读文本主要是中文，你出的题目需要满足以下要点，紧扣文章内容且题干和答案为中文：'
    template_lines = (
        instruction,
        '',
        '### 回答要求',
        '(1)请理解文中重要概念的含义，并给出详细的解释和阐述。',
        '(2)请理解文中重要句子的含意，并给出详细的解释和说明。',
        '(3)对给定的论点、论据和论证方法进行分析。',
        '',
        '',
        '### 阅读文本',
        f'{text}',
        '',
    )
    indent = '    '
    return '\n' + '\n'.join(indent + line for line in template_lines)

修改了原来的代码，使题目内容的提取方式更健壮：

def chinese_multiple_choice_questions(questions_with_answers):
    """Extract the multiple-choice questions from a raw question text.

    The text is split into numbered questions ("1.", "2.", ...). A question
    is kept when it contains an option letter A-D; questions without one
    (short-answer questions) are dropped.

    Args:
        questions_with_answers: Raw text holding the numbered questions.
            Anything that is not a string (for example an empty spreadsheet
            cell read as NaN) yields an empty list.

    Returns:
        A list with one dict per multiple-choice question:

        * ``question`` -- ``"<n>.<stem>"`` where ``n`` is the 1-based
          position of the question within the text and ``stem`` is the first
          line of the question with its original number and any trailing
          ``(...`` marker (such as a score note) removed.
        * ``choices`` -- list of ``(letter, rest)`` tuples; ``rest`` is
          whatever follows the letter up to the next A-D letter or line end.
    """
    if not isinstance(questions_with_answers, str):
        return []

    # One numbered question: from "<digits>." up to the next "<digits>." or
    # the end of the text.
    question_pattern = re.compile(r'\d+\..*?(?=\d+\.|$)', re.DOTALL)
    # One option: the letter, then its text up to the next letter / line end.
    choice_pattern = re.compile(r'([A-D])\s*(.*?)(?=[A-D]|$|\n)', re.DOTALL)
    option_letter = re.compile(r'[A-D]')
    number_prefix = re.compile(r'^\s*\d+\.\s*')
    # The stem stops at the first "(" or at the first line break, so option
    # lines never leak into the question text.
    stem_end = re.compile(r'\s*\(|\n')

    multiple_choice_questions = []
    numbered = enumerate(question_pattern.findall(questions_with_answers), start=1)
    for position, question in numbered:
        if not option_letter.search(question):
            continue  # short-answer question: nothing to extract

        # Drop the original number first; the question is re-numbered with
        # its 1-based position instead of stacking a second prefix on top.
        stem = number_prefix.sub('', question, count=1)
        stem = stem_end.split(stem, maxsplit=1)[0].strip()

        multiple_choice_questions.append({
            'question': f'{position}.{stem}',
            'choices': choice_pattern.findall(question),
        })
    return multiple_choice_questions

修改了一下代码，这样过滤效果会更好一些

# coding~

import re

import pandas as pd

# Load the English training set (read once; the cleaned frame is kept in df).
df = pd.read_excel('训练集-英语.xlsx')

# Normalise look-alike characters so option markers read "A." ... "D.".
# Keys are literal text. They are escaped before use because
# DataFrame.replace only substitutes inside a cell when regex=True, and an
# unescaped "." would match ANY character (so "D." would turn "Do" into "D.").
# NOTE(review): keys are written as escapes for the look-alike characters;
# confirm they match the characters that occur in the spreadsheet.
replace_dict = {
    '\uff0e': '.',      # fullwidth full stop
    '\u0410.': 'A.',    # Cyrillic capital A followed by a dot
    '\u0412.': 'B.',    # Cyrillic capital Ve followed by a dot
    '\u0421.': 'C.',    # Cyrillic capital Es followed by a dot
    # The former 'D.' -> 'D.' entry is dropped: as literal text it changes
    # nothing, and as an unescaped regex it corrupted words starting with D.
    # To normalise fullwidth parentheses as well, uncomment the next line.
    # '\uff08': '(',
}
# Applied one by one, in order, so the fullwidth dot is already normalised
# when the letter rules run.
for old_val, new_val in replace_dict.items():
    df = df.replace(re.escape(old_val), new_val, regex=True)

# Show the "选项" cell of the row with index label 1.
second_row_option_content = df.loc[1, '选项']
print(second_row_option_content)

# Then show the row with index label 0. The variable keeps its original
# name, so from here on it holds row 0 (this is the value later code reads).
second_row_option_content = df.loc[0, '选项']
print(second_row_option_content)

改进点：

修正了正则表达式中的重复匹配问题，并增加了负向前瞻来避免错误匹配。
改进了选项字典的构建逻辑，确保键值对正确提取。

import re

# Sample input for the parser below: the "选项" cell read from the
# spreadsheet earlier in this file (not a hard-coded string).
# In real use this value could come from any external input or file.
text = second_row_option_content

def get_questions(text):
    """Split a block of numbered A-D multiple-choice questions into dicts.

    A question is recognised as ``<digits>.`` followed by four options. The
    first option must start with ``A.`` and the last with ``D.``; the two in
    between may start with ``B.``, ``C.`` or ``D.`` and are stored by
    position under ``'B'`` and ``'C'``. An option ends at two consecutive
    whitespace characters.

    Args:
        text: Raw question text, typically one option per line. Anything
            that is not a string (for example a NaN spreadsheet cell) yields
            an empty list.

    Returns:
        A list with one dict per matched question:

        * ``question`` -- everything after the leading ``<digits>.`` (leading
          whitespace after the dot is kept).
        * ``options`` -- dict with keys ``'A'``..``'D'``; each value is the
          stripped option text including its ``X.`` prefix.
    """
    if not isinstance(text, str):
        return []

    # Line breaks become two spaces and two more are appended, so that "two
    # whitespace characters" terminates every option, the last one included.
    flat = text.replace('\n', '  ') + '  '

    # The former "(?!\d)" before "\." could never fail and the extra
    # "|C\..." alternative was already covered by "[B-D]", so both are gone;
    # the set of matches is unchanged.
    pattern = re.compile(
        r'(\d+\..*?)'           # question number and text
        r'(A\..*?\s{2})'        # first option
        r'([B-D]\..*?\s{2})'    # second option
        r'([B-D]\..*?\s{2})'    # third option
        r'(D\..*?\s{2})',       # fourth option
        re.DOTALL,
    )

    questions_dict_list = []
    for question, first, second, third, fourth in pattern.findall(flat):
        # The captured question always begins with "<digits>.", so the text
        # is whatever follows the first dot.
        question_text = question.strip().partition('.')[2]
        questions_dict_list.append({
            'question': question_text,
            'options': {
                'A': first.strip(),
                'B': second.strip(),
                'C': third.strip(),
                'D': fourth.strip(),
            },
        })
    return questions_dict_list

# Parse the sample text and print each extracted question dict on its own line.
questions = get_questions(text)
for q in questions:
    print(q)

总结：数据清洗需要下功夫；学习率和迭代次数也值得调整——本次我将迭代次数设为 30、学习率改为 0.0007，最终得分有所提高。增加训练数据的方案还没有尝试过。