alpaca2qa.py解读,数据加模板

柴多多

已于 2023-04-20 22:00:44 修改

阅读量160

点赞数

文章标签： java javascript servlet

于 2023-04-20 21:37:47 首次发布

原文链接：https://github.com/ssbuild

版权

# -*- coding: utf-8 -*-
# @Time    : 2023/3/20 8:50

# 数据参考 https://github.com/carbonz0/alpaca-chinese-dataset
### 样例

# ``` json
# [
#   {
#       "instruction": "根据给定的坐标确定最近的机场。",
#       "input": "40.728157, -73.794853",
#       "output": "距给定坐标最近的机场是纽约市的拉瓜迪亚机场 (LGA)。"
#   },
#   {
#       "instruction": "输出不同种类水果的列表",
#       "input": "",
#       "output": "1. 苹果\n2. 香蕉\n3. 橘子\n4. 芒果\n5. 草莓\n6. 葡萄\n7. 蓝莓\n8. 樱桃\n9. 猕猴桃\n10. 甜瓜\n11.菠萝\n12. 李子\n13.桃子"
#   },
#   {
#       "instruction": "找出字串中隐藏的信息",
#       "input": "业余咖啡",
#       "output": "隐藏的消息是“咖啡因爱好者”。"
#   }
# ]
# ```
import json


# Prompt template asking for an English -> Chinese translation.
# Its single placeholder is {english}.
PROMPT_DICT1 = dict(
    prompt_no_input="".join([
        "给定输入的英文文本，编写适当的输出将英文翻译为中文\n\n",
        "### 英文文本:\n{english}\n\n",
        "### 响应:",
    ]),
)

# Instruction-following prompt templates.
#   "prompt_input"    -> placeholders {instruction} and {input}
#   "prompt_no_input" -> placeholder {instruction} only
PROMPT_DICT = dict(
    prompt_input="".join([
        "下面是一个指令,描述了一个任务,搭配一个输入,提供进一步的上下文。",
        "编写适当的输出完成请求。\n\n",
        "### 指令:\n{instruction}\n\n",
        "### 请求:\n{input}\n\n",
        "### 响应:",
    ]),
    prompt_no_input="".join([
        "下面是一个指令,描述了一个任务。",
        "编写适当的输出完成请求。\n\n",
        "### 指令:\n{instruction}\n\n",
        "### 响应:",
    ]),
)

def alaca2qa(src, dst):
    """Convert an Alpaca-style JSON file into a JSON-lines question/answer file.

    The file at ``src`` is decoded as UTF-8 JSON and iterated as a sequence of
    dict samples. Each sample is rendered through ``PROMPT_DICT``: the
    ``prompt_input`` template when the sample carries a usable ``input``
    value, otherwise ``prompt_no_input``. One line per sample is then written
    to ``dst``::

        {"id": <1-based index>, "paragraph": [{"q": <prompt>, "a": [<output>]}]}

    Args:
        src: Path of the JSON file to read.
        dst: Path of the output file; it is opened in write mode, so existing
            content is replaced.

    Raises:
        KeyError: If a sample has no ``output`` key, or lacks a key that the
            selected template references.
    """
    prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
    with open(src, mode='r', encoding='utf-8') as f:
        list_data_dict = json.load(f)

    # Build every record before opening dst, so a malformed sample raises
    # without truncating or partially writing the output file.
    records = []
    for index, example in enumerate(list_data_dict, start=1):
        # JSON null is decoded as None; treat it like a missing or empty
        # "input" instead of rendering the literal text "None" into the prompt.
        if example.get("input") in (None, ""):
            template = prompt_no_input
        else:
            template = prompt_input
        # format_map fills the template placeholders from the sample's keys.
        question = template.format_map(example)
        answer = str(example['output'])
        records.append({'id': index, 'paragraph': [{'q': question, 'a': [answer]}]})

    with open(dst, mode='w', encoding='utf-8', newline='\n') as f:
        for record in records:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')


# def alaca2qa(src,dst):
#     prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
#     with open(src,mode='r',encoding='utf-8') as f:
#         list_data_dict = json.loads(f.read())
#     sources = [
#         prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
#         for example in list_data_dict
#     ]
#     targets = [f"{example['output']}" for example in list_data_dict]
#
#     with open(dst, mode='w', encoding='utf-8',newline='\n') as f:
#
#         for i,(s, t) in enumerate(zip(sources, targets)):
#             paragraph = [
#                 {
#                     'q': s,
#                     'a': [t]
#                 }
#             ]
#             f.write(json.dumps({'id': i+1 ,'paragraph' : paragraph },ensure_ascii=False) +'\n')



if __name__ == '__main__':
    # Script entry point: convert the JSON file under ./test into the QA file.
    src, dst = r'./test/merge.json', r'./test/Smerge.json'
    alaca2qa(src, dst)