#ai夏令营 #datawhale #夏令营
baseline解读
环境配置
!pip install scipy openai tiktoken retry dashscope loguru
初始化过程
from multiprocessing import Process, Manager
import json
import os
from pprint import pprint
import re
from tqdm import tqdm
import random
import uuid
import openai
import tiktoken
import json
import numpy as np
import requests
from retry import retry
from scipy import sparse
#from rank_bm25 import BM25Okapi
#import jieba
from http import HTTPStatus
import dashscope
from concurrent.futures import ThreadPoolExecutor, as_completed
from loguru import logger
import json
import time
from tqdm import tqdm
logger.remove() # remove loguru's default console sink so only the file sink below receives records
# Daily-rotated log file (rotation at midnight), kept 10 days, old logs compressed to zip.
logger.add("logs/app_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="10 days", compression="zip")
# Model name as exposed by the DashScope (灵积) API.
MODEL_NAME = 'qwen2-7b-instruct'
# NOTE(review): placeholder API key — must be replaced with a real DashScope key before running.
dashscope.api_key="sk-"
- 主要完成了 logger 的初始化,并基于灵积(DashScope)的 API 调用设置了初始的 LLM 模型
qwen2-7b-instruct
- notebook中是通过dashscope sdk来访问大模型api的
- 如果是本地部署的模型,这段需要进行修改,指定本地部署的模型路径
过程
- 将
round1_test_data.jsonl
中的每个问题拆分后逐一提交给大模型,从而得出答案
def extract(input_text):
    """Extract the single answer letter that follows the "答案是:" marker.

    Parameters
    ----------
    input_text : str
        Raw model output, expected to contain "答案是:X" somewhere.

    Returns
    -------
    str
        The first captured character after the marker, or 'A' as a
        fallback when the marker is absent, so downstream scoring always
        receives a valid option letter.
    """
    ans_pattern = re.compile(r"答案是:(.)", re.S)
    problems = ans_pattern.findall(input_text)
    # BUG FIX: findall returns a *list*; the original compared it to ''
    # (never true), so a miss crashed on problems[0] with IndexError
    # instead of falling back to 'A'. Test for an empty list instead.
    if not problems:
        return 'A'
    return problems[0]
def process_datas(datas, MODEL_NAME):
    """Fan every question out to the LLM API via a thread pool, collect answers.

    For each question of each record a prompt is built with ``get_prompt``
    and submitted through ``api_retry``; as futures complete, the extracted
    answer letter is written back into the owning record under
    ``questions[i]['answer']``.

    Note: the owning record dict is appended to the returned list once per
    answered question, so records with several questions appear repeatedly.
    """
    results = []
    with ThreadPoolExecutor(max_workers=16) as executor:
        future_data = {}
        submitted = 0
        for data in tqdm(datas, desc="Submitting tasks", total=len(datas)):
            background = data['problem']
            for q_index, question in enumerate(data['questions']):
                prompt = get_prompt(
                    background,
                    question['question'],
                    question['options'],
                )
                fut = executor.submit(api_retry, MODEL_NAME, prompt)
                future_data[fut] = (data, q_index)
                time.sleep(0.6)  # throttle: roughly one submission per 0.6 s
                submitted += 1
        for fut in tqdm(as_completed(future_data), total=submitted, desc="Processing tasks"):
            data, problem_id = future_data[fut]
            try:
                extract_response = extract(fut.result())
                data['questions'][problem_id]['answer'] = extract_response
                results.append(data)
            except Exception as e:
                logger.error(f"Failed to process text: {data}. Error: {e}")
    return results
def main(ifn, ofn):
    """Read the input jsonl file and run all its questions through the model.

    Parameters
    ----------
    ifn : str
        Input path: one JSON record per line (jsonl).
    ofn : str
        Output path. NOTE(review): the original only probed this path with
        ``os.path.exists`` and did nothing with the result — presumably a
        resume-from-previous-run feature was planned but never implemented.
        The dead branch is removed; the parameter is kept for interface
        compatibility.

    Returns
    -------
    list
        Whatever ``process_datas`` produces (one entry per answered question).
    """
    data = []
    # Read line-by-line: each line is an independent JSON record.
    with open(ifn, encoding="utf-8") as reader:
        for line in reader:
            data.append(json.loads(line))
    return_list = process_datas(data, MODEL_NAME)
    print(len(return_list))
    print("All tasks finished!")
    return return_list
def evaluate(ofn):
    """Score a predictions jsonl file against the gold answers.

    Counts over all questions in the file:
      tot -- questions carrying a prediction under the ``MODEL_NAME`` key,
      cnt -- of those, how many predictions equal the gold ``'answer'``,
      pse -- questions with no prediction at all (e.g. missed API calls).
    Prints: ``cnt tot accuracy pse``.

    BUG FIX: the original line read ``ef evaluate(ofn):`` — the ``def``
    keyword was truncated, which is a SyntaxError. Also guards the
    accuracy division against an empty / unscored file.
    """
    data = []
    with open(ofn, encoding="utf-8") as reader:
        for line in reader:
            data.append(json.loads(line))
    pse = 0
    cnt = 0
    tot = 0
    for task in data:
        for question in task['questions']:
            if MODEL_NAME in question:
                tot += 1
                # bool is an int subtype, so equality adds 0 or 1.
                cnt += question[MODEL_NAME] == question['answer']
            else:
                pse += 1
    # Avoid ZeroDivisionError when no question carried a prediction.
    acc = cnt / tot if tot else 0.0
    print(cnt, tot, acc, pse)
- 结果保存至
upload.jsonl
[{'problem': '有一群人和一些食物类型。下列是关于这些个体和食物的已知信息:\n\n1. 鸡肉是一种食物。\n2. 苹果是一种食物。\n3. 如果X吃了Y,且X活着,则Y是一种食物。\n4. Bill存活。\n5. Bill吃了花生。\n6. John吃所有食物。\n7. Sue吃所有Bill吃的食物。\n8. John喜欢所有食物。\n\n根据以上信息,回答以下选择题:', 'questions': [{'question': '选择题 1:\n谁喜欢吃花生?', 'options': ['Bill', 'Sue', 'John', 'None of the above'], 'answer': 'C'}], 'id': 'round1_test_data_000'}
由于是通过 API 调用,最终结果中可能存在未被成功回答(缺失)的问题。为保证整个流程的稳定性,可以在检查结果后重新发送缺失的问题,直至全部问题都得到回答,从而保证精确度。