背景:适用于IO密集型任务。
应用举例:
- 数据库查询。
- 文件系统操作。
- 网络服务器处理请求。
- 大批量地从某个服务获取信息。
代码举例:
def worker_core(json_data):
    """Send one chat request to the GPT service for a single sample.

    Parameters
    ----------
    json_data : dict
        One sample; the prompt is read from
        ``json_data["conversations"][0]["value"]``.

    Returns
    -------
    dict
        The raw service response with the original sample attached under
        the ``"data"`` key (deep-copied so later mutation of the response
        cannot alias the caller's input).
    """
    import copy  # hoisted to the top of the function (was mid-body)

    response = send_chat_request(
        system="You are a helpful assistant.",
        examples=[],
        question=json_data["conversations"][0]["value"],  # prompt
        temperature=0.7,
        top_p=0.8,
        engine="xxxxxxxxxxxxxxxxx",
        max_tokens=2048,
        priority=5,
    )
    # Keep a pristine copy of the input alongside the model output.
    response["data"] = copy.deepcopy(json_data)
    return response
def worker(json_data, max_try=10):
    """Fetch and save GPT output for one sample, over 3 iterations.

    For each iteration ``i`` in 0..2 the result is written to
    ``<dirname(save_path)>/gpt4/<i>/<basename(save_path)>``.  An iteration
    whose output file already exists is skipped.  Each iteration retries
    the service call up to ``max_try`` times; on the first response that
    passes ``check_format`` the file is written and retrying stops.  If
    every attempt fails, the last raw response is still saved with
    ``"response"`` set to None so the failure is recorded on disk.

    Parameters
    ----------
    json_data : dict
        Sample carrying ``meta.save_path`` and the prompt; passed through
        to ``worker_core``.
    max_try : int, optional
        Maximum service attempts per iteration (default 10).
    """
    for iteration in range(3):
        save_path = json_data["meta"]["save_path"]
        save_dpath = os.path.dirname(save_path)
        save_fname = os.path.basename(save_path)
        newd = os.path.join(save_dpath, "gpt4", str(iteration))
        new_save_path = os.path.join(newd, save_fname)
        if os.path.exists(new_save_path):
            # Already done for this iteration; move on to the next one.
            # (The original `return` here skipped the remaining
            # iterations entirely, contradicting the 3-iteration intent.)
            continue
        safe_mkdir(newd)
        result = None
        # Initialize so the fallback below cannot hit a NameError when
        # check_format raises on the very first attempt.
        response = None
        for _ in range(max_try):
            result = worker_core(json_data)
            print(json_data)
            try:
                response = check_format(result)
            except Exception as e:
                print(e)
                response = None
            if response is not None:
                result["raw_response"] = result["response"]
                result["response"] = response
                with open(new_save_path, "w") as f:
                    json.dump(result, f, indent=4, ensure_ascii=False)
                break  # success: stop retrying this iteration
        if response is None and result is not None:
            # Every attempt failed format checking; persist the last raw
            # response so the failure is visible rather than lost.
            result["raw_response"] = result["response"]
            result["response"] = None
            with open(new_save_path, "w") as f:
                json.dump(result, f, indent=4, ensure_ascii=False)
def parallel_execution(demo_json_list, n_jobs=100):
    """Run ``worker`` over every sample using a thread pool.

    Threads fit here because the work is I/O-bound (network requests);
    the pool size never exceeds the number of samples.

    Parameters
    ----------
    demo_json_list : list
        Samples to process, one per ``worker`` call.
    n_jobs : int, optional
        Upper bound on concurrent threads (default 100).
    """
    pool_size = min(n_jobs, len(demo_json_list))
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        # Drain the lazy map through tqdm so progress is displayed.
        for _ in tqdm(pool.map(worker, demo_json_list), total=len(demo_json_list)):
            pass
# Entry point: fan the whole dataset out across 50 worker threads.
parallel_execution(demo_json_list, n_jobs=50)  # demo_json_list is the data loaded via json.load