import multiprocessing as mp
import traceback
import pandas as pd
def process_df_chunk(df_chunk: pd.DataFrame, iterrows_map: dict) -> pd.DataFrame:
    """
    Apply every mapped function to the eligible rows of one DataFrame chunk.

    Each key of ``iterrows_map`` is a callable and each value is a sequence
    read as ``[first_output_column, second_output_column, input_column,
    extra_argument, ...]``.  For every row whose ``language_type`` is ``'zh'``
    or ``'en'`` the callable is invoked as
    ``func(row[input_column], extra_argument)``; it must return exactly two
    values, which are written to the two output columns of that row.  Rows
    with any other ``language_type`` are left untouched.

    :param df_chunk: DataFrame to process; it is modified in place
    :param iterrows_map: ``{callable: [out_col_1, out_col_2, in_col, arg, ...]}``
    :return: the same ``df_chunk`` object, after modification
    """
    supported_languages = ('zh', 'en')
    for func, spec in iterrows_map.items():
        # A fresh iterrows() pass is started for each function, so a later
        # function reads the values written by an earlier one.
        for idx, row in df_chunk.iterrows():
            if row['language_type'] not in supported_languages:
                continue
            # The spec entries are indexed lazily so that a malformed spec
            # only fails when a row actually needs it.
            first, second = func(row[spec[2]], spec[3])
            df_chunk.at[idx, spec[0]] = first
            df_chunk.at[idx, spec[1]] = second
    return df_chunk
def process_df(df: pd.DataFrame, iterrows_map: dict, num_processes=6) -> pd.DataFrame:
    """
    Split ``df`` into row chunks, process them in a process pool, reassemble.

    The frame is cut into at most ``num_processes`` contiguous row chunks of
    near-equal size.  Each chunk is handed to ``process_df_chunk`` together
    with ``iterrows_map`` in a worker process, and the processed chunks are
    concatenated back in their original order.

    NOTE: the chunks and ``iterrows_map`` are sent to worker processes, so
    the callables used as keys must be picklable.

    :param df: DataFrame to process
    :param iterrows_map: ``{callable: [out_col_1, out_col_2, in_col, arg, ...]}``,
        passed unchanged to ``process_df_chunk``
    :param num_processes: maximum number of worker processes (and chunks)
    :return: a new DataFrame built from the processed chunks
    :raises ValueError: if ``num_processes`` is smaller than 1
    """
    if num_processes < 1:
        raise ValueError("num_processes must be at least 1")

    total_rows = df.shape[0]
    if total_rows == 0:
        # pd.concat() raises ValueError on an empty list, so there is
        # nothing to dispatch; hand back an independent empty frame.
        return df.copy()

    # Ceiling division: spread the rows over at most ``num_processes`` chunks
    # rather than creating one tiny chunk per ``num_processes`` rows.
    chunk_size = -(-total_rows // num_processes)
    # iloc keeps the slicing positional regardless of the index type.
    df_chunks = [
        df.iloc[start:start + chunk_size]
        for start in range(0, total_rows, chunk_size)
    ]

    # Never start more workers than there are chunks to process.
    with mp.Pool(processes=min(num_processes, len(df_chunks))) as pool:
        df_list = pool.starmap(
            process_df_chunk,
            [(chunk, iterrows_map) for chunk in df_chunks],
        )
    return pd.concat(df_list)
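# Related article: "Python: submit methods to a thread pool for multithreaded execution and double their throughput!"
# Latest recommended article published on 2024-06-17 17:35:06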