这里的例子将开始日期和结束日期之间的每一天展开
进程数=cpu数量 速度提高约n倍
# multi-process: one worker process per CPU core (multiprocessing.Pool)
from datetime import datetime, timedelta
from functools import partial
import multiprocessing
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used by parall_func
# below) is available; 'pandas bar' is the label passed for the progress bar.
tqdm.pandas(desc='pandas bar')
def parallelize_dataframe(df, func, **kwargs):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    ``df`` is cut into contiguous row blocks (at most one per CPU core and
    never more than there are rows), ``func(chunk, **kwargs)`` is evaluated
    for each block in a ``multiprocessing.Pool``, and the per-block results
    are concatenated in their original order.

    Args:
        df: pandas object to process; it is split along axis 0.
        func: callable that takes one chunk (plus ``**kwargs``) and returns a
            pandas object. It is sent to the worker processes, so it must be
            picklable (e.g. a module-level function, not a lambda).
        **kwargs: extra keyword arguments bound to ``func`` for every chunk.

    Returns:
        The ``pd.concat`` of the per-chunk results.

    Raises:
        Whatever ``func`` raises in a worker is re-raised here by
        ``Pool.map``; the pool is shut down in that case as well.
    """
    worker = partial(func, **kwargs)
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to distribute: run in-process instead of starting a pool.
        return pd.concat([worker(df)])

    # Never create more chunks (or workers) than there are rows, so no
    # process is started just to receive an empty frame.
    n_chunks = min(multiprocessing.cpu_count(), n_rows)

    # Split the row *positions* and slice with iloc. Calling np.array_split
    # directly on a DataFrame can emit a deprecation warning on some
    # pandas/numpy versions; the resulting partition sizes are the same.
    chunks = [
        df.iloc[positions[0]:positions[-1] + 1]
        for positions in np.array_split(np.arange(n_rows), n_chunks)
    ]

    # The context manager terminates the pool if ``func`` raises, so worker
    # processes are not leaked; on success the pool is closed and joined
    # normally before leaving the block.
    with Pool(n_chunks) as pool:
        results = pool.map(worker, chunks)
        pool.close()
        pool.join()
    return pd.concat(results)
# Parse the fixed date string '20200101' (format %Y%m%d) with pandas.
# NOTE(review): this module-level `start_date` is not referenced by any code
# visible here (process_func reads row['start_date'] instead) — confirm it is
# still needed.
start_date = pd.to_datetime('20200101',format='%Y%m%d')
def process_func(row):
    """Return a copy of ``row`` with a ``'date'`` entry listing each day in its range.

    ``row['start_date']`` and ``row['end_date']`` are converted with ``str``
    and parsed as ``%Y%m%d``. The new ``'date'`` entry is a list of
    ``%Y%m%d`` strings for every day from start to end, both inclusive.
    If the end date is earlier than the start date the list is empty.

    Args:
        row: mapping-like row (e.g. the Series passed by
            ``DataFrame.apply(..., axis=1)``) with ``'start_date'`` and
            ``'end_date'`` entries.

    Returns:
        A copy of ``row`` with the additional ``'date'`` list.

    Raises:
        ValueError: if either date does not match ``%Y%m%d``.
    """
    date_format = '%Y%m%d'
    start = datetime.strptime(str(row['start_date']), date_format)
    end = datetime.strptime(str(row['end_date']), date_format)
    n_days = (end - start).days + 1

    # Work on a copy: pandas does not support functions passed to ``apply``
    # mutating the object they receive.
    expanded = row.copy()
    expanded['date'] = [
        (start + timedelta(days=offset)).strftime(date_format)
        for offset in range(n_days)
    ]
    return expanded
def parall_func(df):
    """Run ``process_func`` on every row of ``df`` with a progress bar.

    Uses ``progress_apply`` with ``axis=1`` and returns its result.
    """
    expanded_rows = df.progress_apply(process_func, axis=1)
    return expanded_rows
# Pass the chunk worker defined in the previous step (parall_func) together
# with the dataset to parallelize_dataframe.
# NOTE(review): `activity_info_pd` is not defined in the visible code — it is
# presumably loaded earlier; confirm.
# NOTE(review): this starts a multiprocessing pool at module level; on
# platforms that spawn worker processes it presumably needs an
# `if __name__ == '__main__':` guard — confirm.
activity_info_pd = parallelize_dataframe(activity_info_pd, parall_func)
# Turn each element of the per-row 'date' list into its own row.
activity_info_pd=activity_info_pd.explode('date')