目前发现将数据拆分成多个文件并行处理的效率较高,具体代码可参考如下。
1. 快速读
# -*- encoding=utf-8 -*-
import os
import pandas as pd
from datetime import timedelta, datetime
from multiprocessing import Pool, Process, Manager, Queue
def read_one(input_file):
    """Load a single CSV file into a DataFrame (pool worker for parallel reads)."""
    return pd.read_csv(input_file)
def load_all_data(dir_path, batch_num=30):
    """Read every CSV file in *dir_path* in parallel and concatenate them.

    Files are handed to a process pool in batches of ``batch_num``; each
    worker loads one file via :func:`read_one`.

    Parameters
    ----------
    dir_path : str
        Directory containing the CSV files to load.
    batch_num : int, optional
        Files per batch, and the size of the worker pool.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise; an empty DataFrame when the
        directory contains no files.
    """
    paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
    # os.listdir may return sub-directories; only hand real files to read_csv.
    paths = [p for p in paths if os.path.isfile(p)]
    if not paths:
        # pd.concat raises ValueError on an empty list -- return empty instead.
        return pd.DataFrame()

    res_list = []
    # Create the pool ONCE; spawning a fresh Pool for every batch pays the
    # process-startup cost over and over for no benefit.
    with Pool(batch_num) as pool:
        for count, start in enumerate(range(0, len(paths), batch_num), 1):
            batch = paths[start:start + batch_num]
            res = pool.map(read_one, batch)
            print("count:", count)
            # Concatenate per batch so finished frames can be freed early.
            res_list.append(pd.concat(res, axis=0))
    return pd.concat(res_list, axis=0)
if __name__ == "__main__":
    # Merge every CSV under ./dataset/ into one combined file.
    data_dir = "./dataset/"
    files_per_batch = 50
    combined = load_all_data(data_dir, files_per_batch)
    combined.to_csv("./dataset.csv", index=False)
2. 快速写
# -*- encoding=utf-8 -*-
import sys
import time
import pymysql
import datetime
import pandas as pd
from multiprocessing import Pool, Process, Manager, Queue
def load_finance_base(conn, ts_code, start_date, end_date):
    """Placeholder: fetch raw financial-report rows for *ts_code* between
    *start_date* and *end_date* over the DB connection *conn*.

    NOTE(review): the body is omitted in this snippet, so as written it
    returns None and any caller that treats the result as a DataFrame
    will fail until this is implemented.
    """
    pass
def get_finance_data(conn, ts_code, start_date, end_date):
    """Load financial-report data and keep one report per publish date.

    When several reports share the same (SecuCode, InfoPublDate), only
    the row with the most recent EndDate survives.
    """
    raw = load_finance_base(conn, ts_code, start_date, end_date)
    # Sort descending so that head(1) per group picks the newest EndDate.
    ordered = raw.sort_values(
        ["SecuCode", "InfoPublDate", "EndDate"], ascending=False
    )
    deduped = ordered.groupby(["SecuCode", "InfoPublDate"]).head(1)
    return deduped
def Run(stock_queue, start_date, end_date):
    """Worker: drain *stock_queue*, writing one feature CSV per stock.

    Each worker opens its own MySQL connection (DB connections cannot be
    shared across processes) and loops until the shared queue is empty.
    """
    from queue import Empty  # Manager queues raise queue.Empty on get_nowait

    conn = pymysql.connect(host='host', user='user', password='pwd',
                           db='db', port=3306, charset='utf8')
    try:
        while True:
            # get_nowait + Empty instead of `while not empty(): get()`:
            # the empty()/get() pair is not atomic, so another worker can
            # drain the queue in between and leave a blocking get() hung.
            try:
                stock = stock_queue.get_nowait()
            except Empty:
                break
            print("{} is calculating...".format(stock))
            df = get_finance_data(conn, ts_code=stock,
                                  start_date=start_date, end_date=end_date)
            df.to_csv("./features/X_finance/{}.csv".format(stock))
            print("{} is finished".format(stock))
    finally:
        # Close the connection even when a query or the CSV write raises.
        conn.close()
if __name__ == "__main__":
    start_date = "2018-09-01"
    end_date = "2021-05-20"
    start_time = datetime.datetime.now()

    # NOTE(review): load_company() is neither defined nor imported in this
    # snippet -- confirm where it comes from in the full project.
    stock_list = load_company()

    # A Manager queue can be shared safely between pool workers (a plain
    # multiprocessing.Queue cannot be passed through apply_async).
    stock_queue = Manager().Queue()
    for stock in stock_list:
        stock_queue.put(stock)

    max_process = 20
    pool = Pool(processes=max_process)
    # Keep the AsyncResult handles: without a later .get(), any exception
    # raised inside Run is silently swallowed by the pool.
    results = [
        pool.apply_async(Run, args=(stock_queue, start_date, end_date))
        for _ in range(max_process)
    ]
    pool.close()  # no new tasks after close(); must precede join()
    pool.join()   # wait for all workers to finish
    for r in results:
        r.get()   # re-raise any exception that occurred in a worker

    end_time = datetime.datetime.now()
    print("total elapsed time is:", end_time - start_time)