准备工作:当前文件的上级目录下有一个excels目录,目录里存放了15份.xls文件,每个文件包含1000条数据。目标是通过多进程将这些文件读取为pandas的DataFrame格式。
手动创建多进程读取文件(进程数等于任务数)
# @datetime:6/26/0026
"""通过多进程加速读取excel的测试"""
__author__ = "hanyaning@deri.energy"
import os.path
import time
from service import logger
import pandas as pd
from multiprocessing import Process, Manager
# Wall-clock timestamp taken when this module's top level runs.
# Presumably the baseline for elapsed-time reporting later in the script
# (that code is not visible here -- TODO confirm).
# NOTE(review): under the "spawn" start method the module top level is executed
# again in every child process, so each child would get its own startTime.
startTime = time.time()
# Rebinds the name `logger`: before this line it refers to the imported
# `service.logger` module, afterwards to the object returned by
# MyLogger("multi_process").getLogger(). The module is no longer reachable
# through this name from here on.
logger = logger.MyLogger("multi_process").getLogger()
def getExcelData(path, return_data=None, file_name=""):
    """Read one Excel workbook into a pandas DataFrame, logging start and finish.

    Args:
        path: Filesystem path of the workbook to read.
        return_data: Optional mapping that receives the DataFrame under
            ``file_name``. Presumably a ``Manager().dict()`` proxy shared with
            the parent process -- TODO confirm against the caller. When it is
            ``None`` the DataFrame is only returned, not stored.
        file_name: Key under which the DataFrame is stored in ``return_data``.

    Returns:
        The DataFrame read from ``path``, or ``None`` when ``path`` exists but
        is not a regular file (for example a directory); nothing is read then.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # The pid is logged so start/finish lines of concurrent workers can be paired.
    pid = str(os.getpid())
    logger.info("开始读取Excel文件,当前进程pid:" + pid)

    if not os.path.exists(path):
        # Include the offending path so the failure is diagnosable from a child
        # process traceback.
        raise FileNotFoundError("Excel path does not exist: {!r}".format(path))

    if not os.path.isfile(path):
        # Exists but is not a regular file: silently skipped, as before.
        return None

    # skiprows=1 / skipfooter=1 drop the first and the last row of the sheet.
    frame = pd.read_excel(path, skiprows=1, skipfooter=1)
    if return_data is not None:
        return_data[file_name] = frame
    logger.info("读取Excel文件完毕,当前进程pid:" + pid)
    return frame
if __name__ == "__main__":
excel_path = os.path.join(os.getcwd(), "../excels")
xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")