pandas 快速读写大csv文件

目前发现,比较高效的做法是把大文件拆分成多个小文件,再用多进程并行读写;具体代码可参考下文。

1. 快速读

# -*- encoding=utf-8 -*-

import os
import pandas as pd
from datetime import timedelta, datetime
from multiprocessing import Pool, Process, Manager, Queue


def read_one(input_file):
    """Parse one CSV file into a DataFrame.

    Defined at module level and used as the worker function handed to
    ``Pool.map`` by ``load_all_data``.

    Args:
        input_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(input_file)

def load_all_data(dir_path, batch_num=30):
    """Read every file in a directory as CSV in parallel and concatenate them.

    The files are read by a single process pool, ``batch_num`` files at a
    time, and a progress line is printed after each batch.

    Args:
        dir_path: Directory whose regular files are all read with
            ``read_one``. Subdirectories are skipped.
        batch_num: Number of files submitted per batch; also the upper bound
            on the number of worker processes.

    Returns:
        One DataFrame with the rows of all files stacked along axis 0, in
        sorted file-path order. Row indexes are not reset, so each file's own
        index values are retained. An empty DataFrame is returned when the
        directory contains no files.

    Raises:
        ValueError: If ``batch_num`` is smaller than 1.
    """
    if batch_num < 1:
        raise ValueError("batch_num must be at least 1")

    # os.listdir() returns entries in arbitrary order; sort so the row order
    # of the result is reproducible between runs.
    candidates = sorted(os.path.join(dir_path, name) for name in os.listdir(dir_path))
    file_list = [path for path in candidates if os.path.isfile(path)]

    # pd.concat() raises ValueError on an empty list, so handle "no files" here.
    if not file_list:
        return pd.DataFrame()

    frames = []
    # One pool for the whole run (instead of a fresh pool per batch); the
    # context manager also tears the workers down if a read raises.
    with Pool(min(batch_num, len(file_list))) as pool:
        batch_starts = range(0, len(file_list), batch_num)
        for count, start in enumerate(batch_starts, start=1):
            batch = file_list[start:start + batch_num]
            frames.extend(pool.map(read_one, batch))
            print("count:", count)

    return pd.concat(frames, axis=0)

if __name__ == "__main__":
    # Merge every file under ./dataset/ (50 files per batch) and write the
    # combined table to a single CSV without the index column.
    df_all = load_all_data("./dataset/", 50)
    df_all.to_csv("./dataset.csv", index=False)

 

2. 快速写

# -*- encoding=utf-8 -*-
import sys
import time
import pymysql
import datetime
import pandas as pd
from multiprocessing import Pool, Process, Manager, Queue


def load_finance_base(conn, ts_code, start_date, end_date):
    """Placeholder for the query that loads raw financial-report rows.

    Not implemented in this listing: the arguments are ignored and ``None``
    is returned. ``get_finance_data`` calls ``sort_values`` on the result
    using the columns ``SecuCode``, ``InfoPublDate`` and ``EndDate``, so a
    real implementation presumably returns a DataFrame with those columns
    (TODO confirm).
    """
    return None


def get_finance_data(conn, ts_code, start_date, end_date):
    """Load financial-report rows and keep one row per publication date.

    Args:
        conn: Database connection forwarded to ``load_finance_base``.
        ts_code: Stock code forwarded to ``load_finance_base``.
        start_date: Range start forwarded to ``load_finance_base``.
        end_date: Range end forwarded to ``load_finance_base``.

    Returns:
        A DataFrame holding, for every (``SecuCode``, ``InfoPublDate``) pair,
        the row that sorts first when ordered descending by ``SecuCode``,
        ``InfoPublDate`` and ``EndDate`` (i.e. the largest ``EndDate``).
    """
    group_keys = ["SecuCode", "InfoPublDate"]

    raw = load_finance_base(conn, ts_code, start_date, end_date)

    # Several reports may be published on the same day: order everything
    # descending so the newest EndDate comes first within each group, then
    # take the first row of every group.
    ordered = raw.sort_values(group_keys + ["EndDate"], ascending=False)
    return ordered.groupby(group_keys).head(1)

def Run(stock_queue, start_date, end_date):
    """Worker loop: drain the shared queue and write one CSV per stock.

    Opens one MySQL connection, then repeatedly takes a stock code from
    ``stock_queue``, builds its DataFrame with ``get_finance_data`` and
    writes it to ``./features/X_finance/<stock>.csv``. The loop ends when the
    queue has no more items.

    Args:
        stock_queue: Queue of stock codes shared between worker processes.
        start_date: Range start forwarded to ``get_finance_data``.
        end_date: Range end forwarded to ``get_finance_data``.
    """
    from queue import Empty  # raised by get_nowait() once the queue is drained

    conn = pymysql.connect(host='host', user='user', password='pwd', db='db', port=3306, charset='utf8')
    try:
        while True:
            # Checking empty() and then calling a blocking get() is a race:
            # another worker can take the last item in between, leaving this
            # worker blocked forever. A non-blocking get avoids that.
            try:
                stock = stock_queue.get_nowait()
            except Empty:
                break

            print("{} is calculating...".format(stock))
            df = get_finance_data(conn, ts_code=stock, start_date=start_date, end_date=end_date)
            df.to_csv("./features/X_finance/{}.csv".format(stock))
            print("{} is finished".format(stock))
    finally:
        # Release the connection even when a query or a write fails.
        conn.close()

if __name__ == "__main__":
    start_date = "2018-09-01"
    end_date = "2021-05-20"

    start_time = datetime.datetime.now()
    # NOTE(review): load_company() is not defined in this listing; it is
    # presumably expected to return an iterable of stock codes - confirm.
    stock_list = load_company()

    # A manager-backed queue is used so it can be passed to pool workers as
    # an ordinary argument.
    stock_queue = Manager().Queue()
    for stock in stock_list:
        stock_queue.put(stock)

    max_process = 20
    pool = Pool(processes=max_process)
    # Keep the AsyncResult handles: an exception raised inside a worker is
    # only reported when get() is called on its result.
    results = [
        pool.apply_async(Run, args=(stock_queue, start_date, end_date))
        for _ in range(max_process)
    ]

    pool.close()  # no more tasks will be submitted; must precede join()
    pool.join()   # wait for every worker process to finish

    # Report worker failures instead of dropping them silently.
    for worker_id, result in enumerate(results):
        try:
            result.get()
        except Exception as exc:
            print("worker {} failed: {!r}".format(worker_id, exc))

    end_time = datetime.datetime.now()
    print("total elapsed time is:", end_time - start_time)

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值