pandas之pd.read_csv 优化

import csv
import sys
import time
import pandas as pd
from numba import jit
import numpy as np

# Location of the CSV file handed to each reader in the __main__ section.
# Raw string so the Windows backslashes are not treated as escape sequences.
path = r"D:\allNEWSdata\ALLNEWS\news_csv\tag.csv"


def most_slow_read(path):
    """Baseline read: load the CSV with pandas defaults and report the time.

    Prints the resulting DataFrame followed by the elapsed wall-clock
    seconds. Nothing is returned.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
    """
    started = time.time()
    frame = pd.read_csv(path)
    print(frame)
    # The clock is stopped after printing, so rendering the frame is
    # included in the reported duration.
    elapsed = time.time() - started
    print('总共耗时:' + str(elapsed) + 's')


# 优化前
def slow_read(path):
    """Read a header-less CSV in a single call and report the time taken.

    The whole file is parsed into one DataFrame, which is printed together
    with the elapsed seconds. Rows with too many fields are skipped (with a
    warning) instead of aborting the read. Nothing is returned.

    Requires pandas >= 1.3 for ``on_bad_lines``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
    """
    # perf_counter is monotonic, so the measurement cannot be distorted by
    # system clock adjustments.
    started = time.perf_counter()
    df_train = pd.read_csv(
        path,
        header=None,  # the first line is treated as data, not column names
        sep=',',
        # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
        # 2.0, where it raises TypeError. 'warn' is its documented
        # replacement: skip the malformed row and emit a warning.
        on_bad_lines='warn',
        # Parse each column in one pass so dtype inference is not mixed.
        low_memory=False,
        keep_default_na=True,
    )

    print(df_train)
    elapsed = time.perf_counter() - started
    print('总共耗时:' + str(elapsed) + 's')


# 优化读取
# @jit(nopython=True)  # Set "nopython" mode for best performance, equivalent to @njit
def fast_read(path, chunksize=10000000):
    """Read a header-less CSV chunk by chunk and report the time taken.

    The file is parsed in pieces of ``chunksize`` rows. The very first row of
    the file is dropped (presumably a header line -- confirm against the data
    file); every other row is kept. The collected list of chunk DataFrames is
    printed, followed by the elapsed seconds. Nothing is returned.

    Requires pandas >= 1.3 for ``on_bad_lines``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        chunksize: Number of rows parsed per chunk.
    """
    # perf_counter is monotonic, so the measurement cannot be distorted by
    # system clock adjustments.
    started = time.perf_counter()
    na_vals = ["\\N", " ", "", "NULL"]

    # Using the reader as a context manager closes the underlying file handle
    # even if parsing a chunk fails.
    with pd.read_csv(
        path,
        header=None,  # the first line is treated as data, not column names
        sep=",",
        chunksize=chunksize,  # rows returned per iteration
        encoding="utf-8",
        # low_memory only controls whether the C parser internally splits its
        # work (less memory, but possibly mixed dtype inference). False keeps
        # inference consistent within each chunk; it does not save memory.
        low_memory=False,
        quoting=csv.QUOTE_NONE,  # quote characters get no special treatment
        # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
        # 2.0, where it raises TypeError. 'warn' is its documented
        # replacement: skip the malformed row and emit a warning.
        on_bad_lines="warn",
        keep_default_na=True,
        na_values=na_vals,  # extra markers parsed as NaN on top of the defaults
        iterator=True,
    ) as reader:
        # Only the first chunk starts with the file's first line. Slicing
        # every chunk with [1:] would silently discard one data row per chunk.
        df_tmp = [
            chunk[1:] if position == 0 else chunk
            for position, chunk in enumerate(reader)
        ]

    print(df_tmp)
    elapsed = time.perf_counter() - started
    print('总共耗时:' + str(elapsed) + 's')


if __name__ == '__main__':
    # Run the three readers in turn against the same file so that their
    # printed timings can be compared.
    for read_variant in (most_slow_read, slow_read, fast_read):
        read_variant(path)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值