pandas之pd.read_csv 优化-CSDN博客

本文探讨了在Python中使用Pandas库读取CSV文件的不同方法，并对比了它们的执行效率。通过采用分块读取（chunksize）、自定义NA值（na_values）和关闭引号解析（quoting=csv.QUOTE_NONE）等策略，实现了对大数据集读取速度的有效提升。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import csv
import inspect
import sys
import time

import numpy as np
import pandas as pd
from numba import jit

# Input CSV handed to each of the three read benchmarks in the __main__ guard.
# Hard-coded Windows location; the raw string keeps the backslashes literal.
path = r"D:\allNEWSdata\ALLNEWS\news_csv\tag.csv"


def most_slow_read(path):
    """Baseline benchmark: load a CSV with ``pd.read_csv`` defaults.

    The resulting DataFrame is printed, followed by a line reporting the
    elapsed wall-clock time in seconds (printing is included in the timing).

    Args:
        path: Location of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        None. All output goes to stdout.
    """
    started = time.time()
    frame = pd.read_csv(path)
    print(frame)
    elapsed = time.time() - started
    # The label reads "total time taken:"; the value is in seconds.
    print(f'总共耗时：{elapsed}s')


# Before optimisation
def slow_read(path):
    """Read a headerless CSV in a single pass, skipping malformed rows.

    The file is parsed with ``header=None`` (so the first line is treated as
    data), the resulting DataFrame is printed, and then a line reporting the
    elapsed wall-clock time in seconds is printed (printing is included in
    the timing).

    Args:
        path: Location of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        None. All output goes to stdout.
    """
    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0,
    # where passing it raises TypeError. Prefer its replacement
    # ``on_bad_lines="skip"`` and only fall back to the legacy keyword on
    # pandas versions that predate it. Resolved before the timer starts so
    # the check does not skew the measurement.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        skip_bad_rows = {"on_bad_lines": "skip"}
    else:
        skip_bad_rows = {"error_bad_lines": False}

    time1 = time.time()
    df_train = pd.read_csv(
        path,
        header=None,
        sep=',',
        # Infer each column's dtype from the whole file at once instead of
        # from internal parser chunks, which can yield mixed-type columns.
        low_memory=False,
        keep_default_na=True,
        **skip_bad_rows,
    )

    print(df_train)
    time2 = time.time()
    # The label reads "total time taken:"; the value is in seconds.
    print(u'总共耗时：' + str(time2 - time1) + 's')


# Optimised read
# NOTE: numba's @jit(nopython=True) is not applied here; the heavy lifting
# happens inside pandas' parser rather than in Python-level loops.
def fast_read(path, chunksize=10000000):
    """Read a headerless CSV chunk by chunk with quoting disabled.

    The file is parsed in chunks of ``chunksize`` rows. The very first row of
    the file is discarded; every other row is kept. The list of chunk
    DataFrames is printed, followed by a line reporting the elapsed
    wall-clock time in seconds (printing is included in the timing).

    Args:
        path: Location of the CSV file, passed unchanged to ``pd.read_csv``.
        chunksize: Number of rows per chunk. Defaults to ten million.

    Returns:
        None. All output goes to stdout.
    """
    # Extra strings to treat as missing values, on top of pandas' defaults.
    na_vals = ["\\N", " ", "", "NULL"]

    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0,
    # where passing it raises TypeError. Prefer its replacement
    # ``on_bad_lines="skip"`` and only fall back to the legacy keyword on
    # pandas versions that predate it.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        skip_bad_rows = {"on_bad_lines": "skip"}
    else:
        skip_bad_rows = {"error_bad_lines": False}

    time1 = time.time()
    df_tmp = []
    # Passing ``chunksize`` already makes read_csv return an iterator of
    # DataFrames, so a separate ``iterator=True`` is unnecessary.
    reader = pd.read_csv(
        path,  # file path
        header=None,
        sep=",",  # field delimiter
        chunksize=chunksize,  # rows per chunk
        encoding="utf-8",  # file encoding
        # low_memory (default True) parses the file in internal chunks to
        # lower memory use while parsing, at the risk of mixed-type column
        # inference. False avoids that (an explicit dtype would too).
        # It does not by itself split the result; chunksize does that.
        low_memory=False,
        # Quoting convention: QUOTE_NONE treats quote characters as ordinary
        # data (the alternatives include csv.QUOTE_ALL).
        quoting=csv.QUOTE_NONE,
        keep_default_na=True,
        na_values=na_vals,
        **skip_bad_rows,  # skip malformed rows instead of raising
    )
    try:
        for position, chunk in enumerate(reader):
            # Drop only the first row of the file. Slicing every chunk with
            # [1:] would silently lose one data row per chunk on inputs that
            # span more than one chunk.
            # NOTE(review): presumably that first row is a header line -
            # confirm against the input data.
            df_tmp.append(chunk.iloc[1:] if position == 0 else chunk)
    finally:
        # Release the file handle even if parsing fails part-way through.
        reader.close()

    print(df_tmp)
    time2 = time.time()
    # The label reads "total time taken:"; the value is in seconds.
    print(u'总共耗时：' + str(time2 - time1) + 's')


if __name__ == '__main__':
    # Run the three benchmarks back to back on the same input file:
    # pandas defaults first, then the single-pass read, then the chunked read.
    for read_fn in (most_slow_read, slow_read, fast_read):
        read_fn(path)