超大文件排序的主要步骤为:先切割成若干小文件,再对每个小文件各自排序,最后归并为一个有序的大文件。
一、pandas切割超大文件:
import time
import pandas as pd
from tqdm import tqdm
# Module-level counter initialised to zero. reader_pandas() below does not read
# or modify it (it keeps its own local count).
# NOTE(review): no other use is visible in this part of the file -- confirm
# before removing.
i = 0
def reader_pandas(file, sep='\t', chunkSize=5000000, patitions=21, header=None):
    """Split a very large delimited file into numbered chunk files.

    The input is streamed chunk by chunk, so it is never loaded into memory
    as a whole. Chunk number ``k`` (counting from 1) is written to
    ``sorted<k>.csv`` in the current working directory, without an index
    column and without a header row, using ``DataFrame.to_csv``'s default
    comma separator.

    Args:
        file: Path (or file-like object) of the input file.
        sep: Field separator used when reading the input.
        chunkSize: Maximum number of rows per output file.
        patitions: Expected number of chunks. It is used only as the total
            of the progress bar and does not limit how many chunks are
            written.
        header: Row number(s) holding the column names, passed to
            ``pd.read_csv``. ``None`` means the file has no header row, so
            every line is treated as data.

    Returns:
        None. The result is the set of ``sorted<k>.csv`` files on disk.
    """
    # `sep` and `header` must reach read_csv: otherwise the file is parsed as
    # comma-separated and its first line is consumed as a header and lost.
    reader = pd.read_csv(file, sep=sep, header=header, chunksize=chunkSize)
    try:
        # Iterate until the reader is exhausted instead of `patitions` times,
        # so rows beyond patitions * chunkSize are not silently dropped.
        with tqdm(reader, desc='Reading ...', total=patitions) as progress:
            for index, chunk in enumerate(progress, start=1):
                chunk.to_csv('sorted' + str(index) + '.csv',
                             index=False, header=False)
    finally:
        # Release the underlying file handle even if a write fails.
        reader.close()
#也可用