import pandas as pd
import os
import _thread
def read_data(tsv_name: str, chunk_size: int, encoding_type: str):
    """Open a tab-separated file for chunk-by-chunk reading.

    :param tsv_name: path of the TSV file
    :param chunk_size: number of rows per chunk
    :param encoding_type: text encoding of the file
    :return: the object ``pd.read_csv`` returns when ``chunksize`` is given;
        iterating it yields one chunk of rows at a time
    """
    reader_options = {
        "sep": "\t",
        "chunksize": chunk_size,
        "encoding": encoding_type,
        "low_memory": False,
    }
    return pd.read_csv(tsv_name, **reader_options)
def iter_count(file_name: str):
    """Count the lines of a file without loading it into memory at once.

    The file is read in 1 MiB blocks and newline characters are counted.
    A final line that is not terminated by a newline is counted as well,
    so ``"a\\nb"`` and ``"a\\nb\\n"`` both report two lines and an empty
    file reports zero.

    :param file_name: path (including directories) of the file to inspect
    :return: number of lines in the file
    """
    buffer = 1024 * 1024
    newline_total = 0
    # Treat "nothing read yet" like a terminated line so that an empty
    # file is reported as having zero lines.
    last_char = "\n"
    # errors='ignore' drops undecodable bytes, so counting does not fail
    # on files that are not in the platform's default encoding.
    with open(file_name, errors='ignore') as f:
        for block in iter(lambda: f.read(buffer), ""):
            newline_total += block.count("\n")
            last_char = block[-1]
    # An unterminated last line has no '\n' of its own; count it here.
    if last_char != "\n":
        newline_total += 1
    return newline_total
def split_data(tsv_name: str, split_num: int, encoding_type: str):
    """Split a large TSV file into smaller CSV files.

    Output is written to ``./splits_<tag>/chunk_<tag>_<n>.csv`` where
    ``<tag>`` is ``tsv_name[-8:-4]`` (the four characters in front of a
    four-character suffix such as ``.tsv``) and ``<n>`` starts at 1.
    Chunks are written by worker threads; the function returns only after
    every chunk has been written, and re-raises the first write error.

    :param tsv_name: path of the TSV file to split
    :param split_num: desired number of output files
    :param encoding_type: text encoding of the TSV file
    """
    # Function-scope import keeps this block self-contained.
    from concurrent.futures import ThreadPoolExecutor

    tag = tsv_name[-8:-4]
    splits_dir = f"./splits_{tag}"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(splits_dir, exist_ok=True)

    # Chunk size is ceil(line count / split_num).
    # NOTE(review): the count covers every line of the file, presumably
    # including the header row, so the number of files produced can be
    # lower than split_num.
    count = iter_count(tsv_name)
    chunk_size, remainder = divmod(count, split_num)
    if remainder:
        chunk_size += 1

    data = read_data(tsv_name, chunk_size, encoding_type)

    def write_data(_idx: int, _chunk):
        """Write one chunk to its numbered CSV file (1-based)."""
        file_path = f"{splits_dir}/chunk_{tag}_{_idx + 1}.csv"
        _chunk.to_csv(file_path, index=False)

    # Leaving the ``with`` block waits for all workers, so no write is
    # still pending (or silently killed at interpreter exit) on return.
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(write_data, idx, chunk) for idx, chunk in enumerate(data)]
    # Surface any exception raised inside a worker thread.
    for task in pending:
        task.result()
# Improved approach written later (后来改进的方法):
@staticmethod
def readTsvData(path: str, encoding: str = "gb18030", start: int = 0, chunkSize: int = 10000):
    """
    Read one slice of data rows from a TSV file into a DataFrame.

    The first line of the file supplies the column names; ``start`` and
    ``chunkSize`` count the lines that follow it. Bytes that cannot be
    decoded with ``encoding`` are dropped.

    :param path: str path of the TSV file
    :param encoding: str text encoding of the file
    :param start: int zero-based index of the first data row to read
    :param chunkSize: int maximum number of data rows to read
    :return: DataFrame of string fields; it has fewer than ``chunkSize``
        rows once the end of the file has been reached
    :raises ValueError: if ``start`` or ``chunkSize`` is negative
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    if start < 0 or chunkSize < 0:
        raise ValueError("start and chunkSize must be non-negative")

    def split_fields(line):
        # Remove only the line terminator. Slicing off a fixed two
        # characters would eat real data on LF-terminated lines and on
        # an unterminated last line.
        return line.rstrip("\r\n").split("\t")

    with open(path, encoding=encoding, errors="ignore") as tsvFile:
        header_line = tsvFile.readline()
        # An empty file has no header line; leave the columns unset.
        header = split_fields(header_line) if header_line else None
        # islice streams past the first `start` rows and stops after
        # `chunkSize` rows, so the file is never held in memory whole.
        wanted_lines = islice(tsvFile, start, start + chunkSize)
        data = [split_fields(line) for line in wanted_lines]
    return pd.DataFrame(data, columns=header)
@classmethod
def splitDataByTrunk(cls, file_path: str, chunk_size: int, save_path: str, encoding: str = "gb18030"):
    """
    Split a TSV file into CSV files of at most ``chunk_size`` rows each.

    Files are written to ``<save_path>/<name>/<name>_<start>.csv``, where
    ``<name>`` is the input file name without its extension and
    ``<start>`` is the index of the first data row in that file.

    :param file_path: str path of the TSV file to read
    :param chunk_size: int number of data rows per output file
    :param save_path: str directory under which the output folder is created
    :param encoding: str text encoding of the input file
    :return: None; the output files are UTF-8 encoded CSV
    :raises ValueError: if ``chunk_size`` is not positive
    """
    # A chunk size of zero would never satisfy the stop condition below.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    save_path = os.path.join(save_path, base_name)
    # makedirs also creates missing parent directories.
    os.makedirs(save_path, exist_ok=True)

    start = 0
    while True:
        # Each call asks readTsvData for the rows from `start` onwards.
        data = cls.readTsvData(file_path, encoding=encoding, start=start, chunkSize=chunk_size)
        # When the row count is an exact multiple of chunk_size the last
        # read comes back empty; skip it so no header-only file is left
        # behind. The first chunk is always written, so an input without
        # data rows still produces one file.
        if start == 0 or len(data) > 0:
            target = os.path.join(save_path, base_name + "_" + str(start) + ".csv")
            data.to_csv(target, encoding="utf-8")
        # A short chunk means the end of the input has been reached.
        if len(data) < chunk_size:
            break
        start += chunk_size