Python：将几 GB 的大型 tsv 文件拆分为较小的 csv 文件

Dragon Wu

已于 2022-03-18 19:13:59 修改

阅读量771

点赞数 2

分类专栏：数据挖掘、python　文章标签：python、开发语言、数据挖掘、数据分析

于 2022-01-10 10:07:19 首次发布

本文链接：https://blog.csdn.net/qq_50909707/article/details/122404380

版权

python 同时被 2 个专栏收录

35 篇文章 6 订阅

订阅专栏

数据挖掘

22 篇文章 4 订阅

订阅专栏


import pandas as pd
import os
import _thread



def read_data(tsv_name: str, chunk_size: int, encoding_type: str):
    """Open a tab-separated file for chunked reading with pandas.

    :param tsv_name: name (path) of the TSV file to read
    :param chunk_size: number of rows per chunk, passed to pandas as ``chunksize``
    :param encoding_type: text encoding passed to pandas as ``encoding``
    :return: whatever ``pd.read_csv`` returns when ``chunksize`` is given,
             i.e. an iterable yielding one DataFrame per chunk
    """
    reader_options = {
        "sep": "\t",
        "chunksize": chunk_size,
        "encoding": encoding_type,
        "low_memory": False,
    }
    return pd.read_csv(tsv_name, **reader_options)



def iter_count(file_name: str) -> int:
    """Count the lines of a text file without loading it all into memory.

    The file is read in fixed-size pieces in text mode (undecodable bytes are
    ignored) and the newline characters are counted. A final line that is not
    terminated by a newline is counted as a line too, so ``"a\\nb"`` has two
    lines, not one. An empty file has zero lines.

    :param file_name: file name, including its path
    :return: number of lines in the file
    """
    buffer = 1024 * 1024
    total = 0
    # Start as if the previous character were a newline so that an empty file
    # does not get an extra line added below.
    last_char = '\n'
    with open(file_name, errors='ignore') as f:
        # iter(callable, sentinel) keeps reading until read() returns ''.
        for buf in iter(lambda: f.read(buffer), ''):
            total += buf.count('\n')
            last_char = buf[-1]
    if last_char != '\n':
        # The last line has content but no terminating newline.
        total += 1
    return total



def split_data(tsv_name: str, split_num: int, encoding_type: str, max_workers: int = 4):
    """Split a large TSV file into several smaller CSV files.

    The output goes to ``./splits_<tag>/chunk_<tag>_<i>.csv`` (``i`` starts at
    1), where ``<tag>`` is ``tsv_name[-8:-4]``, i.e. the four characters that
    precede a 4-character extension such as ``.tsv``.

    Chunks are written by a small pool of worker threads. The function only
    returns after every chunk has been written, and an error raised while
    writing a chunk is re-raised to the caller.

    :param tsv_name: name (path) of the TSV file to split
    :param split_num: number of pieces to split the file into
    :param encoding_type: text encoding of the TSV file
    :param max_workers: maximum number of concurrent writer threads; it also
                        bounds how many chunks are held in memory at once
    :raises ValueError: if ``split_num`` or ``max_workers`` is smaller than 1
    """
    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

    if split_num < 1:
        raise ValueError("split_num must be at least 1")
    if max_workers < 1:
        raise ValueError("max_workers must be at least 1")

    tag = tsv_name[-8:-4]
    splits_dir = f"./splits_{tag}"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(splits_dir, exist_ok=True)

    # Chunk size is the line count divided by split_num, rounded up.
    # NOTE: the line count includes the header line, so for very small files
    # fewer than split_num chunks may be produced.
    count = iter_count(tsv_name)
    chunk_size = max(1, -(-count // split_num))

    data = read_data(tsv_name, chunk_size, encoding_type)

    def write_data(_idx: int, _chunk):
        file_path = f"{splits_dir}/chunk_{tag}_{_idx + 1}.csv"
        _chunk.to_csv(file_path, index=False)

    pending = set()
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for idx, chunk in enumerate(data):
                if len(pending) >= max_workers:
                    # Do not read further chunks until a writer is free, so a
                    # multi-GB input is never fully resident in memory.
                    done, pending = wait(pending, return_when=FIRST_COMPLETED)
                    for finished in done:
                        finished.result()  # surface write errors
                pending.add(pool.submit(write_data, idx, chunk))
            for remaining in pending:
                remaining.result()
    finally:
        # Release the input file even if reading or writing failed midway.
        data.close()

后来改进的方法：


    @staticmethod
    def readTsvData(path: str, encoding: str = "gb18030", start: int = 0, chunkSize: int = 10000):
        """
        读取整个tsv文件数据
        :param encoding: str 编码格式
        :param path: str tsv文件路径
        :param start: int 文件起始读入位置
        :param chunkSize: int 文件读取大小
        :return: DataFrame
        """
        data = []
        header = None
        with codecs.open(path, 'rb', encoding, errors="ignore") as tsvFile:
            for line in tsvFile:
                # 存入列索引
                temp1 = line.split("\t")
                temp1[-1] = temp1[-1][:-2]
                header = temp1
                break
            for line in tsvFile.readlines()[start:(start + chunkSize)]:
                temp1 = line.split("\t")
                temp1[-1] = temp1[-1][:-2]
                data.append(temp1)

        return pd.DataFrame(data, columns=header)

    @classmethod
    def splitDataByTrunk(cls, file_path: str, chunk_size: int, save_path: str, encoding: str = "gb18030"):
        """
        Split a TSV file into chunks of ``chunk_size`` rows and persist them.

        Each chunk is written to
        ``<save_path>/<base name>/<base name>_<start row>.csv``, where
        ``<base name>`` is the input file name without its extension. The
        DataFrame index is written as the first CSV column.

        :param file_path: str path of the TSV file to read
        :param chunk_size: int number of rows per chunk; must be at least 1
        :param save_path: str folder under which the output folder is created
        :param encoding: str text encoding of the input file
        :return: None; the stored files are UTF-8 encoded CSV files
        :raises ValueError: if ``chunk_size`` is smaller than 1
        """
        if chunk_size < 1:
            # A non-positive size would never satisfy the exit condition below.
            raise ValueError("chunk_size must be at least 1")

        base_name = os.path.splitext(os.path.basename(file_path))[0]
        save_path = os.path.join(save_path, base_name)
        os.makedirs(save_path, exist_ok=True)

        start = 0
        while True:
            # NOTE: every call reads the input file again from its beginning.
            data = cls.readTsvData(file_path, encoding=encoding, start=start, chunkSize=chunk_size)
            if start > 0 and len(data) == 0:
                # The row count was an exact multiple of chunk_size; do not
                # write a trailing header-only file.
                break
            data.to_csv(os.path.join(save_path, base_name + "_" + str(start) + ".csv"), encoding="utf-8")
            start += chunk_size
            if len(data) != chunk_size:
                # A short chunk means the end of the file was reached.
                break

Dragon Wu

关注

2
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python 拆分（几G）的tsv文件为较小的csv文件

# 拆分大tsv文件为小一些的csv文件# tsv_name: tsv文件对应的res名字# split_num: 拆分个数def split_data(tsv_name: str, split_num: int): splits_dir = f"./splits_{tsv_name}" if not os.path.exists(splits_dir): os.mkdir(splits_dir) with open(res.FILE_PATH[tsv_na.
复制链接

扫一扫

专栏目录