Python 高效分割大文件：按行数拆分大文件

三采

已于 2023-07-26 17:37:17 修改

阅读量1.1k

点赞数

分类专栏： Python 文章标签： Python

于 2023-07-13 17:16:19 首次发布

本文链接：https://blog.csdn.net/zywhehe/article/details/131707308

版权

Python 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

def read_large_file(m_fr):
    """
    Lazily yield the lines of an open file, one ``readline()`` call at a time.

    :param m_fr: an open file-like object that provides ``readline()``
    :return: a generator producing each line exactly as ``readline()``
             returned it; iteration ends on the first falsy result
             (the empty string/bytes that signals end of file)
    """
    current = m_fr.readline()
    while current:
        yield current
        # Fetch the next line only after the consumer asks for more, so an
        # abandoned generator never reads ahead of what it has handed out.
        current = m_fr.readline()


# Split a file into pieces of at most m_num lines each.
# The pieces are written into m_dirpath and named after the source file.
def file_split_quick(m_filepath, m_num, m_dirpath, m_num_dict):
    """
    Split a large UTF-8 text file into consecutive pieces of at most
    ``m_num`` lines.

    A file with no more than ``m_num`` lines is copied unchanged into
    ``m_dirpath`` under its own name. Otherwise piece ``k`` (1-based) is
    written to ``<stem>_<k zero-padded to 3 digits><ext>`` inside
    ``m_dirpath``. Line endings are written exactly as they were read.

    :param m_filepath: path of the file to split
    :param m_num: maximum number of lines per output file (must be >= 1)
    :param m_dirpath: directory receiving the output files (created if missing)
    :param m_num_dict: dict updated in place; maps every output path to the
                       number of lines it contains
    :return: list of the output file paths, in order
    :raises AssertionError: if ``m_filepath`` does not exist
    :raises ValueError: if ``m_num`` is smaller than 1
    """
    if not os.path.exists(m_filepath):
        print('error: not exist: {}'.format(m_filepath))
        # Raised explicitly instead of via `assert`, which `python -O` strips.
        raise AssertionError('not exist: {}'.format(m_filepath))
    if m_num < 1:
        raise ValueError('m_num must be >= 1, got {}'.format(m_num))
    os.makedirs(m_dirpath, exist_ok=True)

    m_filename = os.path.basename(m_filepath)
    m_stem, m_ext = os.path.splitext(m_filename)
    m_pathlist = []

    # Probe at most m_num + 1 lines to decide whether splitting is needed.
    # Unlike `wc -l` this counts a final line without a trailing newline,
    # needs no shell, and never scans more of the file than necessary.
    with open(m_filepath, 'rb') as m_probe:
        m_seen = sum(1 for _ in islice(m_probe, m_num + 1))

    if m_seen <= m_num:
        # Small file: keep it as a single, byte-identical copy.
        m_newpath = os.path.join(m_dirpath, m_filename)
        shutil.copyfile(m_filepath, m_newpath)
        m_pathlist.append(m_newpath)
        m_num_dict[m_newpath] = m_seen
        return m_pathlist

    # newline='' on both sides disables newline translation, so the pieces
    # keep the source's line endings.
    with open(m_filepath, 'r', encoding='utf-8', newline='') as m_fr:
        m_count = 0
        while True:
            # Only one piece (m_num lines) is held in memory at a time.
            m_lines = list(islice(m_fr, m_num))
            if not m_lines:
                break
            m_count += 1
            m_subpath = os.path.join(
                m_dirpath,
                m_stem + '_' + str(m_count).zfill(3) + m_ext,
            )
            # `with` guarantees the piece is closed even if the write fails.
            with open(m_subpath, 'w', encoding='utf-8', newline='') as m_fw:
                m_fw.writelines(m_lines)
            m_pathlist.append(m_subpath)
            m_num_dict[m_subpath] = len(m_lines)
            print('done: {} {}'.format(m_num_dict[m_subpath], m_subpath))
    return m_pathlist

ChatGPT真是个好东西！

也可以直接用 Linux 的 split 命令拆分：

# Usage: split <file path> <prefix for the output files>
# Example:
split test.txt test_

# Split a file by line count
# -l  number of lines per output file
# -a  suffix length (default is 2)
# --numeric-suffixes=1  use numeric suffixes starting at 1 (the article's author notes this affects splitting speed)
# --additional-suffix=.txt  extension appended to every output file (the author notes this also affects splitting speed)
split -l 10000 test.txt test_ -a 3 --numeric-suffixes=1 --additional-suffix=.txt

# Show the first 10 lines of test_001.txt
head -n 10 test_001.txt