python数据清洗-找到重复的txt并输出字数最少的_python输出txt文件中重复出现的-CSDN博客

本文链接：https://blog.csdn.net/weixin_44740756/article/details/138916845


import os
import json
from collections import Counter

def find_and_write_duplicate_txt_files(root_folder, output_file):
    """Record every ``.txt`` file whose file name occurs more than once.

    Walks ``root_folder`` recursively. Each ``.txt`` file whose file name
    (ignoring its directory) appears at least twice in the tree is written
    to ``output_file`` as one JSON object per line, in traversal order:
    ``{"filepath": <full path>, "filename": <file name>}``.
    The count of distinct duplicated names and the names themselves are
    printed to stdout.

    Args:
        root_folder: Directory to scan, including all of its sub-directories.
        output_file: Path of the JSONL file to write; opened in ``'w'`` mode,
            so any existing content is replaced.
    """
    # Walk the tree once, remembering every .txt file in traversal order.
    # Whether a name is duplicated is only known after the whole walk, so
    # the filtering happens afterwards on the collected list.
    filename_counts = Counter()
    txt_files = []
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            # Only .txt files are of interest (case-sensitive suffix match).
            if filename.endswith('.txt'):
                filename_counts[filename] += 1
                txt_files.append((filename, os.path.join(foldername, filename)))

    # Keep only the files whose name was seen at least twice.
    txt_files_dict_list = [
        {'filepath': filepath, 'filename': filename}
        for filename, filepath in txt_files
        if filename_counts[filename] >= 2
    ]

    # Distinct duplicated names in first-seen order; a dict gives O(1)
    # de-duplication while preserving insertion order.
    duplicated_names = list(
        dict.fromkeys(entry['filename'] for entry in txt_files_dict_list)
    )
    print(len(duplicated_names), duplicated_names)

    # Write one JSON object per line (JSONL), keeping non-ASCII text readable.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for txt_file_dict in txt_files_dict_list:
            json.dump(txt_file_dict, f_out, ensure_ascii=False)
            f_out.write('\n')



# Scan the folder tree and write the duplicate-name .txt records to a JSONL
# file. This runs at module level, with a hard-coded root folder and output name.
find_and_write_duplicate_txt_files('D:\\2024work\\5月小说', '重复的txt.jsonl')




import json

def process_output_file(input_file, output_file):
    """Keep, for each file name, the listed copy with the most characters.

    Reads JSONL records from ``input_file``; each record must contain the
    keys ``'filename'`` and ``'filepath'``. Every referenced file is read as
    UTF-8 text and its length in characters is measured. For each file name
    the path with the strictly largest character count is kept (the first
    one seen wins ties). The result is written to ``output_file`` as one
    ``{"filename": ..., "filepath": ...}`` JSON object per line, in the order
    the file names were first encountered.

    Args:
        input_file: Path of the JSONL file listing the candidate files.
        output_file: Path of the JSONL file to write; opened in ``'w'`` mode,
            so any existing content is replaced.
    """
    # Maps file name -> {'filepath': best path so far, 'file_size': its length}.
    filename_to_data = {}

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            # Blank lines (e.g. a trailing empty line) are not JSON records;
            # skip them instead of letting json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)
            filename = data['filename']
            filepath = data['filepath']

            # Size is the number of decoded characters, not bytes on disk.
            with open(filepath, 'r', encoding='utf-8') as f:
                file_size = len(f.read())

            best = filename_to_data.get(filename)
            # Strict '>' keeps the earliest candidate when sizes are equal.
            if best is None or file_size > best['file_size']:
                filename_to_data[filename] = {'filepath': filepath, 'file_size': file_size}

    # Write the winners as JSONL, keeping non-ASCII text readable.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for filename, data in filename_to_data.items():
            json.dump({'filename': filename, 'filepath': data['filepath']}, f_out, ensure_ascii=False)
            f_out.write('\n')

# Process the duplicate-file listing produced above and write the selected
# entry for each file name to a new JSONL file. This runs at module level.
process_output_file('重复的txt.jsonl', '重复的txt_output.jsonl')