import os
import json
from collections import Counter

def find_and_write_duplicate_txt_files(root_folder, output_file):
    # One {'filepath', 'filename'} dict per duplicate copy found
    txt_files_dict_list = []
    # How many times each txt filename occurs across the tree
    filename_counts = Counter()

    # First pass: walk the folder and its subfolders, counting txt filenames
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            # Only process txt files
            if filename.endswith('.txt'):
                filename_counts[filename] += 1

    # Distinct duplicate filenames, for the summary print below
    duplicate_names = []
    # Second pass: record the full path of every filename seen at least twice
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.txt') and filename_counts[filename] >= 2:
                # Build the txt file's path and store it with its filename
                txt_filepath = os.path.join(foldername, filename)
                txt_files_dict_list.append({'filepath': txt_filepath, 'filename': filename})
                if filename not in duplicate_names:
                    duplicate_names.append(filename)
    print(len(duplicate_names), duplicate_names)

    # Write the dicts to a JSONL file, one JSON object per line
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for txt_file_dict in txt_files_dict_list:
            json.dump(txt_file_dict, f_out, ensure_ascii=False)
            f_out.write('\n')
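# If walking the tree twice is a concern, the two passes can be fused.
# A minimal single-pass sketch (an alternative, not part of the original
# pipeline; defined here but never called):
from collections import defaultdict

def find_duplicates_single_pass(root_folder, output_file):
    # Group every txt path under its filename in a single walk
    paths_by_name = defaultdict(list)
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.txt'):
                paths_by_name[filename].append(os.path.join(foldername, filename))

    # Emit one record per copy of any filename that occurs at least twice
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for filename, paths in paths_by_name.items():
            if len(paths) >= 2:
                for path in paths:
                    json.dump({'filepath': path, 'filename': filename}, f_out, ensure_ascii=False)
                    f_out.write('\n')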
def process_output_file(output_file_a, output_file_b):
    # Read the JSONL file produced by the previous step
    file_data = []
    with open(output_file_a, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            file_data.append(json.loads(line))

    # Group the filepaths that share the same filename
    filename_mapping = {}
    for item in file_data:
        filename = item.get('filename', '')
        filepath = item.get('filepath', '')
        filename_mapping.setdefault(filename, []).append(filepath)

    # Write one {'filename', 'filepath': [...]} record per line
    with open(output_file_b, 'w', encoding='utf-8') as f_out:
        for filename, filepaths in filename_mapping.items():
            output_dict = {'filename': filename, 'filepath': filepaths}
            f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
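# For illustration (hypothetical paths), two step-1 records such as
#   {"filepath": "novels/a/book.txt", "filename": "book.txt"}
#   {"filepath": "novels/b/book.txt", "filename": "book.txt"}
# collapse into a single step-2 record:
#   {"filename": "book.txt", "filepath": ["novels/a/book.txt", "novels/b/book.txt"]}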
def process_processed_output_file(output_file_b, output_file_c, list1):
    # Read the grouped records from the previous step
    with open(output_file_b, 'r', encoding='utf-8') as f_in:
        file_data = [json.loads(line) for line in f_in]

    # Decide which copies to mark for deletion and write them out
    with open(output_file_c, 'w', encoding='utf-8') as f_out:
        for item in file_data:
            filename = item.get('filename', '')
            filepaths = item.get('filepath', [])
            # Does any path contain one of the keywords in list1?
            found_list1 = any(any(keyword in filepath for keyword in list1) for filepath in filepaths)
            if found_list1:
                # Keep the copies whose path matches a keyword; mark every other copy
                for filepath in filepaths:
                    if not any(keyword in filepath for keyword in list1):
                        output_dict = {'filename': filename, 'filepath': filepath}
                        f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
            else:
                # No keyword match: keep the largest copy (by size on disk) and mark the rest
                max_filepath = max(filepaths, key=os.path.getsize)
                for filepath in filepaths:
                    if filepath != max_filepath:
                        output_dict = {'filename': filename, 'filepath': filepath}
                        f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
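# Note: os.path.getsize above raises OSError if a copy vanished between the
# scan and this step. A defensive key function (an assumption, not part of
# the original) could treat missing files as size 0:
#     def safe_size(path):
#         try:
#             return os.path.getsize(path)
#         except OSError:
#             return 0
#     max_filepath = max(filepaths, key=safe_size)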
root_folder = ''                    # root folder to scan (fill in)
output_file_a = 'txt.jsonl'         # step 1: one record per duplicate copy
output_file_b = 'txt_output.jsonl'  # step 2: copies grouped by filename
output_file_c = 'txt_out.jsonl'     # step 3: copies marked for deletion
# Path keywords: copies whose path contains a keyword are kept. Note that an
# empty string matches every path, so fill in real keywords before running.
list1 = [""]

# Run the three steps of the pipeline
find_and_write_duplicate_txt_files(root_folder, output_file_a)
process_output_file(output_file_a, output_file_b)
process_processed_output_file(output_file_b, output_file_c, list1)
def remove_txt_files_from_jsonl(jsonl_file):
    # Read the JSONL file listing the copies marked for deletion
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        jsonl_data = [json.loads(line) for line in f]

    # Delete the txt file behind each record's 'filepath'
    for item in jsonl_data:
        txt_file = item.get('filepath', '')
        if txt_file:
            try:
                os.remove(txt_file)
                print(f"Deleted file: {txt_file}")
            except OSError as e:
                print(f"Failed to delete file: {txt_file}, error: {e}")

# Delete every txt file whose 'filepath' appears in the JSONL written above
remove_txt_files_from_jsonl(output_file_c)
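# Deletion is irreversible, so a dry run first is prudent. A sketch (not in
# the original) that only prints what would be removed:
def remove_txt_files_from_jsonl_dry_run(jsonl_file):
    # Print the files that would be deleted, without touching anything
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            txt_file = json.loads(line).get('filepath', '')
            if txt_file:
                print(f"Would delete: {txt_file}")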
# An earlier variant of step 2: keep only the copy of each filename with the
# largest character count (reads every file, rather than using size on disk).
# import json
#
# def process_output_file(input_file, output_file):
#     # Read the JSONL file from step 1
#     file_data = []
#     with open(input_file, 'r', encoding='utf-8') as f_in:
#         for line in f_in:
#             file_data.append(json.loads(line))
#
#     # For each filename, find the copy with the largest character count
#     filename_to_data = {}
#     for data in file_data:
#         filename = data['filename']
#         filepath = data['filepath']
#         with open(filepath, 'r', encoding='utf-8') as f:
#             content = f.read()
#         file_size = len(content)
#         if filename not in filename_to_data or file_size > filename_to_data[filename]['file_size']:
#             # Skip paths that contain any of the list1 keywords
#             if not any(keyword in filepath for keyword in list1):
#                 filename_to_data[filename] = {'filepath': filepath, 'file_size': file_size}
#
#     # Write the results to a new JSONL file
#     with open(output_file, 'w', encoding='utf-8') as f_out:
#         for filename, data in filename_to_data.items():
#             json.dump({'filename': filename, 'filepath': data['filepath']}, f_out, ensure_ascii=False)
#             f_out.write('\n')
#
# # Process step 1's output and write the result to a new JSONL file
# process_output_file('重复的txt.jsonl', '重复的txt_output.jsonl')
# A separate script: for each book, match the first 30 characters of every
# chapter against the full text, in parallel.
# import json
# import concurrent.futures
# import time
#
# def process_book_content(book_data, jsonl_data, output_file):
#     content = book_data.get('content', '')
#     book_name = book_data.get('book_name', '')
#     data_id = book_data.get('data_id', '')
#     chapter_name = book_data.get('chapter_name', '')
#     book_id = book_data.get('book_id', '')
#     chapter_id = book_data.get('chapter_id', '')
#     category_id = book_data.get('category_id', '')
#     matching_content = []
#     start_time = time.time()  # record the start time
#     for jsonl_item in jsonl_data:
#         chapter_start = jsonl_item.get('chapter_content_30_chars', '')
#         txt_filename = jsonl_item.get('txt_file', '')
#
#         if book_name == txt_filename:
#             n = len(content)
#             m = len(chapter_start)
#             for i in range(n - m + 1):
#                 if content[i:i + m] == chapter_start:
#                     if i >= 30:
#                         matching_content.append({
#                             'book_name': book_name,
#                             'txt_filename': txt_filename,
#                             '章节前30': chapter_start,
#                             '原文': content[i:i + m],
#                             'data_id': data_id,
#                             'chapter_name': chapter_name,
#                             'book_id': book_id,
#                             'chapter_id': chapter_id,
#                             'category_id': category_id,
#                         })
#                     break  # stop after the first match
#     end_time = time.time()  # record the end time
#     elapsed_time = end_time - start_time  # compute the elapsed time
#     print(f"Book '{book_name}' processed in {elapsed_time:.4f} s")
#
#     # Append the matches to the output file. Note: all worker threads append
#     # to the same file; a shared lock would make this robust against
#     # interleaved writes.
#     with open(output_file, 'a+', encoding='utf-8') as f:
#         for match in matching_content:
#             json.dump(match, f, ensure_ascii=False)
#             f.write('\n')
#
# def process_books_parallel(books_data, jsonl_data, output_file):
#     # Process the books in parallel with a thread pool
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Submit one processing task per book
#         futures = []
#         for book_data in books_data:
#             futures.append(executor.submit(process_book_content, book_data, jsonl_data, output_file))
#
#         # Wait for all tasks to finish
#         concurrent.futures.wait(futures)
#
# def find_matching_content(jsonl_file, json_file, output_file):
#     # Read the book contents from the JSON file
#     with open(json_file, 'r', encoding='utf-8') as f:
#         books_data = json.load(f)
#
#     # Read the 30-character chapter openings from the JSONL file
#     with open(jsonl_file, 'r', encoding='utf-8') as f:
#         jsonl_data = [json.loads(line) for line in f]
#
#     # Process the books in parallel
#     process_books_parallel(books_data, jsonl_data, output_file)
#
# # Find the matching content and write the results to the output file
# find_matching_content('515.jsonl', '最终处理结果.json', 'output33.jsonl')
#
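# The character-by-character window scan above can be replaced by str.find,
# which performs the same first-occurrence search. A sketch using the same
# variables as above:
#     i = content.find(chapter_start)
#     if i >= 30:
#         matching_content.append({...})  # same record fields as above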