import os
import json
from collections import Counter

def find_and_write_duplicate_txt_files(root_folder, output_file):
    # One {'filepath', 'filename'} dict per duplicate copy found
    txt_files_dict_list = []
    # How many times each txt filename occurs across the tree
    filename_counts = Counter()

    # First pass: walk the folder and its subfolders, counting txt filenames
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            # Only process txt files
            if filename.endswith('.txt'):
                filename_counts[filename] += 1

    # Distinct duplicate filenames, for the summary print below
    duplicate_names = []
    # Second pass: record the full path of every filename seen at least twice
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.txt') and filename_counts[filename] >= 2:
                # Build the txt file's path and store it with its filename
                txt_filepath = os.path.join(foldername, filename)
                txt_files_dict_list.append({'filepath': txt_filepath, 'filename': filename})
                if filename not in duplicate_names:
                    duplicate_names.append(filename)
    print(len(duplicate_names), duplicate_names)

    # Write the dicts to a JSONL file, one JSON object per line
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for txt_file_dict in txt_files_dict_list:
            json.dump(txt_file_dict, f_out, ensure_ascii=False)
            f_out.write('\n')
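# If walking the tree twice is a concern, the two passes can be fused.
# A minimal single-pass sketch (an alternative, not part of the original
# pipeline; defined here but never called):
from collections import defaultdict

def find_duplicates_single_pass(root_folder, output_file):
    # Group every txt path under its filename in a single walk
    paths_by_name = defaultdict(list)
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.txt'):
                paths_by_name[filename].append(os.path.join(foldername, filename))

    # Emit one record per copy of any filename that occurs at least twice
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for filename, paths in paths_by_name.items():
            if len(paths) >= 2:
                for path in paths:
                    json.dump({'filepath': path, 'filename': filename}, f_out, ensure_ascii=False)
                    f_out.write('\n')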
def process_output_file(output_file_a, output_file_b):
    # Read the JSONL file produced by the previous step
    file_data = []
    with open(output_file_a, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            file_data.append(json.loads(line))

    # Group the filepaths that share the same filename
    filename_mapping = {}
    for item in file_data:
        filename = item.get('filename', '')
        filepath = item.get('filepath', '')
        filename_mapping.setdefault(filename, []).append(filepath)

    # Write one {'filename', 'filepath': [...]} record per line
    with open(output_file_b, 'w', encoding='utf-8') as f_out:
        for filename, filepaths in filename_mapping.items():
            output_dict = {'filename': filename, 'filepath': filepaths}
            f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
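# For illustration (hypothetical paths), two step-1 records such as
#   {"filepath": "novels/a/book.txt", "filename": "book.txt"}
#   {"filepath": "novels/b/book.txt", "filename": "book.txt"}
# collapse into a single step-2 record:
#   {"filename": "book.txt", "filepath": ["novels/a/book.txt", "novels/b/book.txt"]}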
def process_processed_output_file(output_file_b, output_file_c, list1):
    # Read the grouped records from the previous step
    with open(output_file_b, 'r', encoding='utf-8') as f_in:
        file_data = [json.loads(line) for line in f_in]

    # Decide which copies to mark for deletion and write them out
    with open(output_file_c, 'w', encoding='utf-8') as f_out:
        for item in file_data:
            filename = item.get('filename', '')
            filepaths = item.get('filepath', [])
            # Does any path contain one of the keywords in list1?
            found_list1 = any(any(keyword in filepath for keyword in list1) for filepath in filepaths)
            if found_list1:
                # Keep the copies whose path matches a keyword; mark every other copy
                for filepath in filepaths:
                    if not any(keyword in filepath for keyword in list1):
                        output_dict = {'filename': filename, 'filepath': filepath}
                        f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
            else:
                # No keyword match: keep the largest copy (by size on disk) and mark the rest
                max_filepath = max(filepaths, key=os.path.getsize)
                for filepath in filepaths:
                    if filepath != max_filepath:
                        output_dict = {'filename': filename, 'filepath': filepath}
                        f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')
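# Note: os.path.getsize above raises OSError if a copy vanished between the
# scan and this step. A defensive key function (an assumption, not part of
# the original) could treat missing files as size 0:
#     def safe_size(path):
#         try:
#             return os.path.getsize(path)
#         except OSError:
#             return 0
#     max_filepath = max(filepaths, key=safe_size)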
root_folder = ''                    # root folder to scan (fill in)
output_file_a = 'txt.jsonl'         # step 1: one record per duplicate copy
output_file_b = 'txt_output.jsonl'  # step 2: copies grouped by filename
output_file_c = 'txt_out.jsonl'     # step 3: copies marked for deletion
# Path keywords: copies whose path contains a keyword are kept. Note that an
# empty string matches every path, so fill in real keywords before running.
list1 = [""]

# Run the three steps of the pipeline
find_and_write_duplicate_txt_files(root_folder, output_file_a)
process_output_file(output_file_a, output_file_b)
process_processed_output_file(output_file_b, output_file_c, list1)
def remove_txt_files_from_jsonl(jsonl_file):
    # Read the JSONL file listing the copies marked for deletion
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        jsonl_data = [json.loads(line) for line in f]

    # Delete the txt file behind each record's 'filepath'
    for item in jsonl_data:
        txt_file = item.get('filepath', '')
        if txt_file:
            try:
                os.remove(txt_file)
                print(f"Deleted file: {txt_file}")
            except OSError as e:
                print(f"Failed to delete file: {txt_file}, error: {e}")

# Delete every txt file whose 'filepath' appears in the JSONL written above
remove_txt_files_from_jsonl(output_file_c)
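# Deletion is irreversible, so a dry run first is prudent. A sketch (not in
# the original) that only prints what would be removed:
def remove_txt_files_from_jsonl_dry_run(jsonl_file):
    # Print the files that would be deleted, without touching anything
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            txt_file = json.loads(line).get('filepath', '')
            if txt_file:
                print(f"Would delete: {txt_file}")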
# An earlier variant of step 2: keep only the copy of each filename with the
# largest character count (reads every file, rather than using size on disk).
# import json
#
# def process_output_file(input_file, output_file):
#     # Read the JSONL file from step 1
#     file_data = []
#     with open(input_file, 'r', encoding='utf-8') as f_in:
#         for line in f_in:
#             file_data.append(json.loads(line))
#
#     # For each filename, find the copy with the largest character count
#     filename_to_data = {}
#     for data in file_data:
#         filename = data['filename']
#         filepath = data['filepath']
#         with open(filepath, 'r', encoding='utf-8') as f:
#             content = f.read()
#         file_size = len(content)
#         if filename not in filename_to_data or file_size > filename_to_data[filename]['file_size']:
#             # Skip paths that contain any of the list1 keywords
#             if not any(keyword in filepath for keyword in list1):
#                 filename_to_data[filename] = {'filepath': filepath, 'file_size': file_size}
#
#     # Write the results to a new JSONL file
#     with open(output_file, 'w', encoding='utf-8') as f_out:
#         for filename, data in filename_to_data.items():
#             json.dump({'filename': filename, 'filepath': data['filepath']}, f_out, ensure_ascii=False)
#             f_out.write('\n')
#
# # Process step 1's output and write the result to a new JSONL file
# process_output_file('重复的txt.jsonl', '重复的txt_output.jsonl')
# A separate script: for each book, match the first 30 characters of every
# chapter against the full text, in parallel.
# import json
# import concurrent.futures
# import time
#
# def process_book_content(book_data, jsonl_data, output_file):
#     content = book_data.get('content', '')
#     book_name = book_data.get('book_name', '')
#     data_id = book_data.get('data_id', '')
#     chapter_name = book_data.get('chapter_name', '')
#     book_id = book_data.get('book_id', '')
#     chapter_id = book_data.get('chapter_id', '')
#     category_id = book_data.get('category_id', '')
#     matching_content = []
#     start_time = time.time()  # record the start time
#     for jsonl_item in jsonl_data:
#         chapter_start = jsonl_item.get('chapter_content_30_chars', '')
#         txt_filename = jsonl_item.get('txt_file', '')
#
#         if book_name == txt_filename:
#             n = len(content)
#             m = len(chapter_start)
#             for i in range(n - m + 1):
#                 if content[i:i + m] == chapter_start:
#                     if i >= 30:
#                         matching_content.append({
#                             'book_name': book_name,
#                             'txt_filename': txt_filename,
#                             '章节前30': chapter_start,
#                             '原文': content[i:i + m],
#                             'data_id': data_id,
#                             'chapter_name': chapter_name,
#                             'book_id': book_id,
#                             'chapter_id': chapter_id,
#                             'category_id': category_id,
#                         })
#                     break  # stop after the first match
#     end_time = time.time()  # record the end time
#     elapsed_time = end_time - start_time  # compute the elapsed time
#     print(f"Book '{book_name}' processed in {elapsed_time:.4f} s")
#
#     # Append the matches to the output file. Note: all worker threads append
#     # to the same file; a shared lock would make this robust against
#     # interleaved writes.
#     with open(output_file, 'a+', encoding='utf-8') as f:
#         for match in matching_content:
#             json.dump(match, f, ensure_ascii=False)
#             f.write('\n')
#
# def process_books_parallel(books_data, jsonl_data, output_file):
#     # Process the books in parallel with a thread pool
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Submit one processing task per book
#         futures = []
#         for book_data in books_data:
#             futures.append(executor.submit(process_book_content, book_data, jsonl_data, output_file))
#
#         # Wait for all tasks to finish
#         concurrent.futures.wait(futures)
#
# def find_matching_content(jsonl_file, json_file, output_file):
#     # Read the book contents from the JSON file
#     with open(json_file, 'r', encoding='utf-8') as f:
#         books_data = json.load(f)
#
#     # Read the 30-character chapter openings from the JSONL file
#     with open(jsonl_file, 'r', encoding='utf-8') as f:
#         jsonl_data = [json.loads(line) for line in f]
#
#     # Process the books in parallel
#     process_books_parallel(books_data, jsonl_data, output_file)
#
# # Find the matching content and write the results to the output file
# find_matching_content('515.jsonl', '最终处理结果.json', 'output33.jsonl')
#
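# The character-by-character window scan above can be replaced by str.find,
# which performs the same first-occurrence search. A sketch using the same
# variables as above:
#     i = content.find(chapter_start)
#     if i >= 30:
#         matching_content.append({...})  # same record fields as above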