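# 取不同字数之间的差集是为了方便看多少字数比较合适。结论:30字好。
# (The set difference between prefix lengths is computed to judge how many
# characters work best as a duplicate key; conclusion: 30 characters.)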
import json
import os,re
# Module-level state shared by every call to process_txt_file().
book_id = 1  # id stamped on the records of the next processed file
all_chapters = []  # chapter records accumulated across all processed files


def process_txt_file(txt_file):
    """Collect the first 30 characters of each chapter from one JSON file.

    The file is parsed with ``json.load`` and iterated. For every item whose
    ``content`` prefix is non-empty, a record is appended to the module-level
    ``all_chapters`` list with the keys ``book_id``, ``chapter_name``,
    ``chapter_content_30_chars``, ``txt_file`` (the item's ``book_name``) and
    ``type`` (the item's ``category_id``).

    Args:
        txt_file: Path of the JSON file to read (UTF-8).

    Raises:
        KeyError: If an item lacks ``content``, ``book_name`` or
            ``category_id``, or a kept item lacks ``chapter_name``.

    Side effects:
        Appends to ``all_chapters``, increments the global ``book_id`` and
        prints its new value.
    """
    global book_id
    with open(txt_file, 'r', encoding='utf-8') as input_file:
        data = json.load(input_file)

    for item in data:
        # Only the leading 30 characters are kept; no whitespace stripping
        # is performed here.
        chapter_content_30_chars = item['content'][:30]
        # Use a dedicated name instead of rebinding the ``txt_file`` parameter.
        book_name = item['book_name']
        category_id = item['category_id']
        if chapter_content_30_chars:
            all_chapters.append({
                'book_id': book_id,
                'chapter_name': item['chapter_name'],
                'chapter_content_30_chars': chapter_content_30_chars,
                'txt_file': book_name,
                'type': category_id,
            })

    # NOTE(review): the original indentation was lost; the id is bumped once
    # per processed file here -- confirm this matches the intended numbering.
    book_id += 1
    print(book_id)
def write_to_jsonl(data, output_file, mode='a+'):
    """Write each item of ``data`` as one JSON line to ``output_file``.

    Args:
        data: Iterable of JSON-serialisable objects.
        output_file: Destination path; written as UTF-8.
        mode: File mode passed to ``open``. Defaults to ``'a+'`` (append),
            which preserves the historical behaviour; pass ``'w'`` to
            replace the file so that re-running a job does not duplicate
            records.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    with open(output_file, mode, encoding='utf-8') as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in data
        )
# Numeric suffixes of the per-batch JSON files to aggregate.
list1 = [1, 2, 4, 6, 8, 10, 12, 14, 16]

for s in list1:
    # Build the path of this batch's JSON file and collect its chapters.
    folder_path = f'D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠{s}最终处理结果.json'
    process_txt_file(folder_path)

# Dump every collected chapter record to the combined JSONL file.
# NOTE(review): indentation was lost in the source; this call is assumed to
# run once, after the loop -- confirm.
write_to_jsonl(
    all_chapters,
    'D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠30.jsonl',
)
import json
from collections import Counter
def sort_and_write_top_duplicates(input_file, output_file, top_n=10000):
    """Write the most frequently repeated 30-character prefixes to a JSONL file.

    Each non-blank line of ``input_file`` is parsed as JSON and its
    ``chapter_content_30_chars`` value (``''`` when the key is absent) is
    counted. The ``top_n`` most common values are then considered, and those
    occurring more than once are written to ``output_file``, one JSON object
    per line with the keys ``章节内容前30字符`` (the prefix) and ``重复次数``
    (its count), in descending order of count.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to create or overwrite (UTF-8).
        top_n: Maximum number of distinct prefixes to consider.
    """
    content_counts = Counter()
    with open(input_file, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            # Blank lines (e.g. a trailing newline) are not valid JSON and
            # would make json.loads raise; skip them.
            if not line.strip():
                continue
            record = json.loads(line)
            content_counts[record.get('chapter_content_30_chars', '')] += 1

    # most_common() orders by descending count; keep the first top_n entries.
    top_duplicates = content_counts.most_common()[:top_n]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for content, count in top_duplicates:
            # A count of 1 means the prefix is unique, i.e. not a duplicate.
            if count > 1:
                row = {'章节内容前30字符': content, '重复次数': count}
                f_out.write(json.dumps(row, ensure_ascii=False) + '\n')
# Count the duplicated 30-character prefixes and write them to a new JSONL file.
sort_and_write_top_duplicates(
    input_file='D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠30.jsonl',
    output_file='top_duplicates武侠30.jsonl',
    top_n=10000,
)
import json
def sort_and_write_top_duplicates(input_file, output_file):
list1=[]
list2=[]
with open(input_file, 'r', encoding='utf-8') as f_in:
for line in f_in:
data = json.loads(line)
chapter_content_30_chars = data.get('章节内容前50字符', '')
list1.append(chapter_content_30_chars[:30])
with open(output_file, 'r', encoding='utf-8') as f_in:
for line in f_in:
data = json.loads(line)
chapter_content_30_chars1 = data.get('章节内容前30字符', '')
list2.append(chapter_content_30_chars1)
# 将列表转换为集合
set1 = set(list1)
set2 = set(list2)
# 使用集合的对称差集操作
symmetric_difference = set1.symmetric_difference(set2)
# 如果需要输出列表而不是集合,可以将其转换回列表
symmetric_difference_list = list(symmetric_difference)
# 输出非交集列表
print(len(symmetric_difference_list))
for i in symmetric_difference_list:
print(i)
# Compare the prefixes of the two duplicate reports and print those that differ.
sort_and_write_top_duplicates(
    input_file='top_duplicates.jsonl',
    output_file='top_duplicates武侠30.jsonl',
)