数据清洗之匹配相同章节、统计重复次数最高的内容和次数并取不同字数之间的差集

最新推荐文章于 2024-06-02 16:48:54 发布

小炫y

最新推荐文章于 2024-06-02 16:48:54 发布

阅读量76

点赞数 1

文章标签： python

本文链接：https://blog.csdn.net/weixin_44740756/article/details/138861340

版权

python基础专栏收录该内容

38 篇文章 10 订阅

订阅专栏

对按不同字数（例如前50字与前30字）截取得到的重复内容取差集（代码中实际使用的是对称差集），是为了比较截取多少个字符更合适。结论：截取前30个字符效果较好。

import json
import os,re

# Running counter that process_txt_file reads and increments via `global`; starts at 1.
book_id = 1

# Accumulates one dict per collected chapter record across every processed file.
all_chapters = []
def process_txt_file(txt_file):
    """Load a JSON file of chapter records and collect a short preview of each.

    The file at ``txt_file`` is parsed as JSON and iterated. Each element is
    read through its ``content``, ``book_name``, ``category_id`` and
    ``chapter_name`` keys. Every record whose first 30 characters of
    ``content`` are non-empty is appended to the module-level
    ``all_chapters`` list. Nothing is returned.

    NOTE(review): the module-level ``book_id`` counter is advanced (and
    printed) once per record, not once per book or per file -- confirm that
    this is the intended numbering.
    """
    global book_id
    with open(txt_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    for record in records:
        # Keep only the leading 30 characters; no whitespace is stripped.
        preview = record['content'][:30]
        book_name = record['book_name']
        category = record['category_id']
        if preview:
            all_chapters.append({
                'book_id': book_id,
                'chapter_name': record['chapter_name'],
                'chapter_content_30_chars': preview,
                'txt_file': book_name,
                'type': category,
            })

        # Runs for every record, including those skipped above.
        book_id += 1
        print(book_id)


def write_to_jsonl(data, output_file):
    """Append every item of ``data`` to ``output_file`` as one JSON line each.

    The file is opened in ``'a+'`` mode, so existing content is kept and new
    lines are added after it. Non-ASCII characters are written as-is rather
    than as ``\\u`` escapes.
    """
    with open(output_file, 'a+', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

# Numeric suffixes used to build the input file names below.
list1=[1,2,4,6,8,10,12,14,16]
for s in list1:
    # Despite its name, folder_path is the path of a single JSON file, not a folder.
    folder_path = f'D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠{s}最终处理结果.json'
    process_txt_file(folder_path)

# Append every collected chapter record to the combined JSONL file.
write_to_jsonl(all_chapters,  'D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠30.jsonl')




import json
from collections import Counter

def sort_and_write_top_duplicates(input_file, output_file, top_n=10000):
    """Write the most frequently repeated chapter previews to a JSONL file.

    Each line of ``input_file`` is parsed as JSON and its
    ``chapter_content_30_chars`` value (``''`` when the key is absent) is
    counted. The previews are ranked by count, highest first, the first
    ``top_n`` are kept, and those seen more than once are written to
    ``output_file`` (overwriting it), one JSON object per line.
    """
    # Tally how often each preview occurs across the whole input.
    with open(input_file, 'r', encoding='utf-8') as f_in:
        tally = Counter(
            json.loads(raw).get('chapter_content_30_chars', '') for raw in f_in
        )

    # Highest counts first, limited to the requested number of entries.
    ranked = tally.most_common()[:top_n]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for preview, occurrences in ranked:
            # A preview seen only once is not a duplicate.
            if occurrences <= 1:
                continue
            record = {'章节内容前30字符': preview, '重复次数': occurrences}
            f_out.write(json.dumps(record, ensure_ascii=False) + '\n')

# Count the repeated previews in the combined JSONL file and write the result to a new JSONL file.
sort_and_write_top_duplicates('D:\\2024work\\5月小说\\武侠430\\武侠430\\武侠30.jsonl', 'top_duplicates武侠30.jsonl', top_n=10000)




import json

def sort_and_write_top_duplicates(input_file, output_file):
    """Print the previews that occur in exactly one of two JSONL files.

    Both files are only read; nothing is written, despite the function and
    parameter names.

    Args:
        input_file: JSONL file whose records carry the preview under the key
            ``章节内容前50字符``; each value is truncated to its first 30
            characters before comparison.
        output_file: JSONL file whose records carry the preview under the key
            ``章节内容前30字符``; values are used unchanged.

    A record without the expected key contributes the empty string. The
    number of differing previews is printed first, followed by one preview
    per line in sorted order.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in:
        truncated_previews = {
            json.loads(line).get('章节内容前50字符', '')[:30] for line in f_in
        }
    with open(output_file, 'r', encoding='utf-8') as f_in:
        short_previews = {
            json.loads(line).get('章节内容前30字符', '') for line in f_in
        }

    # Set iteration order for strings changes between interpreter runs, so the
    # result is sorted to keep the report reproducible. key=str keeps the sort
    # from failing if a non-string value (e.g. null) appears in a record.
    symmetric_difference_list = sorted(
        truncated_previews.symmetric_difference(short_previews), key=str
    )

    print(len(symmetric_difference_list))

    for preview in symmetric_difference_list:
        print(preview)

# Compare the previews in 'top_duplicates.jsonl' (cut to 30 characters) with those in 'top_duplicates武侠30.jsonl'; both files are only read.
sort_and_write_top_duplicates('top_duplicates.jsonl', 'top_duplicates武侠30.jsonl')