用Jaccard相似度对中文短文本进行去重

最新推荐文章于 2024-08-14 12:00:04 发布

牛有果真的是你啊啊啊

最新推荐文章于 2024-08-14 12:00:04 发布

阅读量278

点赞数 4

分类专栏： python编程 文章标签： python 机器学习

本文链接：https://blog.csdn.net/m0_58561560/article/details/136816179

版权

python编程专栏收录该内容

1 篇文章 0 订阅

订阅专栏

本专栏更新一些python代码，旨在为新手提供学习内容。本人也会持续输出有价值的内容，欢迎关注。你的关注和点赞是我最大的动力，谢谢。

Jaccard就不多介绍了，主要原理可以看站内跳转

实习时做新闻文本去重，希望先对标题做一遍筛选。标题一般只有十几二十个字，用jaccard配合jieba分词效果好、速度快，下面是核心代码

"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
------------------------------
# @Author  : aikey
# @Github  : @aikeywitt
# @Time    : 2024/01/11 16:23
# @File    : deduplicator.py
------------------------------
"""
import pandas as pd
import jieba
from config import *

# Single characters that are stripped from text before tokenisation:
# ASCII/CJK punctuation, the backslash, and whitespace.
# The previous literal spelled whitespace as the regex escape `\\s`; inside a
# raw string handed to set() that is just two backslashes plus the letter "s",
# so every "s" was silently deleted from the input while tabs and newlines
# were kept. Whitespace is now listed explicitly instead.
punctuation = set(r'./ <>_-=,"。？！“”：‘’@#￥%…&×（）——+【】{};；●～|\:[]') | set('\t\n\r\x0b\x0c')


class Jaccard:
    """Jaccard similarity between two texts segmented with jieba.

    Each text is stripped of punctuation, segmented, filtered against a
    stop-word list and grouped into chunks of ``_len`` tokens; the similarity
    is ``|A ∩ B| / |A ∪ B|`` over the two sets of chunks.
    """

    def __init__(self, _len):
        """Store the chunk size and load the stop-word list.

        Args:
            _len: Number of consecutive tokens joined into one comparison
                unit by ``cut2list`` (1 compares individual tokens).
        """
        # Imported locally: this module never imports ``os`` itself, so the
        # name would otherwise only exist if ``from config import *`` happens
        # to re-export it.
        import os

        self._len = _len
        self.savePath = os.path.join(APPPATH, 'data')
        with open(STOPWORDS_FILE, 'r', encoding='utf-8') as f:
            # One stop word per line; surrounding whitespace is dropped.
            self.stopwords = {line.strip() for line in f}

    @staticmethod
    def _to_text(value):
        """Coerce *value* to ``str``; ``None`` and float NaN become ``''``.

        Without this, two missing values would be stringified to the same
        text ("None" / "nan") and reported as identical.
        """
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return value if isinstance(value, str) else str(value)

    def cut2list(self, paragraph):
        """Segment *paragraph* and return its chunks as a list of strings.

        Punctuation characters are removed first, then the text is segmented
        with ``jieba.cut``; stop words are discarded and the remaining tokens
        are concatenated in consecutive groups of ``self._len``.
        """
        paragraph = ''.join(ch for ch in paragraph if ch not in punctuation)
        words = [
            w for w in jieba.cut(paragraph)
            if w not in self.stopwords and w not in punctuation
        ]
        return [
            ''.join(words[i:i + self._len])
            for i in range(0, len(words), self._len)
        ]

    def jaccard(self, str1, str2):
        """Return the Jaccard similarity of *str1* and *str2* as a float.

        Non-string arguments are converted with ``str``; missing values
        (``None`` / NaN) are treated as empty text. The result is ``0.0``
        when neither text yields any chunk.
        """
        set1 = set(self.cut2list(self._to_text(str1)))
        set2 = set(self.cut2list(self._to_text(str2)))

        union = len(set1 | set2)
        if union == 0:
            return 0.0
        return len(set1 & set2) / union


def mark_duplicates(titles, similarity, threshold=0.5):
    """Flag near-duplicate titles, keeping the first occurrence of each.

    Args:
        titles: Sequence of titles, in row order.
        similarity: Callable ``similarity(a, b)`` returning a number.
        threshold: A later title is a duplicate of an earlier kept title
            when their similarity is strictly greater than this value.

    Returns:
        A list the same length as *titles*: ``1`` for rows to keep and ``0``
        for rows marked as duplicates of an earlier kept row.
    """
    count = len(titles)
    flags = [1] * count
    for i in range(count):
        # A row already marked as a duplicate never marks other rows.
        if not flags[i]:
            continue
        for j in range(i + 1, count):
            # Check the flag first so no similarity is computed for rows
            # whose outcome is already decided.
            if flags[j] and similarity(titles[i], titles[j]) > threshold:
                flags[j] = 0
    return flags


def main():
    """Read DATA_FILE, fill the 'Duplicate' column and write the file back."""
    data = pd.read_excel(DATA_FILE, engine='openpyxl')
    jaccard = Jaccard(1)  # each comparison unit is a single token

    # 'Duplicate' is 1 for rows to keep and 0 for rows judged to repeat an
    # earlier title. The flags are assigned by position, so the result does
    # not depend on the DataFrame's index labels.
    data['Duplicate'] = mark_duplicates(data['Title'].tolist(), jaccard.jaccard)

    # Overwrites the input Excel file with the annotated table.
    data.to_excel(DATA_FILE, index=False)


if __name__ == '__main__':
    main()