本专栏更新一些 Python 代码,旨在为新手提供学习内容。本人也会持续输出有价值的内容,欢迎关注。你的关注和点赞是我最大的动力,谢谢。
Jaccard 就不多介绍了:它用两个集合的交集大小除以并集大小来衡量相似度,即 J(A, B) = |A∩B| / |A∪B|,详细原理可以看站内跳转。
实习时做新闻文本去重,希望先按标题筛选一遍。标题一般只有十几二十个字,用 Jaccard 配合 jieba 分词效果好、速度快,下面是核心代码。
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
------------------------------
# @Author : aikey
# @Github : @aikeywitt
# @Time : 2024/01/11 16:23
# @File : deduplicator.py
------------------------------
"""
import os

import jieba
import pandas as pd

from config import *
# Characters stripped from a title before it is tokenized: ASCII and CJK
# punctuation plus whitespace.
# The previous literal ended in r'\\s'. Inside a set of characters that added
# a backslash and the letter "s" (so every "s" was deleted from English words)
# instead of whitespace. Whitespace is now listed explicitly and "s" is no
# longer treated as punctuation; the backslash itself is still stripped.
punctuation = (
    set(r'./ <>_-=,"。?!“”:‘’@#¥%…&×()——+【】{};;●~|\:[]')
    | set(' \t\r\n\u3000')
)
class Jaccard:
    """Jaccard similarity between two texts segmented with jieba.

    A text is stripped of the characters in ``punctuation``, segmented with
    ``jieba.cut``, filtered against the stop-word list, and grouped into
    non-overlapping runs of ``_len`` tokens. The similarity of two texts is
    ``|A ∩ B| / |A ∪ B|`` over the sets of those runs.
    """

    def __init__(self, _len):
        """Store the group length and load the stop-word list.

        Args:
            _len: Number of consecutive tokens joined into one group; used as
                the ``range`` step in ``cut2list``, so it should be a
                positive integer.

        Raises:
            OSError: If ``STOPWORDS_FILE`` cannot be opened.
        """
        self._len = _len
        # Not read anywhere in this class; kept for callers that may use it.
        self.savePath = os.path.join(APPPATH, 'data')
        with open(STOPWORDS_FILE, 'r', encoding='utf-8') as f:
            # One stop word per line; surrounding whitespace is dropped.
            self.stopwords = {line.strip() for line in f}

    def cut2list(self, paragraph):
        """Split ``paragraph`` into a list of token groups.

        Args:
            paragraph: Text to segment.

        Returns:
            A list of strings, each the concatenation of up to ``_len``
            consecutive tokens (the last group may be shorter).
        """
        cleaned = ''.join(ch for ch in paragraph if ch not in punctuation)
        words = [
            w for w in jieba.cut(cleaned)
            if w not in self.stopwords and w not in punctuation
        ]
        step = self._len
        return [''.join(words[i:i + step]) for i in range(0, len(words), step)]

    def jaccard(self, str1, str2):
        """Return the Jaccard similarity of two texts as a float.

        Non-string arguments are converted with ``str`` first.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            A float in ``[0.0, 1.0]``; ``0.0`` when neither text yields any
            token group (previously this case returned the int ``0``).
        """
        set1 = set(self.cut2list(str(str1)))
        set2 = set(self.cut2list(str(str2)))
        union = set1 | set2
        if not union:
            # Both texts are empty after cleaning: avoid dividing by zero.
            return 0.0
        return len(set1 & set2) / len(union)
def mark_duplicates(titles, similarity, threshold=0.5):
    """Flag near-duplicate titles, keeping the first of each similar group.

    Row ``j`` is flagged when an earlier row ``i`` that is itself still kept
    has ``similarity(titles[i], titles[j]) > threshold``. Rows already
    flagged are skipped before ``similarity`` is called, so no comparison is
    computed whose result could not change the outcome.

    Args:
        titles: Sequence of titles, in row order.
        similarity: Callable taking two titles and returning a number.
        threshold: Strict lower bound above which two titles count as
            duplicates.

    Returns:
        A list the same length as ``titles`` holding ``1`` for rows to keep
        and ``0`` for rows flagged as duplicates.
    """
    count = len(titles)
    flags = [1] * count
    for i in range(count):
        if flags[i] == 0:
            # A flagged row never flags others.
            continue
        for j in range(i + 1, count):
            if flags[j] == 1 and similarity(titles[i], titles[j]) > threshold:
                flags[j] = 0
    return flags


if __name__ == '__main__':
    data = pd.read_excel(DATA_FILE, engine='openpyxl')
    jaccard = Jaccard(1)  # each group is a single token

    # 'Duplicate' is 1 for rows to keep and 0 for rows flagged as duplicates.
    # Titles are read once as a list and the flags assigned by position, so
    # the result does not depend on the DataFrame's index labels.
    data['Duplicate'] = mark_duplicates(data['Title'].tolist(), jaccard.jaccard)

    # NOTE: this overwrites the input workbook in place.
    data.to_excel(DATA_FILE, index=False)
感谢看到最后,我是牛有果,如果这篇文章帮到你,请点一个免费的关注和赞,谢谢,有问题可以在评论区留言。