常用的文本去重代码:
本代码用到了simhash库(https://github.com/yanyiwu/simhash)
import re
import os
from pathlib import Path
import json
from simhash import Simhash
def droplike(lines):
    """Drop near-duplicate lines, keeping the first occurrence of each.

    Every line must look like ``"<seq><whitespace><text>"``. Only the text
    part is fingerprinted: ``{letters+digit}`` placeholders and all non-word
    characters are removed, and ``Simhash(...).value`` of the remainder is
    used as the fingerprint. A line is dropped when its fingerprint differs
    from the fingerprint of ANY earlier line (kept or already dropped) in at
    most 3 bit positions. Each dropped line is printed together with the
    earlier line it matched.

    Args:
        lines: Indexable sequence of strings (``len()`` and ``[]`` are used).

    Returns:
        A new list with the surviving lines in their original order.

    Raises:
        ValueError: If a line has no whitespace separating seq from text.
    """
    # The progress bar is optional: tqdm is not imported at module level, so
    # import it here and fall back to plain iteration when it is unavailable.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, total=None):
            return iterable

    def distance(v1, v2, f=64):
        """Return the Hamming distance between the low *f* bits of v1 and v2."""
        # The mask keeps the operand non-negative, so bin() has no sign prefix.
        return bin((v1 ^ v2) & ((1 << f) - 1)).count('1')

    noise = re.compile(r'{[a-z]+\d}|\W')

    values = []
    for line in lines:
        parts = re.split(r'\s+', line, maxsplit=1)
        if len(parts) != 2:
            raise ValueError(
                'expected "<seq> <text>", got {!r}'.format(line))
        values.append(Simhash(noise.sub('', parts[1])).value)

    lines_new = []
    for i, line in progress(enumerate(lines), total=len(lines)):
        v1 = values[i]
        # Index iteration avoids copying values[:i] on every outer step.
        for j in range(i):
            if distance(v1, values[j]) <= 3:
                print(line, lines[j])
                break
        else:
            # No earlier fingerprint was close enough: keep this line.
            lines_new.append(line)
    return lines_new
def droplike_file(sour_file):
    """Rewrite *sour_file* in place with near-duplicate lines removed.

    The file is read as UTF-8; each line is stripped of surrounding
    whitespace and blank lines are discarded. The remaining lines are
    filtered through ``droplike`` and the survivors are written back to the
    same path as ``"<seq> <text>"``, i.e. the first whitespace run of every
    line is replaced by a single space.
    """
    with open(sour_file, 'r', encoding='utf-8') as reader:
        stripped = (raw.strip() for raw in reader)
        kept = [entry for entry in stripped if entry]

    kept = droplike(kept)

    with open(sour_file, 'w', encoding='utf-8') as writer:
        # The generator is consumed lazily, so each line is split and written
        # one at a time while the file is open.
        writer.writelines(
            '{} {}\n'.format(seq, text)
            for seq, text in (
                re.split(r'\s+', entry, maxsplit=1) for entry in kept
            )
        )