首先导入所需的包，并设置待查重数据所在的目录
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import re
from functools import reduce
from math import sqrt
# Directory whose contents will be checked for duplication.
# NOTE: "." (the current working directory) is only a placeholder default so
# that the script parses; replace it with the directory you want to scan.
path = "."
# Names of every entry (sub-directories and files) directly inside `path`.
filelist = os.listdir(path)
比较相似度的核心代码部分
class Similarity(object):
def __init__(self, target1, target2):
self.target1 = target1
self.target2 = target2
def vector(self):
self.vdict1 = {
}
self.vdict2 = {
}
for target in re.findall('([a-zA-Z0-9_.&%]+)+', self.target1):
self.vdict1[target] = self.vdict1.get(target, 0) + 1
for target in re.findall('([a-zA-Z0-9_.&%]+)+', self.target2):
self.vdict2[target] = self.vdict2.get(target, 0) + 1
def mix(self):
def mapminmax(