几个数据集拼起来,不可避免地会遇到图片重复的问题。以前就用过这个库,后来丢了,现在记录一下。
git仓库: https://github.com/idealo/imagededup.git
1.安装:
方法1:
git clone https://github.com/idealo/imagededup.git
cd imagededup
pip install "cython>=0.29"
python setup.py install
方法2:
pip install imagededup
2.使用
注释写得很清楚:脚本会递归扫描目录并删除重复的图片——先扫描目录收集文件,再计算相似度,最后删除重复项。提取特征有很多种方法(PHash、DHash、CNN 等),具体看下 git 仓库的 README。
from imagededup.methods import PHash
import os
import sys # 导入sys模块
from os.path import isdir, abspath, getsize, join
from os import listdir, system
sys.setrecursionlimit(30000)  # Raise the default recursion limit (1000) to 30000 so append_filename's recursion survives deeply nested directory trees.
def append_filename(path, acc=None):
    """Recursively collect the absolute paths of all files under *path*.

    Args:
        path: Directory to scan; sub-directories are descended into
            recursively.
        acc: Optional list to append results to.  Defaults to the
            module-level ``filenames`` list for backward compatibility
            with existing callers — but note that the global accumulates
            across calls, so passing a fresh list is recommended.

    Returns:
        The accumulator list containing the absolute path of every file
        found (directories themselves are not included).
    """
    if acc is None:
        acc = filenames  # legacy behavior: shared module-level accumulator
    for entry in listdir(abspath(path)):
        entry = join(path, entry)
        if isdir(entry):
            append_filename(abspath(entry), acc)
        else:
            acc.append(abspath(entry))
    return acc
def del_zero_kb_file(path):
    """Recursively delete every zero-byte file under *path*.

    Zero-byte files would make the subsequent hashing step fail, so they
    are removed up front.

    Uses ``os.walk`` instead of the original hand-rolled recursion, which
    leaked results through the shared global ``filenames`` (repeat calls
    re-processed stale paths from earlier scans).  Deletes with
    ``os.remove`` instead of ``system('del %s' % ...)``, which was
    Windows-only and broke on paths containing spaces.

    Args:
        path: Root directory to scan recursively.
    """
    for dirpath, _subdirs, names in os.walk(abspath(path)):
        for name in names:
            target = join(dirpath, name)
            if getsize(target) == 0:
                print("[-] Deleting %s ..." % target)
                os.remove(target)
filenames = []  # Module-level accumulator used by append_filename/del_zero_kb_file; defined after them, which is fine — globals resolve at call time. NOTE(review): never cleared, so it grows across calls.
if __name__ == '__main__':
    phasher = PHash()
    # Root directory to deduplicate: one level of sub-directories, each
    # holding images.
    image_dir = r'C:\Users\jianming_ge\Downloads\Google-Image-Scraper-master\google_img'
    # Step 1: pre-clean — delete zero-byte files that would break hashing.
    del_zero_kb_file(image_dir)
    # Step 2: hash every image in every sub-directory into one encoding
    # map so duplicates ACROSS sub-directories are detected too.
    image_dir_sub = os.listdir(image_dir)
    total_encodings = {}
    for sub_dir in image_dir_sub:
        full_tmp_dir = os.path.join(image_dir, sub_dir)
        print(full_tmp_dir)
        encodings = phasher.encode_images(full_tmp_dir)
        total_encodings.update(encodings)
    duplicates = phasher.find_duplicates(encoding_map=total_encodings)
    # Step 3: delete duplicates.  find_duplicates() reports symmetric
    # pairs (A -> [B] and B -> [A]); naively deleting every listed file
    # for every key would therefore wipe out BOTH copies of each
    # duplicate group.  Track what has been removed and skip those keys
    # so exactly one representative of each group survives.
    removed = set()
    for k, v in duplicates.items():
        if k in removed:
            continue
        for file in v:
            if file in removed:
                continue
            # encode_images keys are bare filenames, so probe each
            # sub-directory for the actual location.
            for sub_dir in image_dir_sub:
                file_name_with_full_path = os.path.join(image_dir, sub_dir, file)
                if os.path.exists(file_name_with_full_path):
                    os.remove(file_name_with_full_path)
                    print(file + " del ok")
            removed.add(file)
如果图片目录只有一层(目录下直接就是图片,没有子目录),也可以简化成下面这样用:
from imagededup.methods import PHash
import os
import sys # 导入sys模块
from os.path import isdir, abspath, getsize, join
from os import listdir, system
sys.setrecursionlimit(30000)  # Raise the default recursion limit (1000) to 30000 so append_filename's recursion survives deeply nested directory trees.
def append_filename(path, acc=None):
    """Recursively collect the absolute paths of all files under *path*.

    Args:
        path: Directory to scan; sub-directories are descended into
            recursively.
        acc: Optional list to append results to.  Defaults to the
            module-level ``filenames`` list for backward compatibility
            with existing callers — but note that the global accumulates
            across calls, so passing a fresh list is recommended.

    Returns:
        The accumulator list containing the absolute path of every file
        found (directories themselves are not included).
    """
    if acc is None:
        acc = filenames  # legacy behavior: shared module-level accumulator
    for entry in listdir(abspath(path)):
        entry = join(path, entry)
        if isdir(entry):
            append_filename(abspath(entry), acc)
        else:
            acc.append(abspath(entry))
    return acc
def del_zero_kb_file(path):
    """Recursively delete every zero-byte file under *path*.

    Zero-byte files would make the subsequent hashing step fail, so they
    are removed up front.

    Uses ``os.walk`` instead of the original hand-rolled recursion, which
    leaked results through the shared global ``filenames`` (repeat calls
    re-processed stale paths from earlier scans).  Deletes with
    ``os.remove`` instead of ``system('del %s' % ...)``, which was
    Windows-only and broke on paths containing spaces.

    Args:
        path: Root directory to scan recursively.
    """
    for dirpath, _subdirs, names in os.walk(abspath(path)):
        for name in names:
            target = join(dirpath, name)
            if getsize(target) == 0:
                print("[-] Deleting %s ..." % target)
                os.remove(target)
filenames = []  # Module-level accumulator used by append_filename/del_zero_kb_file; defined after them, which is fine — globals resolve at call time. NOTE(review): never cleared, so it grows across calls.
if __name__ == '__main__':
    phasher = PHash()
    # Flat directory of images to deduplicate (no sub-directories).
    image_dir = r'./imgs'
    # Step 1: pre-clean — delete zero-byte files that would break hashing.
    del_zero_kb_file(image_dir)
    # Step 2: hash every image.  max_distance_threshold is the maximum
    # Hamming distance between two perceptual hashes for the images to be
    # considered duplicates (smaller = stricter).
    total_encodings = {}
    encodings = phasher.encode_images(image_dir)
    total_encodings.update(encodings)
    duplicates = phasher.find_duplicates(encoding_map=total_encodings, max_distance_threshold=5)
    # Step 3: delete duplicates.  find_duplicates() reports symmetric
    # pairs (A -> [B] and B -> [A]); naively deleting every listed file
    # for every key would wipe out BOTH copies of each duplicate group.
    # Track what has been removed and skip those keys so exactly one
    # representative of each group survives.
    removed = set()
    for k, v in duplicates.items():
        if k in removed:
            continue
        for file in v:
            if file in removed:
                continue
            file_name_with_full_path = os.path.join(image_dir, file)
            if os.path.exists(file_name_with_full_path):
                os.remove(file_name_with_full_path)
                print(file + " del ok")
            removed.add(file)
max_distance_threshold=5 是个很有用的参数:它是判定重复的阈值,表示两张图片感知哈希之间允许的最大汉明距离。值越小判定越严格(只有几乎一样的图片才算重复),值越大则会把更多相似图片判为重复。