几个数据集拼起来,不可避免地会遇到图片重复的问题。以前就用过这个库,后来丢了,现在记录一下。
git仓库: https://github.com/idealo/imagededup.git
1.安装:
方法1:
git clone https://github.com/idealo/imagededup.git
cd imagededup
pip install "cython>=0.29"
python setup.py install
方法2:
pip install imagededup
2.使用
注释写得很清楚:脚本会递归扫描目录并删除重复的图片——先扫描目录收集文件,再计算相似度,最后删除重复项。提取特征有很多种方法(PHash、DHash、CNN 等),具体看下 git 仓库的 README。
from imagededup.methods import PHash
import os
import sys # 导入sys模块
from os.path import isdir, abspath, getsize, join
from os import listdir, system
sys.setrecursionlimit(30000)  # Raise the default recursion limit (1000) to 30000 so append_filename's recursion survives deeply nested directory trees.
def append_filename(path, acc=None):
    """Recursively collect the absolute paths of all files under *path*.

    Args:
        path: Directory to scan; sub-directories are descended into
            recursively.
        acc: Optional list to append results to.  Defaults to the
            module-level ``filenames`` list for backward compatibility
            with existing callers — but note that the global accumulates
            across calls, so passing a fresh list is recommended.

    Returns:
        The accumulator list containing the absolute path of every file
        found (directories themselves are not included).
    """
    if acc is None:
        acc = filenames  # legacy behavior: shared module-level accumulator
    for entry in listdir(abspath(path)):
        entry = join(path, entry)
        if isdir(entry):
            append_filename(abspath(entry), acc)
        else:
            acc.append(abspath(entry))
    return acc
def del_zero_kb_file(path):
    """Recursively delete every zero-byte file under *path*.

    Zero-byte files would make the subsequent hashing step fail, so they
    are removed up front.

    Uses ``os.walk`` instead of the original hand-rolled recursion, which
    leaked results through the shared global ``filenames`` (repeat calls
    re-processed stale paths from earlier scans).  Deletes with
    ``os.remove`` instead of ``system('del %s' % ...)``, which was
    Windows-only and broke on paths containing spaces.

    Args:
        path: Root directory to scan recursively.
    """
    for dirpath, _subdirs, names in os.walk(abspath(path)):
        for name in names:
            target = join(dirpath, name)
            if getsize(target) == 0:
                print("[-] Deleting %s ..." % target)
                os.remove(target)
filenames = []  # Module-level accumulator used by append_filename/del_zero_kb_file; defined after them, which is fine — globals resolve at call time. NOTE(review): never cleared, so it grows across calls.
if __name__ == '__main__':
    phasher = PHash()
    # Root directory to deduplicate: one level of sub-directories, each
    # holding images.
    image_dir = r'C:\Users\jianming_ge\Downloads\Google-Image-Scraper-master\google_img'
    # Step 1: pre-clean — delete zero-byte files that would break hashing.
    del_zero_kb_file(image_dir)
    # Step 2: hash every image in every sub-directory into one encoding
    # map so duplicates ACROSS sub-directories are detected too.
    image_dir_sub = os.listdir(image_dir)
    total_encodings = {}
    for sub_dir in image_dir_sub:
        full_tmp_dir = os.path.join(image_dir, sub_dir)
        print(full_tmp_dir)
        encodings = phasher.encode_images(full_tmp_dir)
        total_encodings.update(encodings)
    duplicates = phasher.find_duplicates(encoding_map=total_encodings)
    # Step 3: delete duplicates.  find_duplicates() reports symmetric
    # pairs (A -> [B] and B -> [A]); naively deleting every listed file
    # for every key would therefore wipe out BOTH copies of each
    # duplicate group.  Track what has been removed and skip those keys
    # so exactly one representative of each group survives.
    removed = set()
    for k, v in duplicates.items():
        if k in removed:
            continue
        for file in v:
            if file in removed:
                continue
            # encode_images keys are bare filenames, so probe each
            # sub-directory for the actual location.
            for sub_dir in image_dir_sub:
                file_name_with_full_path = os.path.join(image_dir, sub_dir, file)
                if os.path.exists(file_name_with_full_path):
                    os.remove(file_name_with_full_path)
                    print(file + " del ok")
            removed.add(file)
如果图片目录只有一层(目录下直接就是图片,没有子目录),也可以简化成下面这样用:
from imagededup.methods import PHash
import os
import sys # 导入sys模块
from os.path import isdir, abspath, getsize, join
from os import listdir, system
sys.setrecursionlimit(30000)  # Raise the default recursion limit (1000) to 30000 so append_filename's recursion survives deeply nested directory trees.
def append_filename(path, acc=None):
    """Recursively collect the absolute paths of all files under *path*.

    Args:
        path: Directory to scan; sub-directories are descended into
            recursively.
        acc: Optional list to append results to.  Defaults to the
            module-level ``filenames`` list for backward compatibility
            with existing callers — but note that the global accumulates
            across calls, so passing a fresh list is recommended.

    Returns:
        The accumulator list containing the absolute path of every file
        found (directories themselves are not included).
    """
    if acc is None:
        acc = filenames  # legacy behavior: shared module-level accumulator
    for entry in listdir(abspath(path)):
        entry = join(path, entry)
        if isdir(entry):
            append_filename(abspath(entry), acc)
        else:
            acc.append(abspath(entry))
    return acc
def del_zero_kb_file(path):
    """Recursively delete every zero-byte file under *path*.

    Zero-byte files would make the subsequent hashing step fail, so they
    are removed up front.

    Uses ``os.walk`` instead of the original hand-rolled recursion, which
    leaked results through the shared global ``filenames`` (repeat calls
    re-processed stale paths from earlier scans).  Deletes with
    ``os.remove`` instead of ``system('del %s' % ...)``, which was
    Windows-only and broke on paths containing spaces.

    Args:
        path: Root directory to scan recursively.
    """
    for dirpath, _subdirs, names in os.walk(abspath(path)):
        for name in names:
            target = join(dirpath, name)
            if getsize(target) == 0:
                print("[-] Deleting %s ..." % target)
                os.remove(target)
filenames = []  # Module-level accumulator used by append_filename/del_zero_kb_file; defined after them, which is fine — globals resolve at call time. NOTE(review): never cleared, so it grows across calls.
if __name__ == '__main__':
    phasher = PHash()
    # Flat directory of images to deduplicate (no sub-directories).
    image_dir = r'./imgs'
    # Step 1: pre-clean — delete zero-byte files that would break hashing.
    del_zero_kb_file(image_dir)
    # Step 2: hash every image.  max_distance_threshold is the maximum
    # Hamming distance between two perceptual hashes for the images to be
    # considered duplicates (smaller = stricter).
    total_encodings = {}
    encodings = phasher.encode_images(image_dir)
    total_encodings.update(encodings)
    duplicates = phasher.find_duplicates(encoding_map=total_encodings, max_distance_threshold=5)
    # Step 3: delete duplicates.  find_duplicates() reports symmetric
    # pairs (A -> [B] and B -> [A]); naively deleting every listed file
    # for every key would wipe out BOTH copies of each duplicate group.
    # Track what has been removed and skip those keys so exactly one
    # representative of each group survives.
    removed = set()
    for k, v in duplicates.items():
        if k in removed:
            continue
        for file in v:
            if file in removed:
                continue
            file_name_with_full_path = os.path.join(image_dir, file)
            if os.path.exists(file_name_with_full_path):
                os.remove(file_name_with_full_path)
                print(file + " del ok")
            removed.add(file)
max_distance_threshold=5 是个很有用的参数:它是判定重复的阈值,表示两张图片感知哈希之间允许的最大汉明距离。值越小判定越严格(只有几乎一样的图片才算重复),值越大则会把更多相似图片判为重复。