数据集图片去重

介绍

本项目Github
本项目使用aHash,目的是去除重复的数据集图片,Hash部分来自于Github项目:https://github.com/7WebPages/comparer

目录结构

  • Root
    • DeleteSameImage.py
    • HashUtils.py
    • datasets
      • images1
      • images2

快速开始

  • 按照目录结构构建项目目录,然后直接运行DeleteSameImage.py。

依赖

  • opencv(pip 包名:opencv-python)
  • PIL(pip 包名:Pillow)

代码

DeleteSameImage.py
from PIL import Image
from HashUtils import Hash

import cv2 as cv

import os

import time

class RootFile:
    """Walk a root directory and de-duplicate the images of each sub-directory.

    Every immediate sub-directory of ``path`` (except ``__pycache__``) is
    wrapped in an ``Images`` object, which works on the ``images`` folder
    inside that sub-directory.
    """

    def __init__(self, path="./"):
        # Directory whose immediate sub-directories are scanned.
        self.path = path

    def FindSubRoot(self):
        """Run ``Images.FindRepeat`` on every sub-directory of ``self.path``.

        Sub-directories are visited in sorted name order. Returns ``None``.
        """
        def GetSubRootList(path):
            # Keep only real directories and skip Python's bytecode cache.
            return [
                name
                for name in sorted(os.listdir(path))
                if name != "__pycache__"
                and os.path.isdir(os.path.join(path, name))
            ]

        for subRoot in GetSubRootList(self.path):
            print("=========================================")
            print(subRoot + " Start")
            print("=========================================")

            # Join with the root: passing the bare name would resolve it
            # against the current working directory instead of self.path.
            imgs = Images(os.path.join(self.path, subRoot))
            imgs.FindRepeat()

class Images:
    """De-duplicate the pictures stored in the ``images`` folder under ``path``."""

    def __init__(self, path="./"):
        # Folder that holds the pictures to de-duplicate.
        self.imagePath = os.path.join(path, "images")
        # Helper that deletes the near-duplicates of one target picture.
        self.sub = CompareImage(self.imagePath)

    @staticmethod
    def _SortKey(fileName):
        """Order purely numeric file stems by value, all other names after them."""
        stem = os.path.splitext(fileName)[0]
        if stem.isdecimal():
            return (0, int(stem), stem)
        return (1, 0, stem)

    def FindRepeat(self):
        """Use each remaining picture in turn as the target and delete its duplicates.

        Pictures are visited in file-name order (numeric stems by value), so
        the earliest picture of a group of duplicates is the one that is kept.
        Returns ``None``.
        """
        if not os.path.isdir(self.imagePath):
            # Not every sub-directory has an images folder; skip instead of
            # letting os.listdir raise.
            print("No images directory, skipped: ", self.imagePath)
            return

        originImageList = sorted(os.listdir(self.imagePath), key=self._SortKey)

        print("=========================================")
        print(" Start ")
        print("=========================================")

        # The list is a snapshot: files deleted while processing earlier
        # targets are detected below with os.path.exists.
        for count, targetName in enumerate(originImageList):
            print("!!count:", count)

            targetImagePath = os.path.join(self.imagePath, targetName)
            if not os.path.exists(targetImagePath):
                print("targetImage had been deleted: ", targetName)
                continue
            print(targetImagePath)

            temp = cv.imread(targetImagePath)
            if temp is None:
                # cv.imread returns None for files it cannot decode
                # (for example a stray text file in the folder).
                print("targetImage could not be read: ", targetName)
                continue
            targetImage = Image.fromarray(cv.cvtColor(temp, cv.COLOR_BGR2RGB))

            # Runs in this process so that errors in Del are visible.
            self.sub.Del(originImageList, targetImage, count)


class CompareImage:
    """Delete the pictures of one folder that are near-duplicates of a target.

    Two pictures are compared through the bit strings returned by
    ``Hash.ahash``; the score is the number of differing bits.
    """

    def __init__(self, imagePath, threshold=58):
        # Folder that contains the files named in Del's batchList.
        self.imagePath = imagePath
        # Largest score that still counts as "same picture".
        self.threshold = threshold

    def Del(self, batchList, targetImage, count):
        """Remove every file in ``batchList`` that is a duplicate of ``targetImage``.

        Args:
            batchList: file names, relative to ``self.imagePath``.
            targetImage: PIL image the other files are compared against.
            count: index of the target's own file inside ``batchList``; that
                entry is never compared or deleted.

        Returns:
            None. Matching files are deleted from disk.
        """
        targetHash = None  # computed lazily, once, on the first comparison
        subCount = 0
        for index, imageName in enumerate(batchList):
            if index == count:
                # Skip the target by position. Excluding score == 0 instead
                # would also spare exact duplicates of the target.
                continue

            imagePath = os.path.join(self.imagePath, imageName)
            if not os.path.exists(imagePath):
                print("imagePath had been deleted: ", imagePath)
                continue

            # Read through OpenCV and convert to PIL afterwards; Image.open
            # ran into "too many open files".
            temp = cv.imread(imagePath)
            if temp is None:
                # cv.imread returns None for files it cannot decode.
                print("imagePath could not be read: ", imagePath)
                continue
            image = Image.fromarray(cv.cvtColor(temp, cv.COLOR_BGR2RGB))
            subCount += 1

            if targetHash is None:
                targetHash = Hash(targetImage).ahash()
            score = Hash.calc_difference(targetHash, Hash(image).ahash())
            print("\nscore: "+str(score)+" count: "+str(count)+" size" + str(len(batchList))+" sunCount: "+str(subCount))

            if score <= self.threshold:
                print("Deleted ", imagePath)
                os.remove(imagePath)



if __name__ == "__main__":
    # Remember the start time so the total duration can be reported.
    startedAt = time.time()
    print("start-------------------------------------")

    # De-duplicate every sub-directory of the current working directory.
    RootFile().FindSubRoot()

    print("end--------------------------------------")
    print("Used time: ", time.time() - startedAt)
HashUtils.py
from PIL import Image
class Hash(object):
    """Average-hash (aHash) helper built around a single PIL image."""

    def __init__(self, image):
        """Store the image to hash.

        Args:
            image: a PIL image, or a path string that is opened with PIL.
        """
        if isinstance(image, str):
            # Load the pixels and close the file right away so no handle
            # stays open for the lifetime of this object.
            with Image.open(image) as opened:
                image = opened.copy()
        else:
            image.seek(0)
        self.image = image

    def prepare_image(self, crop_width_perc=0, crop_height_perc=0, fit_image=True):
        """Return a 128x128 grayscale copy of the image, optionally cropped.

        Args:
            crop_width_perc: fraction of the width to remove, split evenly
                between the left and right edges.
            crop_height_perc: fraction of the height to remove, split evenly
                between the top and bottom edges.
            fit_image: when true use ``ImageOps.fit``, otherwise a plain
                ``resize`` to 128x128.
        """
        result = self.image.convert('L')

        width, height = result.size
        width_crop_size = int(width * crop_width_perc / 2) if crop_width_perc > 0 else 0
        height_crop_size = int(height * crop_height_perc / 2) if crop_height_perc > 0 else 0
        if width_crop_size or height_crop_size:
            result = result.crop(
                (
                    width_crop_size,
                    height_crop_size,
                    width - width_crop_size,
                    height - height_crop_size,
                )
            )

        # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
        # Pillow 10+.
        resize_option = Image.LANCZOS
        if fit_image:
            # Imported here because the module only imports PIL.Image.
            from PIL import ImageOps
            return ImageOps.fit(result, (128, 128), resize_option)

        return result.resize((128, 128), resize_option)

    def ahash(self, img=None, hash_size=16):
        """Return the average hash as a string of '0'/'1' characters.

        The image (``img`` or, when omitted, the stored one) is converted to
        grayscale and resized to ``hash_size`` x ``hash_size``; each pixel
        brighter than the mean becomes '1', every other pixel '0'.
        """
        im = self.image if img is None else img
        im = im.convert("L").resize((hash_size, hash_size), Image.LANCZOS)
        pixels = list(im.getdata())
        average = sum(pixels) / len(pixels)
        return ''.join('1' if pixel > average else '0' for pixel in pixels)

    def calc_scores(self):
        """Return ``(name, hash)`` pairs for eight crop/resize variants.

        Each variant is prepared with ``prepare_image`` and hashed with
        ``ahash`` using an 8x8 grid.
        """
        alg = (
            ('crop', 0, 0, 8, True),  # original fitted to 128x128
            ('crop', 0, 0.1, 8, True),  # vertical 10% crop fitted to 128x128
            ('crop', 0.1, 0, 8, True),  # horizontal 10% crop fitted to 128x128
            ('crop', 0.1, 0.1, 8, True),  # vertical and horizontal 10% crop fitted to 128x128

            ('crop', 0, 0, 8, False),  # original resized to 128x128
            ('crop', 0, 0.1, 8, False),  # vertical 10% crop resized to 128x128
            ('crop', 0.1, 0, 8, False),  # horizontal 10% crop resized to 128x128
            ('crop', 0.1, 0.1, 8, False),  # vertical and horizontal 10% crop resized to 128x128
        )
        scores = []
        for item in alg:
            if item[0] == 'crop':
                width_perc, height_perc, hash_size, fit_image = item[1:]
                name = '%s_%s_%s_%s_%s' % item
                value = self.ahash(
                    img=self.prepare_image(
                        crop_width_perc=width_perc,
                        crop_height_perc=height_perc,
                        fit_image=fit_image
                    ),
                    hash_size=hash_size
                )
                scores.append((name, value))
        return scores

    @classmethod
    def calc_difference(cls, h1, h2):
        """Return the number of positions at which ``h1`` and ``h2`` differ."""
        return sum(int(a != b) for a, b in zip(h1, h2))

    @classmethod
    def predict(cls, vector):
        """Classify a feature row with a fixed logistic-regression model.

        Args:
            vector: one row of 8 numbers, or a 2-D array of such rows.
                NOTE(review): presumably the eight hash differences in
                ``calc_scores`` order - confirm against the caller.

        Returns:
            Boolean for the first row: true when the model rates class 1 as
            more likely than class 0.
        """
        # Imported here because the module does not import numpy.
        import numpy

        coefs = numpy.array(
            [
                0.30346249,
                -0.33800637,
                -0.30347395,
                -0.33800637,
                0.05190433,
                -0.20001436,
                0.07453074,
                0.29136006,
            ]
        )
        intercept = 1.98375232
        features = numpy.atleast_2d(numpy.asarray(vector, dtype=float))
        # For a binary logistic model P(class 1) > P(class 0) holds exactly
        # when the linear decision value is positive, so no fitted
        # scikit-learn estimator is needed.
        match = numpy.dot(features, coefs) + intercept > 0
        return match[0]
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值