介绍
本项目Github
本项目使用 aHash(均值哈希)算法去除数据集中重复的图片,哈希部分的代码来自 GitHub 项目:https://github.com/7WebPages/comparer
目录结构
- Root
- DeleteSameImage.py
- HashUtils.py
- datasets
- images1
- images2
快速开始
- 按照目录结构构建项目目录,然后直接运行DeleteSameImage.py。
依赖
- opencv(pip 包名:opencv-python)
- PIL(pip 包名:Pillow)
代码
DeleteSameImage.py
from PIL import Image
from HashUtils import Hash
import cv2 as cv
import os
import time
class RootFile:
    """Scan a root directory and de-duplicate every sub-directory in it.

    Each immediate sub-directory of ``path`` (``__pycache__`` excluded) is
    handed to ``Images``, whose ``FindRepeat`` method is then run on it.
    """

    def __init__(self, path="./"):
        # Root directory whose immediate sub-directories are processed.
        self.path = path

    def FindSubRoot(self):
        """Run ``Images(...).FindRepeat()`` for every sub-directory of the root.

        Sub-directories are visited in sorted name order. Returns ``None``.
        """

        def GetSubRootList(path):
            """Return the sorted names of the directories directly under *path*."""
            return [
                name
                for name in sorted(os.listdir(path))
                if name != "__pycache__"
                and os.path.isdir(os.path.join(path, name))
            ]

        for subRoot in GetSubRootList(self.path):
            print("=========================================")
            print(subRoot + " Start")
            print("=========================================")
            # Join with the root: passing the bare name only worked when the
            # root happened to be the current working directory.
            imgs = Images(os.path.join(self.path, subRoot))
            imgs.FindRepeat()
class Images:
    """The ``images`` folder of one dataset directory and its de-duplication loop."""

    def __init__(self, path="./"):
        # Folder that holds the image files: ``<path>/images``.
        self.imagePath = path + "/images"
        # Helper that compares one target image against the whole folder.
        self.sub = CompareImage(self.imagePath)

    def FindRepeat(self):
        """Use every file in the image folder, in turn, as the comparison target.

        For each target that still exists and can be decoded, ``self.sub.Del``
        is called with the full file list, the decoded target image and the
        target's index in that list. Files that have disappeared in the
        meantime (deleted by an earlier pass) or that OpenCV cannot decode are
        reported and skipped. A missing image folder is reported and skipped
        instead of raising. Returns ``None``.
        """
        if not os.path.isdir(self.imagePath):
            print("image folder not found, skipped: ", self.imagePath)
            return

        originImageList = os.listdir(self.imagePath)
        # Order by file name with the last four characters (e.g. ".jpg")
        # ignored. This is a plain string comparison, not a numeric one.
        originImageList.sort(key=lambda x: x[:-4])
        print("=========================================")
        print(" Start ")
        print("=========================================")

        for count, targetName in enumerate(originImageList):
            print("!!count:", count)
            targetImagePath = os.path.join(self.imagePath, targetName)
            if not os.path.exists(targetImagePath):
                print("targetImage had been deleted: ", targetName)
                continue
            print(targetImagePath)
            temp = cv.imread(targetImagePath)
            if temp is None:
                # cv.imread signals an unreadable / non-image file with None.
                print("targetImage could not be read, skipped: ", targetName)
                continue
            targetImage = Image.fromarray(cv.cvtColor(temp, cv.COLOR_BGR2RGB))
            # Single-process on purpose: errors raised here stay visible.
            self.sub.Del(originImageList, targetImage, count)
class CompareImage:
    """Delete images in a folder that are near-duplicates of a target image."""

    def __init__(self, imagePath):
        # Folder containing the files named in ``batchList``.
        self.imagePath = imagePath

    def Del(self, batchList, targetImage, count, threshold=58):
        """Delete every file in *batchList* whose aHash is close to the target's.

        Args:
            batchList: File names (relative to ``self.imagePath``) to compare.
            targetImage: Decoded image the candidates are compared against.
            count: Index of the target image inside *batchList*. That entry is
                skipped so the target never deletes itself.
            threshold: Largest number of differing hash characters that still
                counts as a duplicate. Defaults to the previously hard-coded 58.

        Exact duplicates (distance 0) are deleted as well. The earlier
        ``score > 0`` guard, meant only to protect the target itself, also
        kept every byte-identical copy alive.
        """
        # The target hash is loop-invariant, so compute it once.
        targetHash = Hash(targetImage).ahash()
        subCount = 0
        for index, imageName in enumerate(batchList):
            if index == count:
                continue
            imagePath = os.path.join(self.imagePath, imageName)
            if not os.path.exists(imagePath):
                print("imagePath had been deleted: ", imagePath)
                continue
            # Read with OpenCV and convert to a PIL image; this avoids the
            # "too many open files" problem seen with Image.open.
            temp = cv.imread(imagePath)
            if temp is None:
                # cv.imread signals an unreadable / non-image file with None.
                print("imagePath could not be read, skipped: ", imagePath)
                continue
            image = Image.fromarray(cv.cvtColor(temp, cv.COLOR_BGR2RGB))
            subCount += 1
            # Hamming distance between the two hash strings.
            score = sum(a != b for a, b in zip(targetHash, Hash(image).ahash()))
            print("\nscore: "+str(score)+" count: "+str(count)+" size" + str(len(batchList))+" sunCount: "+str(subCount))
            if score <= threshold:
                print("Deleted ", imagePath)
                os.remove(imagePath)
if __name__ == "__main__":
    # Process every sub-directory of the current directory and report the
    # wall-clock time the whole run took.
    started = time.time()
    print("start-------------------------------------")
    RootFile().FindSubRoot()
    print("end--------------------------------------")
    print("Used time: ", time.time() - started)
HashUtils.py
from PIL import Image
class Hash(object):
    """Average-hash (aHash) helper wrapped around a single image."""

    def __init__(self, image):
        """Store *image* for hashing.

        Args:
            image: Either an object with the PIL image interface (it must
                offer ``seek`` and ``convert``) or a ``str`` file path.
        """
        if isinstance(image, str):
            # A path used to be stored as-is and failed later in ``convert``.
            # Decode it fully and close the file immediately so hashing many
            # files does not keep file handles open.
            with Image.open(image) as opened:
                image = opened.copy()
        else:
            image.seek(0)
        self.image = image

    def prepare_image(self, crop_width_perc=0, crop_height_perc=0, fit_image=True):
        """Return a 128x128 grayscale version of the image, optionally cropped.

        Args:
            crop_width_perc: Fraction of the width to remove, split evenly
                between the left and right edges. Values <= 0 disable it.
            crop_height_perc: Same for the height (top and bottom edges).
            fit_image: When true use ``ImageOps.fit``; otherwise a plain
                ``resize`` to 128x128.
        """
        # Imported here because the module header only imports ``Image``.
        from PIL import ImageOps

        result = self.image.convert('L')

        width, height = result.size
        width_crop_size = int(width * crop_width_perc / 2) if crop_width_perc > 0 else 0
        height_crop_size = int(height * crop_height_perc / 2) if crop_height_perc > 0 else 0
        if width_crop_size or height_crop_size:
            result = result.crop(
                (
                    width_crop_size,
                    height_crop_size,
                    width - width_crop_size,
                    height - height_crop_size,
                )
            )

        # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
        # Pillow >= 10.
        resize_option = Image.LANCZOS
        if fit_image:
            return ImageOps.fit(result, (128, 128), resize_option)
        return result.resize((128, 128), resize_option)

    def ahash(self, img=None, hash_size=16):
        """Return the average hash as a string of ``'0'``/``'1'`` characters.

        The image (*img*, or the wrapped image when *img* is ``None``) is
        converted to grayscale and resized to ``hash_size`` x ``hash_size``.
        Each pixel brighter than the mean contributes ``'1'``, every other
        pixel ``'0'``, so the result has ``hash_size ** 2`` characters.
        """
        im = self.image if img is None else img
        im = im.convert("L").resize((hash_size, hash_size), Image.LANCZOS)

        pixels = list(im.getdata())
        average = sum(pixels) / len(pixels)
        return ''.join('1' if pixel > average else '0' for pixel in pixels)

    def calc_scores(self):
        """Return ``(name, hash)`` pairs for eight crop / resize variants.

        Each variant is prepared with ``prepare_image`` and hashed with
        ``ahash`` using the hash size listed in the table below.
        """
        # (kind, crop_width_perc, crop_height_perc, hash_size, fit_image)
        alg = (
            ('crop', 0, 0, 8, True),       # original fitted to 128x128
            ('crop', 0, 0.1, 8, True),     # vertical 10% crop fitted to 128x128
            ('crop', 0.1, 0, 8, True),     # horizontal 10% crop fitted to 128x128
            ('crop', 0.1, 0.1, 8, True),   # vertical and horizontal 10% crop fitted to 128x128
            ('crop', 0, 0, 8, False),      # original resized to 128x128
            ('crop', 0, 0.1, 8, False),    # vertical 10% crop resized to 128x128
            ('crop', 0.1, 0, 8, False),    # horizontal 10% crop resized to 128x128
            ('crop', 0.1, 0.1, 8, False),  # vertical and horizontal 10% crop resized to 128x128
        )

        scores = []
        for item in alg:
            if item[0] == 'crop':
                width_perc, height_perc, hash_size, fit_image = item[1:]
                name = '%s_%s_%s_%s_%s' % item
                value = self.ahash(
                    img=self.prepare_image(
                        crop_width_perc=width_perc,
                        crop_height_perc=height_perc,
                        fit_image=fit_image,
                    ),
                    hash_size=hash_size,
                )
                scores.append((name, value))
        return scores

    @classmethod
    def calc_difference(cls, h1, h2):
        """Return the number of positions at which *h1* and *h2* differ.

        Comparison stops at the end of the shorter sequence.
        """
        return sum(int(a != b) for a, b in zip(h1, h2))

    @classmethod
    def predict(cls, vector):
        """Classify the first row of *vector* with a fixed logistic model.

        Args:
            vector: 2-D array-like with eight values per row.
                NOTE(review): presumably the eight per-variant hash
                differences derived from ``calc_scores`` - confirm with the
                caller.

        Returns:
            A boolean (numpy bool) that is true when the model gives the
            positive class a higher probability than the negative class.
        """
        # Imported here because the module header does not import numpy.
        import numpy

        coefs = numpy.array(
            [
                [
                    0.30346249,
                    -0.33800637,
                    -0.30347395,
                    -0.33800637,
                    0.05190433,
                    -0.20001436,
                    0.07453074,
                    0.29136006,
                ]
            ]
        )
        intercept = numpy.array([1.98375232])

        # For a two-class logistic model P(1) > P(0) is the same as a positive
        # linear decision value, so no scikit-learn estimator is required.
        decision = numpy.array(vector).dot(coefs.T) + intercept
        match = decision[:, 0] > 0
        return match[0]