经常会遇到文件夹里有很多重复图片的情况。
这种时候手动去删除会很浪费时间,可以采用以下代码对相同的图片进行自动删除,让每一个图片都被不重不漏地保留下来。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import shutil
import cv2
import numpy as np
import os
import pandas as pd
import traceback
# 均值哈希算法
def aHash(img, shape=(10, 10)):
    """Average hash (aHash) of a BGR image.

    Resizes the image to `shape`, converts to grayscale, and emits one bit
    per pixel: '1' if the pixel is brighter than the mean, else '0'.

    Args:
        img: BGR image (numpy array) as loaded by OpenCV.
        shape: target grid for hashing; hash length is shape[0]*shape[1].

    Returns:
        A '0'/'1' string of length shape[0]*shape[1].
    """
    img = cv2.resize(img, shape)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Threshold against the true mean. The original summed the pixels and
    # divided by a hard-coded 100, which is only correct for shape=(10, 10).
    avg = gray.mean()
    # Row-major traversal, matching the original nested i/j loops.
    return ''.join('1' if px > avg else '0' for px in gray.flatten())
# 差值感知算法
def dHash(img, shape=(10, 10)):
    """Difference hash (dHash) of a BGR image.

    Each bit compares a pixel with its right-hand neighbour: '1' if the left
    pixel is brighter, else '0'.

    Args:
        img: BGR image (numpy array) as loaded by OpenCV.
        shape: (rows, cols) of the comparison grid; hash length is
            shape[0]*shape[1].

    Returns:
        A '0'/'1' string of length shape[0]*shape[1].
    """
    # cv2.resize takes dsize as (width, height): we need one extra COLUMN so
    # that each of the shape[0] rows yields shape[1] horizontal pairs.
    # The original passed (shape[0]+1, shape[1]), which only works when the
    # shape is square; for a non-square shape it indexed out of bounds.
    img = cv2.resize(img, (shape[1] + 1, shape[0]))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    bits = []
    for i in range(shape[0]):
        for j in range(shape[1]):
            bits.append('1' if gray[i, j] > gray[i, j + 1] else '0')
    return ''.join(bits)
# 感知哈希算法(pHash)
def pHash(img, shape=(10, 10)):
    """Perceptual hash (pHash) of a BGR image.

    Resizes to 32x32, takes the DCT of the grayscale image, keeps the
    low-frequency `shape` corner, and emits one bit per coefficient:
    1 if above the corner's mean, else 0.

    Args:
        img: BGR image (numpy array) as loaded by OpenCV.
        shape: size of the low-frequency DCT corner to keep; hash length is
            shape[0]*shape[1].

    Returns:
        A list of 0/1 ints of length shape[0]*shape[1].
    """
    img = cv2.resize(img, (32, 32))  # interpolation=cv2.INTER_CUBIC optional
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # DCT requires a float input.
    dct = cv2.dct(np.float32(gray))
    # Keep the low-frequency corner. The original hard-coded dct[0:10, 0:10],
    # silently ignoring the `shape` parameter.
    dct_roi = dct[0:shape[0], 0:shape[1]]
    average = np.mean(dct_roi)
    # Row-major bit order, matching the original nested loops.
    return [
        1 if dct_roi[i, j] > average else 0
        for i in range(dct_roi.shape[0])
        for j in range(dct_roi.shape[1])
    ]
# 通过得到RGB每个通道的直方图来计算相似度
def classify_hist_with_split(image1, image2, size=(256, 256)):
    """Similarity of two images via per-channel histogram overlap.

    Resizes both images to `size`, splits each into its color channels, and
    averages calculate() over corresponding channel pairs.

    Args:
        image1, image2: images (numpy arrays) as loaded by OpenCV.
        size: common (width, height) both images are resized to first.

    Returns:
        Mean per-channel histogram similarity (same value type as calculate()).
    """
    image1 = cv2.resize(image1, size)
    image2 = cv2.resize(image2, size)
    channels1 = cv2.split(image1)
    channels2 = cv2.split(image2)
    total = 0
    for c1, c2 in zip(channels1, channels2):
        total += calculate(c1, c2)
    # Average over the actual channel count. The original divided by a
    # hard-coded 3, which is wrong for grayscale (1) or BGRA (4) input.
    return total / len(channels1)
# 计算单通道的直方图的相似值
def calculate(image1, image2):
    """Histogram-overlap similarity of two single-channel images.

    Builds a 256-bin histogram for each image and scores each bin pair as 1
    when equal, otherwise 1 - |a-b|/max(a,b); returns the mean bin score.
    """
    hist1 = cv2.calcHist([image1], [0], None, [256], [0.0, 255.0])
    hist2 = cv2.calcHist([image2], [0], None, [256], [0.0, 255.0])
    degree = 0
    # Walk the two histograms in lockstep instead of indexing by position.
    for a, b in zip(hist1, hist2):
        if a == b:
            degree = degree + 1
        else:
            degree = degree + (1 - abs(a - b) / max(a, b))
    return degree / len(hist1)
# Hash值对比
def cmpHash(hash1, hash2, shape=(10, 10)):
    """Fraction of matching positions between two equal-length hashes.

    Args:
        hash1, hash2: hash sequences ('0'/'1' strings or 0/1 lists) as
            produced by aHash/dHash/pHash.
        shape: kept for backward compatibility with existing callers; the
            denominator is now the actual hash length, so the result is
            correct even when the hash was built with a different shape
            (the original divided by shape[0]*shape[1] regardless).

    Returns:
        Similarity in [0, 1], or -1 if the lengths differ (caller error).
    """
    if len(hash1) != len(hash2):
        return -1
    if not hash1:
        # Two empty hashes: nothing to compare; match the original's 0.0.
        return 0.0
    matched = sum(1 for a, b in zip(hash1, hash2) if a == b)
    return matched / len(hash1)
def remove_similarity_picture(path, s):
    """Delete near-duplicate images inside the folder path/s.

    Compares every image with every later image using the three-channel
    histogram similarity; a later image scoring >= 0.95 against an earlier
    one is deleted from disk. An image that fails to compare (exception) is
    also deleted.

    NOTE(review): if classify_hist_with_split raises on the very first pair,
    `n` is unbound when the `if n >= 0.95` check runs; and after an
    exception `n` keeps its value from the previous pair, so the already
    deleted file may be os.remove()'d again - confirm this is intended.
    """
    # List all files under path/s (returns a list of file names).
    file = os.listdir(os.path.join(path, s))
    print(file)
    # Iterate over the files.
    for i, o in enumerate(file):
        # img1 = cv2.imread(os.path.join(path, o))
        # Re-list the folder: this file may already have been deleted as a
        # duplicate of an earlier image.
        file1 = os.listdir(os.path.join(path, s))
        if o in file1:
            print(1)
            # Still present: read it. imdecode(np.fromfile(...)) instead of
            # cv2.imread so non-ASCII (Chinese) paths load correctly.
            img1 = cv2.imdecode(np.fromfile(os.path.join(path, s, o), dtype=np.uint8), -1)
            # print('img1:',n)
            # Compare against every file after the current one.
            for a, b in enumerate(file[(i + 1):]):
                # img2 = cv2.imread(os.path.join(path, b))
                # Check again that the candidate still exists: with three or
                # more duplicates, some may already have been removed.
                file2 = os.listdir(os.path.join(path, s))
                if b in file2:
                    # Still present: read the candidate image.
                    img2 = cv2.imdecode(np.fromfile(os.path.join(path, s, b), dtype=np.uint8), -1)
                    # print('img2:',b)
                    # print(img1, img2)
                    print(o, b, "的相似度如下")
                    # Alternative similarity measures, kept for reference:
                    # hash1 = aHash(img1)
                    # hash2 = aHash(img2)
                    # n = cmpHash(hash1, hash2)
                    # print('average-hash similarity:', n)
                    #
                    # hash1 = dHash(img1)
                    # hash2 = dHash(img2)
                    # n = cmpHash(hash1, hash2)
                    # print('difference-hash similarity:', n)
                    #
                    # hash1 = pHash(img1)
                    # hash2 = pHash(img2)
                    # n = cmpHash(hash1, hash2)
                    # print('perceptual-hash similarity:', n)
                    #
                    # n = calculate(img1, img2)
                    # print('single-channel histogram similarity:', n)
                    # Compute the similarity.
                    try:
                        n = classify_hist_with_split(img1, img2)
                        print('三直方图算法相似度:', n)
                    except Exception as err:
                        info = traceback.format_exc()
                        print(info)
                        # Comparison failed (e.g. unreadable image): delete
                        # the candidate file.
                        os.remove(os.path.join(path, s, b))
                        pass
                    # Similarity >= 0.95 counts as a duplicate.
                    if n >= 0.95:
                        print('相似')
                        # Delete the duplicate image file.
                        oldpath = os.path.join(path, s, b)
                        os.remove(oldpath)
def main():
    """Read 'Parent SKU' values from LoadPicture.csv and dedupe each SKU folder.

    Each distinct SKU is expected to name a sub-folder of `path` holding that
    SKU's images; duplicates inside each folder are removed.
    """
    path = 'D:/Project/test'
    dt = pd.read_csv(path + "/LoadPicture.csv", header=0, sep=',')
    # print(dt.columns)
    # Collect each SKU once. The original appended os.path.join(path, i) but
    # tested `i not in path_list` against those joined paths, so the check
    # never matched and duplicate SKUs were processed repeatedly.
    sku_list = []
    for sku in dt['Parent SKU']:
        if sku not in sku_list:
            sku_list.append(sku)
    for sku in sku_list:
        # Pass the bare sub-folder name: remove_similarity_picture joins it
        # with `path` itself. The original passed an already-joined absolute
        # path, which only worked because ntpath.join discards its first
        # argument when the second is absolute (Windows-only behavior).
        remove_similarity_picture(path, sku)

if __name__ == "__main__":
    main()
有时候需要对大量文件或图片进行批量操作:删除、修改文件名、移动到其他文件夹、检查图片是否损坏,或者批量裁剪。下面就是完成这些操作的代码脚本。
import os
import shutil
import glob
from PIL import Image
import imghdr
import cv2
# 批量删除xml文件
def my_remove(base_path):
    """Delete every .xml file directly inside `base_path`.

    Bug fix: the original listed a hard-coded directory
    ('D:/Project/CycleGan/path/facades') but deleted from `base_path`, so it
    only worked when the two happened to be the same folder.

    Args:
        base_path: directory whose top-level .xml files are removed.
    """
    for name in os.listdir(base_path):
        # Match by real extension (case-insensitive) rather than the last
        # three characters, so e.g. 'axml' is not deleted but 'a.XML' is.
        if os.path.splitext(name)[1].lower() == '.xml':
            print(name)
            os.remove(os.path.join(base_path, name))
# 批量修改文件名
def my_filechange(base='C:/Users/Yafex/Desktop/cut_file/clean'):
    """Rename every file in `base` to sequential names 900000.jpg, 900001.jpg, ...

    Args:
        base: directory whose files are renamed in place. Defaults to the
            original hard-coded path, so existing no-argument calls behave
            the same; passing a path makes the helper reusable.
    """
    for i, filename in enumerate(os.listdir(base)):
        newname = str(900000 + i) + '.jpg'  # change '.jpg' for other formats
        print(filename)
        print(newname)
        # os.path.join instead of string concatenation with a trailing slash.
        os.rename(os.path.join(base, filename), os.path.join(base, newname))
# my_filechange()
# 批量移动文件
def my_move(srcfn, dstdir):
    """Move file `srcfn` into directory `dstdir`, creating the directory if needed.

    Prints 'srcfn error' and does nothing when `srcfn` is not an existing file
    (preserving the original best-effort behavior).

    Args:
        srcfn: path of the file to move.
        dstdir: destination directory, with or without a trailing separator.
    """
    if not os.path.isfile(srcfn):
        print('srcfn error')
    else:
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)
        # os.path.join instead of `dstdir + fn`: the original produced a
        # broken path whenever dstdir lacked a trailing slash.
        dstfn = os.path.join(dstdir, os.path.basename(srcfn))
        shutil.move(srcfn, dstfn)
# 批量移动文件
def move_all():
    """Move every 5th trainA jpg (selected by its numeric filename) into valA.

    NOTE(review): the slices below assume every path ends in exactly
    '<6 digits>.jpg' (for src_test) or '<letter><4 digits>.jpg' (for the
    src_val variant) - any other name raises ValueError on int(); confirm
    the naming convention before reuse.
    """
    # All jpg files in the trainA directory.
    fns = glob.glob('C:/Users/Yafex/Desktop/data/trainA/*.jpg')
    # Keep files whose 6-digit number (path chars [-12:-6]) is divisible by 5.
    src_test = [fn for fn in fns if not (int(fn[-12:-6]) % 5)]
    # src_test = [fn for fn in fns if fn[-9] == 'x' and 1 <= int(fn[-8:-4]) <= 120]  # alt: take 'x0001'-'x0120' as the test set
    # Same idea: take 'b0001'-'b0120' as the val set (currently unused).
    src_val = [fn for fn in fns if fn[-9] == 'b' and 1 <= int(fn[-8:-4]) <= 120]
    print(src_test)
    # Move every selected file.
    for ind in range(len(src_test)):
        my_move(src_test[ind], 'C:/Users/Yafex/Desktop/data/valA/')
    # for ind in range(len(src_val)):  # move the val set too
    #
    #     my_move(src_val[ind], 'D:/Project/CycleGan/path/to/data/A/val/')
move_all()
# 批量检查图片是否损坏
def check_picture():
    """Scan a directory and print files that imghdr cannot identify as images.

    Skips names ending in 'txt'; for every other file whose format imghdr
    fails to detect, prints the full path followed by None.
    """
    base = 'C:/Users/Yafex/Desktop/shuiyin'
    for name in os.listdir(base):
        # Ignore text files; everything else is expected to be an image.
        if name[-3:] == 'txt':
            continue
        full_path = os.path.join(base, name)
        kind = imghdr.what(full_path)
        if kind is None:
            # if full_path == 'D:/Project/CycleGan/datasets/watermark2clean/trainB\\2012_003375.jpg':
            print(full_path)
            print(kind)
        # print(full_path)
# check_picture()
# 批量裁剪图片
def make_bbox():
    """Crop YOLO-format boxes out of .png images and save each crop as a jpg.

    Expects D:/Project/test_cutting to hold matching pairs name.png / name.txt.
    NOTE(review): each label line is split on spaces and fields [2..5] are
    read as normalized x_center, y_center, width, height - i.e. the format
    has two leading fields; confirm against the label writer.
    Crops are written to D:/Project/bboxcut/roi as <name>_<n>.jpg.
    """
    path = "D:/Project/test_cutting"  # folder with images and label files
    path3 = "D:/Project/bboxcut"      # output root (must contain a 'roi' folder)
    # w = 1000  # optional resize of the source image
    # h = 1000
    img_total = []  # image basenames (no extension)
    txt_total = []  # label basenames (no extension)
    for filename in os.listdir(path):
        first, last = os.path.splitext(filename)  # 'a.png' -> ('a', '.png')
        if last == ".png":  # images are .png here
            img_total.append(first)
        else:
            txt_total.append(first)  # everything else is treated as a label
    for img_ in img_total:
        # Only process images that have a matching label file.
        if img_ not in txt_total:
            continue
        print("文件名:", img_)
        filename_img = img_ + ".png"
        path1 = os.path.join(path, filename_img)
        img = cv2.imread(path1)
        # img = cv2.resize(img, (w, h), interpolation=cv2.INTER_CUBIC)
        h, w, c = img.shape
        filename_txt = img_ + ".txt"
        n = 1  # crop counter used in the output filename
        with open(os.path.join(path, filename_txt), "r+", encoding="utf-8", errors="ignore") as f:
            for line in f:
                aa = line.split(" ")
                x_center, y_center, width, height = float(aa[2]), float(aa[3]), float(aa[4]), float(aa[5])
                # Convert normalized center/size to pixel corner coordinates.
                # (YOLO's y axis points down, matching image row order.)
                lefttopx = int(w * (x_center - width / 2.0))
                leftdownx = int(w * (x_center + width / 2.0))
                lefttopy = int(h * (y_center - height / 2.0))
                leftdowny = int(h * (y_center + height / 2.0))
                # Clamp the box inside a 3-px margin. Bug fix: when the
                # bottom/right edge overflowed, the original assigned the
                # TOP-LEFT coordinate (lefttopy = h - 3), inverting the box
                # and producing empty crops near the image border.
                lefttopy = max(lefttopy, 3)
                lefttopx = max(lefttopx, 3)
                leftdowny = min(leftdowny, h - 3)
                leftdownx = min(leftdownx, w - 3)
                # Crop [top:bottom, left:right] with a 3-px context border.
                roi = img[lefttopy - 3:leftdowny + 3, lefttopx - 3:leftdownx + 3]
                filename_last = img_ + "_" + str(n) + ".jpg"
                path2 = os.path.join(path3, "roi")  # 'roi' must already exist
                cv2.imwrite(os.path.join(path2, filename_last), roi)
                n = n + 1
# make_bbox()
def txt_match_jpg():
    """Print image/label filename pairs side by side for a quick visual check."""
    images = os.listdir('C:/Users/Yafex/Desktop/yolov5-master/datasets/watermark/images/train')
    labels = os.listdir('C:/Users/Yafex/Desktop/yolov5-master/datasets/watermark/labels/train')
    for img_name, lbl_name in zip(images, labels):
        print(img_name, lbl_name)
# txt_match_jpg()