使用哈希算法删除相似图像
哈希算法删除相似图像
对数据处理后,使用哈希算法删除相似图像,解决相同内容、不同分辨率的图像无法被删除的问题。
数据增强
随机垂直翻转
if random.random() > 0.5:
image = image.transpose(Image.FLIP_TOP_BOTTOM)
steps.append(('Vertical Flip', image.copy()))
随机旋转
angle = random.uniform(-10, 10)
image = image.rotate(angle)
steps.append(('Rotation', image.copy()))
随机裁剪
width, height = image.size
left = random.uniform(0, 0.2 * width)
top = random.uniform(0, 0.2 * height)
right = width - random.uniform(0, 0.2 * width)
bottom = height - random.uniform(0, 0.2 * height)
image = image.crop((left, top, right, bottom))
image = image.resize(image_size)
steps.append(('Crop', image.copy()))
添加缩放
scale_factor = random.uniform(0.8, 1.2)
image = image.resize((int(image_size[0] * scale_factor), int(image_size[1] * scale_factor)))
image = image.resize(image_size)
steps.append(('Scaling', image.copy()))
亮度调整
enhancer = ImageEnhance.Brightness(image)
image = enhancer.enhance(random.uniform(0.8, 1.2))
steps.append(('Brightness Adjustment', image.copy()))
处理图像
判断图像是否损坏
with Image.open(filepath) as img:
img.verify() # 检查图像是否损坏
except (UnidentifiedImageError, OSError):
damaged_images.append(filepath)
os.remove(filepath) # 删除损坏的图像
哈希删除相似图像
with Image.open(filepath) as img:
img = img.resize(image_size) # 标准化图像大小
hash = imagehash.average_hash(img, hash_size)
if hash in hash_dict:
duplicates += 1
deleted_images.append(filepath)
os.remove(filepath) # 删除相似图像
else:
hash_dict[hash] = filepath
total_images += 1
内容相同分辨率不同如何解决
思路:在计算哈希前统一图像分辨率
with Image.open(filepath) as img:
img = img.resize(image_size) # 标准化图像大小
完整代码
import os
import shutil
from PIL import Image, ImageEnhance, UnidentifiedImageError
import imagehash
import torch
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
# Source images, train/val split output, and augmented output locations.
image_dir = 'fruits'
dataset_dir = 'dataset'
output_dir = 'augmented_dataset'

# Ensure both output directories exist before any processing starts.
for _dir in (dataset_dir, output_dir):
    os.makedirs(_dir, exist_ok=True)

# Target resolution used for hashing, cropping and scaling throughout.
image_size = (224, 224)
# 自定义数据增强函数
def custom_transform(image):
    """Apply a randomized augmentation pipeline to a PIL image.

    Returns (augmented_image, steps), where steps is a list of
    (label, snapshot) pairs recording the image after each stage.
    """
    steps = [('Original', image.copy())]

    # Vertical flip with probability 0.5.
    if random.random() > 0.5:
        image = image.transpose(Image.FLIP_TOP_BOTTOM)
    steps.append(('Vertical Flip', image.copy()))

    # Small random rotation in [-10, 10] degrees.
    image = image.rotate(random.uniform(-10, 10))
    steps.append(('Rotation', image.copy()))

    # Random crop: trim up to 20% from each edge, then restore target size.
    # Tuple elements evaluate left-to-right, preserving the RNG call order.
    w, h = image.size
    box = (
        random.uniform(0, 0.2 * w),
        random.uniform(0, 0.2 * h),
        w - random.uniform(0, 0.2 * w),
        h - random.uniform(0, 0.2 * h),
    )
    image = image.crop(box).resize(image_size)
    steps.append(('Crop', image.copy()))

    # Random rescale (0.8x-1.2x) followed by a resize back to target size.
    factor = random.uniform(0.8, 1.2)
    scaled = (int(image_size[0] * factor), int(image_size[1] * factor))
    image = image.resize(scaled).resize(image_size)
    steps.append(('Scaling', image.copy()))

    # Random brightness adjustment in [0.8, 1.2].
    image = ImageEnhance.Brightness(image).enhance(random.uniform(0.8, 1.2))
    steps.append(('Brightness Adjustment', image.copy()))

    return image, steps
# 读取并处理图像
def process_images(image_dir, hash_size=8):
    """Walk image_dir, deleting corrupted images and perceptual duplicates.

    Every image is resized to the common ``image_size`` before hashing, so
    identical content at different resolutions produces the same hash.

    Returns a tuple (kept_count, duplicate_count, deleted_paths, damaged_paths).

    Fix: the original incremented ``total_images`` only for kept images and
    then returned ``total_images - duplicates``, double-subtracting the
    duplicates; now every valid image is counted and the kept count is correct.
    """
    hash_dict = {}        # average-hash -> first file seen with that hash
    duplicates = 0
    total_images = 0      # every valid (non-corrupted) image processed
    deleted_images = []
    damaged_images = []
    for subdir, dirs, files in os.walk(image_dir):
        for file in files:
            filepath = os.path.join(subdir, file)
            if not filepath.lower().endswith(('png', 'jpg', 'jpeg')):
                continue
            try:
                with Image.open(filepath) as img:
                    img.verify()  # detect corrupted files
                # verify() leaves the image unusable, so reopen for hashing.
                with Image.open(filepath) as img:
                    img = img.resize(image_size)  # normalize resolution before hashing
                    # `img_hash` instead of `hash` to avoid shadowing the builtin.
                    img_hash = imagehash.average_hash(img, hash_size)
                total_images += 1
                if img_hash in hash_dict:
                    duplicates += 1
                    deleted_images.append(filepath)
                    os.remove(filepath)  # delete the perceptual duplicate
                else:
                    hash_dict[img_hash] = filepath
            except (UnidentifiedImageError, OSError):
                damaged_images.append(filepath)
                os.remove(filepath)  # delete the corrupted image
    # kept = all processed minus the duplicates that were removed
    return total_images - duplicates, duplicates, deleted_images, damaged_images
# 划分数据集
def split_dataset(image_dir, dataset_dir, test_size=0.1):
    """Split each class's images into train/val folders under dataset_dir."""
    for fruit in os.listdir(image_dir):
        fruit_dir = os.path.join(image_dir, fruit)
        if not os.path.isdir(fruit_dir):
            continue
        images = [
            os.path.join(fruit_dir, name)
            for name in os.listdir(fruit_dir)
            if name.lower().endswith(('png', 'jpg', 'jpeg'))
        ]
        # Fixed random_state makes the split reproducible across runs.
        train_imgs, val_imgs = train_test_split(images, test_size=test_size, random_state=42)
        # Copy each partition into dataset_dir/<fruit>/<train|val>.
        for split_name, split_files in (('train', train_imgs), ('val', val_imgs)):
            target_dir = os.path.join(dataset_dir, fruit, split_name)
            os.makedirs(target_dir, exist_ok=True)
            for src in split_files:
                shutil.copy(src, target_dir)
# 数据增强和保存
def augment_and_save(custom_transform, dataset_dir, output_dir):
    """Augment every image in dataset_dir, mirroring the <fruit>/<split> layout.

    Fix: the original rebuilt ``ImageFolder(root=dataset_dir)`` over the WHOLE
    dataset inside the per-split loop, so images from both the train and val
    splits were written into every split's output folder (and each image went
    through a lossy tensor -> PIL round-trip). Reading files directly from the
    split folder fixes both problems and avoids re-scanning the dataset
    once per fruit.
    """
    for dataset_type in ['train', 'val']:
        for fruit in os.listdir(dataset_dir):
            fruit_dir = os.path.join(dataset_dir, fruit, dataset_type)
            if not os.path.isdir(fruit_dir):
                continue
            output_fruit_dir = os.path.join(output_dir, fruit, dataset_type)
            os.makedirs(output_fruit_dir, exist_ok=True)
            # Only this split's own files; sorted for stable output numbering.
            filenames = sorted(
                name for name in os.listdir(fruit_dir)
                if name.lower().endswith(('png', 'jpg', 'jpeg'))
            )
            for idx, name in enumerate(filenames):
                with Image.open(os.path.join(fruit_dir, name)) as img:
                    # Convert to RGB so saving as JPEG cannot fail on RGBA/P modes.
                    transformed_image, _ = custom_transform(img.convert('RGB'))
                transformed_image.save(os.path.join(output_fruit_dir, f'{idx}.jpg'))
# 执行图像处理和数据集划分
# Stage 1: deduplicate and clean the raw image directory, then report.
num_images, num_duplicates, deleted_images, damaged_images = process_images(image_dir)
print(f"总共处理了 {num_images} 张图片,删除了 {num_duplicates} 张相似图片。")
print(f"删除了 {len(damaged_images)} 张损坏的图片。")
for title, removed in (("删除的相似图片:", deleted_images), ("删除的损坏图片:", damaged_images)):
    if removed:
        print(title)
        for img in removed:
            print(img)

# Stage 2: split the cleaned images into train/val sets.
split_dataset(image_dir, dataset_dir)
print("数据集划分完成。")

# Stage 3: run the augmentation pipeline over the split dataset.
augment_and_save(custom_transform, dataset_dir, output_dir)
print("数据增强完成并保存到augmented_dataset目录。")
# 从某一类中随机选取一张图像,展示其数据增强效果
def show_augmentation_example(image_dir):
    """Pick one random training image and plot every augmentation step.

    Fix: the original chose from all ``os.listdir`` entries (including stray
    files) and called ``random.choice`` on a possibly empty image list, which
    raises IndexError; both cases are now guarded.
    """
    # Keep only entries that are real class directories with a train split.
    fruits = [
        name for name in os.listdir(image_dir)
        if os.path.isdir(os.path.join(image_dir, name, 'train'))
    ]
    if not fruits:
        return
    random_fruit = random.choice(fruits)
    fruit_dir = os.path.join(image_dir, random_fruit, 'train')
    images = [
        os.path.join(fruit_dir, img) for img in os.listdir(fruit_dir)
        if img.lower().endswith(('png', 'jpg', 'jpeg'))
    ]
    if not images:  # random.choice raises IndexError on an empty sequence
        return
    random_image_path = random.choice(images)
    with Image.open(random_image_path) as img:
        _, steps = custom_transform(img)
    # One subplot per augmentation stage, stacked vertically.
    plt.figure(figsize=(12, 24))
    for i, (step_name, step_image) in enumerate(steps):
        plt.subplot(len(steps), 1, i + 1)
        plt.title(step_name)
        plt.imshow(step_image)
    plt.show()
# Visualize the augmentation pipeline on one random image from the split dataset.
show_augmentation_example(dataset_dir)