python图片去重

爬虫抓取的图片经常出现文件名不同、但内容相同或相似的情况,因此需要对图片进行去重。
下面介绍两种方法:第一种用 MD5 去除内容完全相同的文件,第二种用 SSIM 去除内容相似的图片。

一、通过 MD5 去重(以下代码为 Python 2 写法)

import md5
import os
from time import clock as now


def getmd5(filename):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is read in fixed-size chunks so a large image is never held in
    memory all at once, and the handle is closed even if reading fails.

    Args:
        filename: Path of the file to hash.

    Returns:
        The MD5 digest of the file contents as a hex string.
    """
    # Local import: hashlib supersedes the legacy ``md5`` module, which is
    # not available on Python 3.
    import hashlib

    digest = hashlib.md5()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()


def main():
    """Delete byte-identical duplicate files in each sub-directory of the root.

    Every immediate sub-directory of the hard-coded root folder is handled
    independently: the first file seen with a given MD5 digest is kept and
    any later file with the same digest is removed from disk. The file and
    deletion counts are printed per sub-directory, followed by the total
    elapsed time.
    """
    # Raw string so the Windows backslashes are taken literally.
    allfiles = r"F:\RenYongguo\cats_remove"
    start = now()
    for entry in os.listdir(allfiles):
        path = os.path.join(allfiles, entry)
        if not os.path.isdir(path):
            # A plain file in the root cannot be listed; skip it.
            continue
        print(path)
        seen_md5 = set()  # digests already kept in this sub-directory
        total_file = 0
        total_delete = 0
        for name in os.listdir(path):
            real_path = os.path.join(path, name)
            if not os.path.isfile(real_path):
                # Only regular files are hashed and counted.
                continue
            total_file += 1
            filemd5 = getmd5(real_path)
            if filemd5 in seen_md5:
                total_delete += 1
                os.remove(real_path)
            else:
                seen_md5.add(filemd5)
        # Single-argument print() parses on both Python 2 and Python 3.
        print(u'文件总数: %s' % total_file)
        print(u'删除个数: %s' % total_delete)
    # Measured after the loop so it is defined even when the root is empty.
    time_last = now() - start
    print(u'耗时: %s 秒' % time_last)


# Script entry point: run the de-duplication only when executed directly.
if __name__ == "__main__":
    main()

参考网址:http://www.cnblogs.com/ma6174/archive/2012/05/05/2484415.html

二、通过计算 SSIM 图片相似度去重(相似度大于 0.9 即视为重复)

# -*- coding: utf-8 -*-

import os
import cv2 as cv
from skimage.measure import compare_ssim

# File extensions used by find_sim_images to select the files to compare.
EXT = ['.jpg', '.jpeg']


def delete(imgs_n):
    """Remove from disk every file whose path is listed in ``imgs_n``."""
    remove_file = os.remove
    for doomed_path in imgs_n:
        remove_file(doomed_path)


def find_sim_images(dir_path):
    """Find near-duplicate images under ``dir_path``.

    Walks ``dir_path`` recursively, collects the files whose extension is
    listed in ``EXT`` and compares them pairwise. When the SSIM score of a
    pair is above 0.9, the later image of the pair is flagged as a duplicate
    and takes no part in further comparisons.

    Args:
        dir_path: Root directory to scan.

    Returns:
        List of the flagged file paths, in the order they were found.
    """
    img_files = [
        os.path.join(rootdir, name)
        for rootdir, _, names in os.walk(dir_path)
        for name in names
        # Compare extensions case-insensitively so '.JPG' is picked up too.
        if os.path.splitext(name)[-1].lower() in EXT
    ]
    imgs_n = []
    flagged = set()  # same paths as imgs_n, kept for O(1) membership tests
    for index, filename in enumerate(img_files):
        if filename in flagged:
            continue
        # Load the reference image once instead of once per compared pair.
        img = cv.imread(filename)
        if img is None:
            # cv.imread signals an unreadable file by returning None rather
            # than raising; passing None on fails with AttributeError.
            continue
        for filename2 in img_files[index + 1:]:
            if filename2 in flagged:
                continue
            img1 = cv.imread(filename2)
            if img1 is None:
                continue
            try:
                ssim = compare_ssim(img, img1, multichannel=True)
            except ValueError:
                # compare_ssim rejects pairs it cannot compare (e.g. images
                # of different dimensions); treat them as not similar.
                continue
            if ssim > 0.9:
                imgs_n.append(filename2)
                flagged.add(filename2)
                print(filename, filename2, ssim)
    print(imgs_n)
    return imgs_n


if __name__ == '__main__':
    # Raw string so the Windows backslashes are taken literally.
    # WARNING: every image reported as a duplicate is permanently deleted.
    path = r'E:\OCRdata_IBM\new'
    delete(find_sim_images(path))

遇到的问题:运行时报错 AttributeError: 'NoneType' object has no attribute 'shape'。
原因:cv.imread(filename) 读取图片失败时返回了 None。
运行环境:Win10、Python 3.6。
解决办法:去除路径中的数字后问题消失。

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值