爬虫获取的图片经常会有不同名但相同或相似的情况,因此需要对图片进行去重。
以下介绍两种方法。
一、图片通过md5去重
import md5
import os
from time import clock as now
def getmd5(filename):
    """Return the hexadecimal MD5 digest of a file's contents.

    Args:
        filename: Path of the file to hash.

    Returns:
        The digest as a 32-character lowercase hexadecimal string.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # Imported locally so this function no longer depends on the legacy
    # ``md5`` module, which is not available on Python 3.
    import hashlib

    digest = hashlib.md5()
    # ``with`` guarantees the handle is closed; the previous version leaked it.
    with open(filename, 'rb') as stream:
        # Hash in fixed-size chunks so large files are never held in memory
        # all at once.
        for chunk in iter(lambda: stream.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def main(allfiles=r"F:\RenYongguo\cats_remove"):
    """Delete byte-identical duplicate files inside each sub-directory.

    Each immediate sub-directory of ``allfiles`` is handled independently:
    the first file seen with a given MD5 digest is kept and every later file
    in the same sub-directory with that digest is removed from disk. After a
    sub-directory is processed, the number of files examined, the number
    deleted and the time elapsed since the run started are printed.

    NOTE(review): the listing this was restored from had lost its
    indentation; the summary is assumed to be printed once per sub-directory
    because the counters are reset per sub-directory -- confirm.

    Args:
        allfiles: Root directory whose sub-directories are de-duplicated.
            Defaults to the path that used to be hard-coded.

    Raises:
        OSError: If ``allfiles`` cannot be listed or a file cannot be
            read or removed.
    """
    # ``time.clock`` no longer exists on current Python versions, so take a
    # portable timer from the standard library instead.
    from timeit import default_timer

    start = default_timer()
    for entry in os.listdir(allfiles):
        path = os.path.join(allfiles, entry)
        # A plain file at the top level has nothing to list; skip it instead
        # of letting os.listdir() raise.
        if not os.path.isdir(path):
            continue
        print(path)

        seen_digests = set()  # set gives O(1) duplicate checks
        total_file = 0
        total_delete = 0
        for name in os.listdir(path):
            real_path = os.path.join(path, name)
            if not os.path.isfile(real_path):
                continue
            # Count only regular files so nested directories do not inflate
            # the reported total.
            total_file += 1
            filemd5 = getmd5(real_path)
            if filemd5 in seen_digests:
                total_delete += 1
                os.remove(real_path)
            else:
                seen_digests.add(filemd5)

        time_last = default_timer() - start
        # Single-argument print calls produce the same text on Python 2 and 3.
        print(u'文件总数: %s' % total_file)
        print(u'删除个数: %s' % total_delete)
        print(u'耗时: %s 秒' % time_last)
# Script entry point: run the MD5-based de-duplication only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
参考网址:http://www.cnblogs.com/ma6174/archive/2012/05/05/2484415.html
二、计算SSIM图片相似度
# -*- coding: utf-8 -*-
import os
import cv2 as cv
from skimage.measure import compare_ssim
# File extensions (leading dot included) that mark a file as an image to
# compare; the check against this list is case-sensitive.
EXT = ['.jpg', '.jpeg']
def delete(imgs_n):
    """Remove every file listed in ``imgs_n`` from disk.

    Args:
        imgs_n: Iterable of file paths to delete, processed in order.

    Raises:
        OSError: Propagated from ``os.remove`` when a path cannot be removed;
            paths after the failing one are left untouched.
    """
    remove_file = os.remove
    for doomed_path in imgs_n:
        remove_file(doomed_path)
def find_sim_images(dir_path, threshold=0.9):
    """Find images under ``dir_path`` that closely resemble an earlier image.

    The directory tree is walked recursively and every file whose extension
    is listed in ``EXT`` is compared pairwise with the files that follow it.
    When the SSIM score of a pair exceeds ``threshold`` the later file is
    recorded as a duplicate and is not compared again.

    Args:
        dir_path: Root directory to scan.
        threshold: SSIM score above which two images count as similar.
            Defaults to 0.9, the value that used to be hard-coded.

    Returns:
        List of paths of the images judged to be duplicates, in the order
        they were detected. The list is also printed before returning.
    """
    img_files = [
        os.path.join(rootdir, name)
        for rootdir, _, files in os.walk(dir_path)
        for name in files
        if os.path.splitext(name)[-1] in EXT
    ]

    imgs_n = []
    flagged = set()  # mirrors imgs_n so membership checks are O(1)
    for index, filename in enumerate(img_files):
        if filename in flagged:
            continue
        # Load the reference image once per outer iteration rather than once
        # per pair.
        img = cv.imread(filename)
        if img is None:
            # cv.imread returned None, which would otherwise surface as an
            # AttributeError inside compare_ssim; skip the unreadable file.
            continue
        for filename2 in img_files[index + 1:]:
            if filename2 in flagged:
                continue
            img1 = cv.imread(filename2)
            if img1 is None:
                continue
            try:
                ssim = compare_ssim(img, img1, multichannel=True)
            except ValueError:
                # Best effort: pairs that compare_ssim rejects are treated as
                # not similar (presumably images of differing dimensions --
                # TODO confirm against the scikit-image documentation).
                continue
            if ssim > threshold:
                imgs_n.append(filename2)
                flagged.add(filename2)
                print(filename, filename2, ssim)
    print(imgs_n)
    return imgs_n
# Script entry point: find near-duplicate images under ``path`` and delete them.
if __name__ == '__main__':
    # Raw string keeps the same path value while avoiding the invalid ``\O``
    # escape sequence that the previous literal relied on.
    path = r'E:\OCRdata_IBM\new'
    delete(find_sim_images(path))
遇到问题:AttributeError: 'NoneType' object has no attribute 'shape'
原因是 cv.imread(filename) 返回了 None。
运行环境为win10、python3.6。
解决:去除路径中的数字就好了。