整理数据时,发现有些文件虽然文件名不同,但内容完全相同。因此使用MD5值进行查重,然后通过os.system调用shell命令(rm)删除多余的重复文件。
#code by yangdaxia
import hashlib
import os
from collections import Counter
def getFileMD5(filepath):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and hashed in fixed-size chunks, so a
    large file is never loaded into memory all at once, and the handle is
    closed even if reading fails part-way through.

    :param filepath: path of the file to hash
    :return: the MD5 digest as a 32-character lowercase hex string
    :raises OSError: if the file cannot be opened or read
    """
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object that
        # read() returns at end of file.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest()
def checkRemove(path):
    """Find files in ``path`` with identical contents and delete the extras.

    Every regular file directly inside ``path`` is hashed with
    ``getFileMD5``. For each digest shared by more than one file, the list of
    file names in that group is appended as one line to ``chongfu.txt`` in
    the current working directory; the first file of the group (in
    ``os.listdir`` order) is kept and every other file in it is deleted.

    :param path: directory to scan (not recursive); a trailing path
        separator is optional
    :return: None
    """
    # Group file names by digest in a single pass instead of rescanning the
    # whole name -> digest mapping for every duplicated digest.
    md5_to_files = {}
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        # os.listdir also returns sub-directories, which cannot be opened
        # for hashing; only regular files are considered.
        if not os.path.isfile(full_path):
            continue
        md5_to_files.setdefault(getFileMD5(full_path), []).append(name)
    print('构建文件与MD5映射完毕!')
    print('查重中......')
    for key, chongfu in md5_to_files.items():
        if len(chongfu) <= 1:
            continue
        print(key, len(chongfu))
        print(chongfu)
        with open('chongfu.txt', 'a') as fw:
            fw.write(str(chongfu)[1:-1] + '\n')
        # Keep the first copy and remove all the others, so groups of three
        # or more identical files are fully deduplicated in one run.
        for name in chongfu[1:]:
            rmf = os.path.join(path, name)
            print(rmf)
            try:
                # os.remove takes the path verbatim, so names containing
                # spaces or shell metacharacters are handled safely.
                os.remove(rmf)
            except OSError as err:
                # Report the failure and carry on with the remaining files.
                print(err)
if __name__ == '__main__':
    # Script entry point: deduplicate the files in this image directory.
    checkRemove('/mnt/lustrenew/dataset/test/Image/guonei_test/Images/')