前言
最近整理群晖 NAS 上的文件时,发现影音库里有很多重复的资源,于是自己写了一个 Python 脚本来自动处理。
import os
import hashlib
# Only files with these extensions are checked for duplicates; add more
# entries if you want other types handled.  Stored as a set for O(1)
# membership tests.  BUGFIX: 'mpeg', 'mpg' and 'img' were missing the
# leading dot, so they could never match the output of
# os.path.splitext() (which always includes the dot).
file_type = {'.jpg', '.jpeg', '.png', '.gif', '.psd', '.bmp', '.webp',
             '.mp4', '.mkv', '.avi', '.mov', '.mpeg', '.mpg',
             '.rar', '.zip', '.img'}
# Accumulates the absolute paths of every candidate file found by the scan.
check_files = []
def remove_reapt_files(root_dir=r'/Users/yuqianjun/Downloads/'):
    """Find and delete duplicate files under ``root_dir``.

    Walks ``root_dir`` recursively, collects every file whose extension is
    listed in the module-level ``file_type``, groups files by the MD5 digest
    of their contents, and whenever two files share a digest deletes the
    copy with the *later* ``st_ctime`` (the copy with the earliest ctime is
    kept).

    Args:
        root_dir: Directory to scan. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Side effects:
        Deletes files on disk, appends candidate paths to the module-level
        ``check_files`` list, and prints progress to stdout.
    """
    # Phase 1: collect candidate paths.  Results go into the module-level
    # ``check_files`` list to preserve the original module interface.
    # NOTE: the original version also matched *directory* names against
    # ``file_type`` and queued them for hashing; directories cannot be
    # opened or os.remove()d, so that branch only "worked" because a
    # blanket except swallowed the errors.  It has been dropped.
    for root, dirs, files in os.walk(root_dir):
        for name in files:
            ext = os.path.splitext(name)[1]
            if ext in file_type:
                check_files.append(os.path.join(root, name))

    files_dict = {}  # md5 hexdigest -> path of the copy currently kept
    r_index = 0      # number of duplicate files deleted
    print('Files Num:%s' % len(check_files))
    for value in check_files:
        md5_hash = hashlib.md5()
        try:
            # "rb", not "rb+": we only read, so read-only files work too.
            with open(value, "rb") as f:
                for byte_block in iter(lambda: f.read(4096), b""):
                    md5_hash.update(byte_block)
            file_md5 = md5_hash.hexdigest()
            print('Check file MD5:%s' % value)
            if files_dict.get(file_md5) is None:
                # First time we see this content: remember it.
                files_dict[file_md5] = value
            else:
                # Duplicate content: keep the copy with the earlier ctime.
                d_path = files_dict[file_md5]
                if os.stat(d_path).st_ctime > os.stat(value).st_ctime:
                    os.remove(d_path)
                    files_dict[file_md5] = value
                    print('Delete File:', d_path)
                else:
                    os.remove(value)
                    print('Delete File:', value)
                r_index += 1
        except OSError:
            # Narrowed from a silent `except Exception: pass`: only
            # filesystem errors (file vanished between walk and open/stat/
            # remove, permission denied, ...) are expected here.
            print('File does not exist or has been deleted')
    print('File Count:%s, Repeat Files Num:%s. All deleted!' % (len(check_files), str(r_index)))
# Script entry point: kick off the duplicate scan only when this file is
# executed directly, never on import.
if __name__ == '__main__':
    remove_reapt_files()