import os
import hashlib
import shutil
from collections import defaultdict
def get_file_hash(file_path, block_size=2**20):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in *block_size*-byte chunks (default 1 MiB), so
    arbitrarily large files are hashed without loading them into memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        while chunk := stream.read(block_size):
            digest.update(chunk)
    return digest.hexdigest()
def find_and_remove_duplicates(directory, keep_one=True):
    """Delete files with identical content found under *directory*.

    Files are first grouped by size (cheap stat call); only groups with
    more than one same-size file are hashed with SHA-256, so files of a
    unique size are never read.

    Args:
        directory: Root directory to scan recursively.
        keep_one: When True (default), the lexicographically first path
            of each duplicate group is preserved and the rest removed;
            when False, every member of the group is removed.

    Files that cannot be stat'ed, read, or removed (broken symlinks,
    permission errors) are reported and skipped instead of aborting
    the whole scan.
    """
    files_by_size = defaultdict(list)
    for root, _dirs, names in os.walk(directory):
        for name in names:
            path = os.path.join(root, name)
            try:
                size = os.path.getsize(path)
            except OSError as e:
                # Broken symlink / permission problem: skip this file only.
                print(f"Error: {e.strerror} : {path}")
                continue
            files_by_size[size].append(path)

    files_by_hash = defaultdict(list)
    for paths in files_by_size.values():
        if len(paths) < 2:
            continue  # a file with a unique size cannot be a duplicate
        for path in paths:
            try:
                files_by_hash[get_file_hash(path)].append(path)
            except OSError as e:
                print(f"Error: {e.strerror} : {path}")

    duplicates_to_remove = set()
    for group in files_by_hash.values():
        if len(group) > 1:
            # Sort so the kept file is deterministic across runs
            # (the original kept an arbitrary set element).
            ordered = sorted(group)
            duplicates_to_remove.update(ordered[1:] if keep_one else ordered)

    for path in duplicates_to_remove:
        try:
            os.remove(path)
            print(f"Removed duplicate file: {path}")
        except OSError as e:
            print(f"Error: {e.strerror} : {path}")
if __name__ == "__main__":
    # Guarded entry point: previously this destructive call ran as a
    # module-import side effect; now it runs only when executed as a script.
    find_and_remove_duplicates('D:/01-学习/00-新资料/01-数学/常微分方程')
# NOTE(review): stray non-Python artifact ("- List item") commented out — bare text is a syntax error.