import os
import hashlib
import shutil
from collections import defaultdict
def get_file_hash(file_path, block_size=2**20):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in *block_size*-byte chunks (default 1 MiB), so
    arbitrarily large files are hashed without loading them into memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        while chunk := stream.read(block_size):
            digest.update(chunk)
    return digest.hexdigest()
def find_and_remove_duplicates(directory, keep_one=True):
    """Delete files with identical content found under *directory*.

    Files are first grouped by size (cheap stat call); only groups with
    more than one same-size file are hashed with SHA-256, so files of a
    unique size are never read.

    Args:
        directory: Root directory to scan recursively.
        keep_one: When True (default), the lexicographically first path
            of each duplicate group is preserved and the rest removed;
            when False, every member of the group is removed.

    Files that cannot be stat'ed, read, or removed (broken symlinks,
    permission errors) are reported and skipped instead of aborting
    the whole scan.
    """
    files_by_size = defaultdict(list)
    for root, _dirs, names in os.walk(directory):
        for name in names:
            path = os.path.join(root, name)
            try:
                size = os.path.getsize(path)
            except OSError as e:
                # Broken symlink / permission problem: skip this file only.
                print(f"Error: {e.strerror} : {path}")
                continue
            files_by_size[size].append(path)

    files_by_hash = defaultdict(list)
    for paths in files_by_size.values():
        if len(paths) < 2:
            continue  # a file with a unique size cannot be a duplicate
        for path in paths:
            try:
                files_by_hash[get_file_hash(path)].append(path)
            except OSError as e:
                print(f"Error: {e.strerror} : {path}")

    duplicates_to_remove = set()
    for group in files_by_hash.values():
        if len(group) > 1:
            # Sort so the kept file is deterministic across runs
            # (the original kept an arbitrary set element).
            ordered = sorted(group)
            duplicates_to_remove.update(ordered[1:] if keep_one else ordered)

    for path in duplicates_to_remove:
        try:
            os.remove(path)
            print(f"Removed duplicate file: {path}")
        except OSError as e:
            print(f"Error: {e.strerror} : {path}")
if __name__ == "__main__":
    # Guarded entry point: previously this destructive call ran as a
    # module-import side effect; now it runs only when executed as a script.
    find_and_remove_duplicates('D:/01-学习/00-新资料/01-数学/常微分方程')
# NOTE(review): stray non-Python artifact ("- List item") commented out — bare text is a syntax error.