python多线程复制指定文件夹

__尹天仇__

已于 2024-01-07 21:33:22 修改

阅读量600

点赞数 7

文章标签： python

于 2024-01-06 00:30:13 首次发布

本文链接：https://blog.csdn.net/lshgax/article/details/135420231

版权

文章介绍了如何使用Python编写代码，通过构建任务队列和线程池来高效地复制硬盘上的指定文件夹，尤其针对包含大量小文件的情况，实测速度比Windows资源管理器更快。作者还分享了WindowsDefender对复制过程的影响和优化建议。

摘要由CSDN通过智能技术生成

python多线程复制指定文件夹（包含大量小文件）

需求
实现
总结

需求

硬盘-1里有一个源文件夹（记作srcDir），里面有大量的子文件夹，每个子文件夹是一个样本，以样本名称命名，一个样本包含大量小文件（300-500个300-500KB的文件）。经过数据筛选，得到了一个纳入样本列表，保存在included.xlsx文件的sampleName列，sampleName列的每一行就是srcDir下的子文件夹。现需要将纳入的样本复制到硬盘-2的dstDir中。

经过简单的搜索，并未发现可以简单实现此需求的软件，遂决定自己写一个。

实现

构建任务队列

输入srcDir、dstDir、included.xlsx和targetColumn，先从included.xlsx中的targetColumn读取需要复制的样本（即srcDir的子文件夹），拼接源路径以及目标路径，保存到tasks列表中。

# python版本：3.11.5

import sys
import pandas as pd
from pathlib import Path

def generate_copy_tasks(sheet: pd.DataFrame, src_dir: Path, dst_dir: Path, column: str) -> list[tuple[Path, Path]]:
    """Build the list of (source, destination) folder pairs to copy.

    Args:
        sheet: Table whose ``column`` holds one sub-folder name per row.
        src_dir: Root folder that contains the sub-folders to copy.
        dst_dir: Root folder the sub-folders are copied into.
        column: Name of the column in ``sheet`` listing the sub-folder names.

    Returns:
        One ``(src_dir / name, dst_dir / name)`` tuple per non-missing cell,
        in row order. An empty list when ``column`` is not in ``sheet``.
    """
    print(f'开始构建复制任务')
    if column not in sheet.columns:
        print(f'{column} 不在excel中')
        return []
    # Missing cells (NaN/None) are dropped so they never become paths.
    names = sheet[column].dropna()
    tasks = [(src_dir / name, dst_dir / name) for name in names]
    # The original called len() on an int (shape[0]), which raised TypeError.
    print(f'构建复制任务完成，共{len(tasks)}个任务')
    return tasks

    
    # Load the spreadsheet listing the samples to copy; dtype=str reads every cell as text.
    excel_path = Path(r'D:/include.xlsx')
    sheet = pd.read_excel(excel_path, dtype=str)
    # Source and destination root folders.
    src_dir = Path('D:/srcdir/')
    dst_dir = Path('E:/dstdir/')
    # Build the copy tasks from the 'sampleName' column.
    tasks = generate_copy_tasks(sheet, src_dir, dst_dir, 'sampleName')
    if not tasks:
        # Nothing to copy: exit successfully.
        print(f'没有复制任务')
        sys.exit(0)

多线程执行任务

构建好任务队列后，创建线程池，将任务队列提交到线程池中，等待所有线程结束，退出程序。

# python版本：3.11.5

import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


def copy_folder(src_path: Path, dst_path: Path):
    """Copy one folder tree; intended to run inside a worker thread.

    If ``dst_path`` is already a directory whose top-level entries match the
    source by name and size, the copy is skipped. Otherwise the tree is
    copied over whatever is already there. The comparison is shallow: it
    only inspects the direct children of each folder.

    Args:
        src_path: Folder to copy. Must be an existing directory.
        dst_path: Destination folder; created if it does not exist.

    Returns:
        The source folder's name (last path component), whether the copy
        succeeded, was skipped, or failed. Failures are printed, not raised.
    """
    # .name keeps the whole folder name; .stem would cut "sample.v1" to "sample".
    folder_name = src_path.name

    if not src_path.is_dir():
        print(f'{src_path} 不存在或不是文件夹\n')
        return folder_name

    try:
        if dst_path.is_dir():
            # The destination may be left over from an interrupted copy.
            dst_entries = sorted((entry.name, entry.stat().st_size) for entry in dst_path.iterdir())
            src_entries = sorted((entry.name, entry.stat().st_size) for entry in src_path.iterdir())
            if src_entries == dst_entries:
                # Already fully copied: skip.
                return folder_name
        shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
    except Exception as e:
        # Worker boundary: report and carry on so one bad folder does not
        # stop the batch. No return-in-finally, so KeyboardInterrupt and
        # SystemExit still propagate.
        print(f'复制失败, {e}')
    return folder_name


def multi_thread_copy(tasks: list[tuple[Path, Path]], thread_counts: int):
    """Copy every task's folder with a thread pool, then verify the results.

    Args:
        tasks: ``(source_folder, destination_folder)`` pairs.
        thread_counts: Requested number of worker threads; clamped to the
            range ``1..len(tasks)``.

    Returns:
        True when every destination exists and its top-level entries match
        the source by name and size; False otherwise.
    """
    thread_counts = max(1, min(thread_counts, len(tasks)))
    with ThreadPoolExecutor(thread_counts) as executor:
        # Map each future to its source folder so a failure can be attributed.
        futures = {
            executor.submit(copy_folder, src_path, dst_path): src_path
            for src_path, dst_path in tasks
        }

        process_bar = tqdm(as_completed(futures), desc='复制进度', total=len(futures))
        for future in process_bar:
            try:
                r = future.result()    # Folder name returned by the worker.
            except Exception as e:
                # One failed worker must not abort the rest; the check below
                # will report this folder as missing or incomplete.
                print(f'{futures[future].name} 复制失败, {e}')
                continue
            process_bar.set_postfix_str(f'{r} Done')    # Update the progress-bar suffix.

    # Verify every task after all workers have finished.
    print(f'正在检查复制结果...')
    all_done = True
    for src_path, dst_path in tasks:
        if not dst_path.exists():
            print(f'{src_path.name} 未复制')
            all_done = False
            # Nothing to compare; listing a missing folder would raise.
            continue
        try:
            dst_files = sorted((file.name, file.stat().st_size) for file in dst_path.iterdir())
            src_files = sorted((file.name, file.stat().st_size) for file in src_path.iterdir())
        except OSError as e:
            print(f'{src_path.name} 检查失败, {e}')
            all_done = False
            continue
        if src_files != dst_files:
            print(f'{src_path.name} 未完整复制')
            all_done = False
    print(f'检查完成')
    return all_done

完整代码

# python版本：3.11.5

import shutil
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def generate_copy_tasks(sheet: pd.DataFrame, src_dir: Path, dst_dir: Path, column: str) -> list[tuple[Path, Path]]:
    """Build the list of (source, destination) folder pairs to copy.

    Args:
        sheet: Table whose ``column`` holds one sub-folder name per row.
        src_dir: Root folder that contains the sub-folders to copy.
        dst_dir: Root folder the sub-folders are copied into.
        column: Name of the column in ``sheet`` listing the sub-folder names.

    Returns:
        One ``(src_dir / name, dst_dir / name)`` tuple per non-missing cell,
        in row order. An empty list when ``column`` is not in ``sheet``.
    """
    print(f'开始构建复制任务')
    if column not in sheet.columns:
        print(f'{column} 不在excel中')
        return []
    # Missing cells (NaN/None) are dropped so they never become paths.
    names = sheet[column].dropna()
    tasks = [(src_dir / name, dst_dir / name) for name in names]
    # The original called len() on an int (shape[0]), which raised TypeError.
    print(f'构建复制任务完成，共{len(tasks)}个任务')
    return tasks


def copy_folder(src_path: Path, dst_path: Path):
    """Copy one folder tree; intended to run inside a worker thread.

    If ``dst_path`` is already a directory whose top-level entries match the
    source by name and size, the copy is skipped. Otherwise the tree is
    copied over whatever is already there. The comparison is shallow: it
    only inspects the direct children of each folder.

    Args:
        src_path: Folder to copy. Must be an existing directory.
        dst_path: Destination folder; created if it does not exist.

    Returns:
        The source folder's name (last path component), whether the copy
        succeeded, was skipped, or failed. Failures are printed, not raised.
    """
    # .name keeps the whole folder name; .stem would cut "sample.v1" to "sample".
    folder_name = src_path.name

    if not src_path.is_dir():
        print(f'{src_path} 不存在或不是文件夹\n')
        return folder_name

    try:
        if dst_path.is_dir():
            # The destination may be left over from an interrupted copy.
            dst_entries = sorted((entry.name, entry.stat().st_size) for entry in dst_path.iterdir())
            src_entries = sorted((entry.name, entry.stat().st_size) for entry in src_path.iterdir())
            if src_entries == dst_entries:
                # Already fully copied: skip.
                return folder_name
        shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
    except Exception as e:
        # Worker boundary: report and carry on so one bad folder does not
        # stop the batch. No return-in-finally, so KeyboardInterrupt and
        # SystemExit still propagate.
        print(f'复制失败, {e}')
    return folder_name


def multi_thread_copy(tasks: list[tuple[Path, Path]], thread_counts: int):
    """Copy every task's folder with a thread pool, then verify the results.

    Args:
        tasks: ``(source_folder, destination_folder)`` pairs.
        thread_counts: Requested number of worker threads; clamped to the
            range ``1..len(tasks)``.

    Returns:
        True when every destination exists and its top-level entries match
        the source by name and size; False otherwise.
    """
    thread_counts = max(1, min(thread_counts, len(tasks)))
    with ThreadPoolExecutor(thread_counts) as executor:
        # Map each future to its source folder so a failure can be attributed.
        futures = {
            executor.submit(copy_folder, src_path, dst_path): src_path
            for src_path, dst_path in tasks
        }

        process_bar = tqdm(as_completed(futures), desc='复制进度', total=len(futures))
        for future in process_bar:
            try:
                r = future.result()    # Folder name returned by the worker.
            except Exception as e:
                # One failed worker must not abort the rest; the check below
                # will report this folder as missing or incomplete.
                print(f'{futures[future].name} 复制失败, {e}')
                continue
            process_bar.set_postfix_str(f'{r} Done')    # Update the progress-bar suffix.

    # Verify every task after all workers have finished.
    print(f'正在检查复制结果...')
    all_done = True
    for src_path, dst_path in tasks:
        if not dst_path.exists():
            print(f'{src_path.name} 未复制')
            all_done = False
            # Nothing to compare; listing a missing folder would raise.
            continue
        try:
            dst_files = sorted((file.name, file.stat().st_size) for file in dst_path.iterdir())
            src_files = sorted((file.name, file.stat().st_size) for file in src_path.iterdir())
        except OSError as e:
            print(f'{src_path.name} 检查失败, {e}')
            all_done = False
            continue
        if src_files != dst_files:
            print(f'{src_path.name} 未完整复制')
            all_done = False
    print(f'检查完成')
    return all_done


if __name__ == '__main__':
    # Script entry point: read the sample list, copy the listed folders with
    # 8 worker threads, then report the outcome and the elapsed time.
    s_t = time.time()
    # Load the spreadsheet listing the samples to copy; dtype=str reads every cell as text.
    excel_path = Path(r'D:/include.xlsx')
    sheet = pd.read_excel(excel_path, dtype=str)
    # Source and destination root folders.
    src_dir = Path('D:/srcdir/')
    dst_dir = Path('E:/dstdir/')
    # Build the copy tasks from the 'sampleName' column.
    tasks = generate_copy_tasks(sheet, src_dir, dst_dir, 'sampleName')
    if not tasks:
        # Nothing to copy: exit successfully.
        print(f'没有复制任务')
        sys.exit(0)
    # Copy the folders.
    all_done = multi_thread_copy(tasks, 8)
    if all_done:
        print(f'全部复制完成')
    else:
        print(f'存在未完成的复制任务')
    e_t = time.time()
    print(f'总耗时：{e_t-s_t:.2f}s')

效果

实测速度明显优于Windows资源管理器直接复制，与知名软件FastCopy不相上下（甚至略胜一筹，不过CPU占用也明显“胜过”FastCopy😂）。（仅代表本人的测试环境、测试数据（大量小文件）及测试配置下的表现，不具备普适性🐶）

补充截图

注：已关闭 Windows Defender 实时保护

SSD → SSD

Windows资源管理器，SSD → SSD：23秒

本程序SSD → SSD表现
请添加图片描述

SSD → HDD

Windows自带资源管理器，SSD → HDD：130秒

本程序SSD → HDD表现
FastCopy SSD → HDD表现

some tips

大量复制文件时，Windows Defender 的 Antimalware Service Executable 会实时扫描新写入的文件，导致写入速度变慢。如果发现写入速度很慢、Antimalware Service Executable 占用大量CPU，可以尝试关闭 Windows Defender 的实时保护。实测在两个SSD之间运行上述脚本时，如果未关闭实时保护，Antimalware Service Executable 的CPU占用可达50%；关闭之后它不再占用CPU，复制速度也大幅提升。
线程数也不是越多越好，16线程基本上可以跑满硬盘的IO，太多适得其反。