《Python多线程下载阿里云OSS文件实战教程：提升下载效率的最佳方案》

原创于 2025-06-07 13:09:27 发布

· 624 阅读

13 ·

版权

文章标签：

#python #阿里云 #数据库

需求背景

因为客户自己无力运维，将服务器维护等工作交给了我们，
所以需要将数据迁移到我们的阿里云账号下，OSS里的文件有30多个G。
阿里官方提供的软件包用起来不太方便，
1.0版本的那个软件虽然可以全选文件，但文件太多的时候，软件会卡死，而且线程太少，效率低
2.0版本那个软件，虽然使用体验好了一点，但无法全选文件进行下载。
而且我们也打算对附件进行压缩，达到【节约流量费】和【用户访问加速】的效果。

所以就自己写一个批量下载工具吧

所需库内容如下
requirements.txt

aliyun-python-sdk-core==2.16.0
aliyun-python-sdk-kms==2.16.5
certifi==2025.4.26
cffi==1.17.1
charset-normalizer==3.4.2
crcmod==1.7
cryptography==45.0.3
idna==3.10
jmespath==0.10.0
oss2==2.19.1
pycparser==2.22
pycryptodome==3.23.0
python-dotenv==1.1.0
requests==2.32.3
six==1.17.0
urllib3==1.26

请自行新建.env文件
格式如下

OSS_ACCESS_KEY_ID=阿里云AccessKey ID
OSS_ACCESS_KEY_SECRET=阿里云AccessKey Secret
OSS_BUCKET_NAME=OSS存储桶（Bucket）名称
OSS_ENDPOINT=存储桶所属地域的Endpoint

执行代码如下：


import oss2
from concurrent.futures import ThreadPoolExecutor
import time
import sys
import math
import os
from dotenv import load_dotenv
load_dotenv()



# Connection settings are read from the process environment, which
# load_dotenv() above has populated from the local .env file.
# os.getenv returns None for any variable that is not set.
access_key_id = os.getenv('OSS_ACCESS_KEY_ID')
access_key_secret = os.getenv('OSS_ACCESS_KEY_SECRET')
bucket_name = os.getenv('OSS_BUCKET_NAME')
endpoint = os.getenv('OSS_ENDPOINT')


# Initialise the OSS client; `bucket` is the handle used for listing and
# downloading objects in the rest of the script.
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, endpoint, bucket_name)

# Local directory that downloaded objects are stored under
LOCAL_DIR = './osss'

def ensure_directory(path):
    """Create ``path`` (and any missing parents); do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def format_size(bytes_size):
    """Format a byte count as a human-readable string with two decimals.

    Args:
        bytes_size: Size in bytes (a non-negative number).

    Returns:
        "0B" for zero; otherwise the value scaled to the largest fitting unit
        among B/KB/MB/GB/TB, e.g. "1.50KB". Sizes of 1024 TB and above stay
        in TB instead of running past the end of the unit table.

    Raises:
        ValueError: If ``bytes_size`` is negative (``math.log`` rejects it).
    """
    if bytes_size == 0:
        return "0B"
    units = ("B", "KB", "MB", "GB", "TB")
    last = len(units) - 1
    i = int(math.floor(math.log(bytes_size, 1024)))
    # Clamp to the unit table: values >= 1024**5 would otherwise raise
    # IndexError, and fractional values < 1 would produce a negative index
    # that silently selects "TB".
    i = max(0, min(i, last))
    # math.log works in floating point, so the exponent can land one step off
    # right at a unit boundary; correct it with exact integer comparisons.
    if i < last and bytes_size >= 1024 ** (i + 1):
        i += 1
    elif i > 0 and bytes_size < 1024 ** i:
        i -= 1
    return f"{bytes_size / 1024 ** i:.2f}{units[i]}"

def download_file(obj):
    """Download a single OSS object into LOCAL_DIR, using its key as the relative path.

    Args:
        obj: An entry from the bucket listing; only its ``key`` attribute is used.

    Returns:
        A ``(success, remote_path, file_size)`` tuple:

        * ``(True, key, size)``: the object was downloaded; ``size`` is the
          size in bytes of the local file.
        * ``(False, key, size)``: a local file already existed and the
          download was skipped; ``size`` is the size of that existing file.
          An existing zero-byte file therefore cannot be told apart from a
          failure by the tuple alone.
        * ``(False, key, 0)``: the download failed; the error has been printed.
    """
    remote_path = obj.key
    local_path = os.path.join(LOCAL_DIR, remote_path)

    # Skip files that already exist (adjust this policy if needed). Report the
    # existing size so the caller can distinguish "skipped" from "failed".
    if os.path.exists(local_path):
        return False, remote_path, os.path.getsize(local_path)

    # Download to a temporary sibling and rename on success, so an interrupted
    # or failed transfer never leaves a truncated file at the final path that
    # a later run would skip as "already downloaded".
    temp_path = local_path + '.downloading'
    try:
        # Kept inside the try so a directory-creation error is reported as a
        # failed download instead of escaping from the worker thread.
        ensure_directory(os.path.dirname(local_path))
        bucket.get_object_to_file(remote_path, temp_path)
        os.replace(temp_path, local_path)
        file_size = os.path.getsize(local_path)
        return True, remote_path, file_size
    except Exception as e:
        print(f"\n下载失败: {remote_path} - {str(e)}")
        # Best-effort cleanup of the partial temporary file; it may not exist.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return False, remote_path, 0

def download_all_files(max_workers=8, batch_size=1000):
    """Download every object in the bucket into LOCAL_DIR (paged listing + thread pool).

    The bucket is listed one page at a time. Objects whose local file already
    exists are counted as skipped; the rest of the page is handed to the
    thread pool and awaited before the next page is requested. A one-line
    progress display is refreshed while results come in, and a summary is
    printed at the end.

    Args:
        max_workers: Number of download threads.
        batch_size: Maximum number of objects requested per listing page.
    """
    ensure_directory(LOCAL_DIR)

    total_files = 0
    downloaded_files = 0
    total_size = 0
    skipped_files = 0
    start_time = time.time()

    def show_progress():
        """Rewrite the single-line progress display in place (leading '\\r')."""
        elapsed = time.time() - start_time
        sys.stdout.write(
            f"\r处理进度: {total_files} 文件 | "
            f"已下载: {downloaded_files} | "
            f"跳过: {skipped_files} | "
            f"大小: {format_size(total_size)} | "
            f"耗时: {elapsed:.1f}s | "
            f"速度: {downloaded_files / max(1, elapsed):.1f}文件/s"
        )
        sys.stdout.flush()

    print(f"开始下载OSS存储桶内容到: {os.path.abspath(LOCAL_DIR)}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        next_marker = ''
        while True:
            # Fetch one page of the object listing
            result = bucket.list_objects(
                max_keys=batch_size,
                marker=next_marker
            )

            if not result.object_list:
                break

            # Queue the downloads for this page
            futures = []
            for obj in result.object_list:
                if obj.key.endswith('/'):  # skip directory placeholder keys
                    continue
                total_files += 1

                # Account for already-present files here: a (False, key, 0)
                # result from download_file cannot be told apart from a
                # failure, which would misreport skipped files as failed.
                local_path = os.path.join(LOCAL_DIR, obj.key)
                if os.path.exists(local_path):
                    skipped_files += 1
                    total_size += os.path.getsize(local_path)
                    continue

                futures.append((obj.key, executor.submit(download_file, obj)))

            show_progress()

            # Collect the results of this page
            for finished, (key, future) in enumerate(futures, start=1):
                try:
                    success, remote_path, file_size = future.result()
                except Exception as e:
                    # A worker that raised must not abort the whole run;
                    # report it and count the object as failed.
                    print(f"\n下载失败: {key} - {str(e)}")
                    success, file_size = False, 0

                if success:
                    downloaded_files += 1
                    total_size += file_size
                elif file_size:
                    # Non-zero size without success: the worker found the file
                    # already present (it appeared after the check above).
                    skipped_files += 1
                    total_size += file_size
                # else: download failed; it is derived in the summary below.

                # Refresh while results arrive so the downloaded count and
                # size actually move during a page.
                if finished % 10 == 0:
                    show_progress()

            # Move on to the next page
            next_marker = result.next_marker
            if not next_marker:
                break

    # Final progress line and summary
    show_progress()
    print("\n\n下载统计:")
    print(f"- 总文件数: {total_files}")
    print(f"- 成功下载: {downloaded_files}")
    print(f"- 跳过文件: {skipped_files}")
    print(f"- 失败文件: {total_files - downloaded_files - skipped_files}")
    print(f"- 总数据量: {format_size(total_size)}")
    print(f"- 保存路径: {os.path.abspath(LOCAL_DIR)}")

if __name__ == "__main__":
    # Script entry point. Thread count and page size can be tuned to the
    # available network bandwidth.
    try:
        download_all_files(batch_size=1000, max_workers=16)
    except KeyboardInterrupt:
        print("\n用户中断下载")
    except Exception as exc:
        print(f"\n发生错误: {str(exc)}")

AI协助整理功能介绍

这段代码实现了从阿里云OSS（对象存储服务）下载所有文件到本地的功能，并通过多线程优化了下载速度。接下来，我会逐行解释代码的主要功能。

导入库和环境变量

import oss2
from concurrent.futures import ThreadPoolExecutor
import time
import sys
import math
import os
from dotenv import load_dotenv
load_dotenv()

oss2: 阿里云OSS的Python SDK，用于与OSS进行交互。
ThreadPoolExecutor: 用于创建一个线程池，方便执行多线程任务。
time: 提供时间相关的功能，比如测量下载速度。
sys: 用于向控制台输出进度信息。
math: 用于数学计算，特别是格式化文件大小。
os: 用于操作本地文件系统，检查文件是否存在和创建目录。
dotenv: 加载 .env 文件中的环境变量。

.env 文件中存储了阿里云的身份验证信息和桶名称等。

环境变量读取

# Read the OSS credentials, bucket name and endpoint from the environment
# (os.getenv returns None for a variable that is not set).
access_key_id = os.getenv('OSS_ACCESS_KEY_ID')
access_key_secret = os.getenv('OSS_ACCESS_KEY_SECRET')
bucket_name = os.getenv('OSS_BUCKET_NAME')
endpoint = os.getenv('OSS_ENDPOINT')

这四个变量分别从环境文件中读取阿里云OSS的访问密钥、存储桶名称以及终端节点。

初始化OSS客户端

# Build the credentials object, then the bucket handle used for all
# listing and download calls.
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, endpoint, bucket_name)

oss2.Auth：使用读取到的 access_key_id 和 access_key_secret 进行身份验证。
oss2.Bucket：连接到指定的存储桶 (bucket_name) 和指定的终端节点 (endpoint)，从而可以执行后续的OSS操作。

本地存储目录

# Local directory that downloaded objects are stored under
LOCAL_DIR = './osss'

设置本地存储的目录，这里是 ./osss，即当前目录下的 osss 文件夹。

确保目录存在

def ensure_directory(path):
    """Create ``path`` (and any missing parents); do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)

该函数用于检查并创建本地存储路径。如果路径已经存在，exist_ok=True 使得不会抛出错误。

文件大小格式化

def format_size(bytes_size):
    """Format a byte count as a human-readable string with two decimals.

    Args:
        bytes_size: Size in bytes (a non-negative number).

    Returns:
        "0B" for zero; otherwise the value scaled to the largest fitting unit
        among B/KB/MB/GB/TB, e.g. "1.50KB". Sizes of 1024 TB and above stay
        in TB instead of running past the end of the unit table.

    Raises:
        ValueError: If ``bytes_size`` is negative (``math.log`` rejects it).
    """
    if bytes_size == 0:
        return "0B"
    units = ("B", "KB", "MB", "GB", "TB")
    last = len(units) - 1
    i = int(math.floor(math.log(bytes_size, 1024)))
    # Clamp to the unit table: values >= 1024**5 would otherwise raise
    # IndexError, and fractional values < 1 would produce a negative index
    # that silently selects "TB".
    i = max(0, min(i, last))
    # math.log works in floating point, so the exponent can land one step off
    # right at a unit boundary; correct it with exact integer comparisons.
    if i < last and bytes_size >= 1024 ** (i + 1):
        i += 1
    elif i > 0 and bytes_size < 1024 ** i:
        i -= 1
    return f"{bytes_size / 1024 ** i:.2f}{units[i]}"

此函数用于将字节大小转换为人类可读的格式（如 KB, MB, GB）。使用对数函数计算文件的单位，并且保留两位小数。

下载单个文件

def download_file(obj):
    """Download a single OSS object into LOCAL_DIR, using its key as the relative path.

    Args:
        obj: An entry from the bucket listing; only its ``key`` attribute is used.

    Returns:
        A ``(success, remote_path, file_size)`` tuple:

        * ``(True, key, size)``: the object was downloaded; ``size`` is the
          size in bytes of the local file.
        * ``(False, key, size)``: a local file already existed and the
          download was skipped; ``size`` is the size of that existing file.
          An existing zero-byte file therefore cannot be told apart from a
          failure by the tuple alone.
        * ``(False, key, 0)``: the download failed; the error has been printed.
    """
    remote_path = obj.key
    local_path = os.path.join(LOCAL_DIR, remote_path)

    # Skip files that already exist (adjust this policy if needed). Report the
    # existing size so the caller can distinguish "skipped" from "failed".
    if os.path.exists(local_path):
        return False, remote_path, os.path.getsize(local_path)

    # Download to a temporary sibling and rename on success, so an interrupted
    # or failed transfer never leaves a truncated file at the final path that
    # a later run would skip as "already downloaded".
    temp_path = local_path + '.downloading'
    try:
        # Kept inside the try so a directory-creation error is reported as a
        # failed download instead of escaping from the worker thread.
        ensure_directory(os.path.dirname(local_path))
        bucket.get_object_to_file(remote_path, temp_path)
        os.replace(temp_path, local_path)
        file_size = os.path.getsize(local_path)
        return True, remote_path, file_size
    except Exception as e:
        print(f"\n下载失败: {remote_path} - {str(e)}")
        # Best-effort cleanup of the partial temporary file; it may not exist.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return False, remote_path, 0

download_file 用于下载单个文件，obj 是 OSS 中的一个对象。
先检查本地是否已存在该文件，如果存在则跳过下载。
使用 bucket.get_object_to_file 方法将文件从 OSS 下载到本地指定路径。
下载成功后返回文件大小，否则捕获异常并返回下载失败信息。

下载所有文件

def download_all_files(max_workers=8, batch_size=1000):
    """
    Download all files to the local disk (paged listing + multithreaded version).

    Args:
        max_workers: number of download threads
        batch_size: number of objects fetched per listing page
    """
    # Make sure the download root exists before anything is written under it
    ensure_directory(LOCAL_DIR)

    # Running counters and start timestamp used by the progress display
    # and the final summary
    total_files = 0
    downloaded_files = 0
    total_size = 0
    skipped_files = 0
    start_time = time.time()

download_all_files 是主函数，用于下载 OSS 存储桶中的所有文件。
使用 ThreadPoolExecutor 实现多线程下载，max_workers 控制最大线程数，batch_size 控制每次列出的文件数量。

显示进度

def show_progress():
    """Rewrite the one-line progress display in place (the line starts with '\\r')."""
    elapsed = time.time() - start_time
    # Clamp the divisor to one second so the rate is defined right after start.
    rate = downloaded_files / max(1, elapsed)
    line = (
        f"\r处理进度: {total_files} 文件 | "
        f"已下载: {downloaded_files} | "
        f"跳过: {skipped_files} | "
        f"大小: {format_size(total_size)} | "
        f"耗时: {elapsed:.1f}s | "
        f"速度: {rate:.1f}文件/s"
    )
    sys.stdout.write(line)
    sys.stdout.flush()

此函数实时更新下载进度，包括已下载的文件数、跳过的文件数、总大小、耗时和下载速度。

分页获取文件列表并下载

# The pool lives for the whole listing loop; leaving the `with` block waits
# for any outstanding worker threads.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # An empty marker starts the listing from the beginning
    next_marker = ''
    while True:
        # Fetch one page of the object listing
        result = bucket.list_objects(
            max_keys=batch_size,
            marker=next_marker
        )

        if not result.object_list:
            break

        # Queue the download tasks for this page
        futures = []
        for obj in result.object_list:
            if obj.key.endswith('/'):  # skip directory placeholder keys
                continue
            futures.append(executor.submit(download_file, obj))
            total_files += 1
            # Progress is refreshed per 10 *submitted* tasks, not per 10
            # completed downloads.
            if total_files % 10 == 0:
                show_progress()

        # Collect the results; the whole page is awaited before the next
        # page is listed.
        for future in futures:
            success, remote_path, file_size = future.result()
            if success:
                downloaded_files += 1
                total_size += file_size
            else:
                # NOTE(review): telling "failed" from "skipped" here relies on
                # download_file reporting a non-zero size for skipped files;
                # verify against its return values.
                if file_size == 0:  # download failed
                    pass
                else:  # file already exists
                    skipped_files += 1
                    total_size += file_size

        # Prepare the next page; an empty marker ends the loop
        next_marker = result.next_marker
        if not next_marker:
            break

使用分页获取 OSS 存储桶中的文件，每次获取 batch_size 个对象。
使用线程池下载文件，executor.submit 提交任务。
每提交10个下载任务就刷新一次进度显示（进度是在提交任务时刷新的，而不是在文件下载完成时刷新）。
next_marker 用于获取分页数据，直到所有文件都列出并下载完成。

输出最终进度

# Refresh the progress line one last time, then print the summary.
show_progress()
print("\n\n下载统计:")
print(f"- 总文件数: {total_files}")
print(f"- 成功下载: {downloaded_files}")
print(f"- 跳过文件: {skipped_files}")
# Failures are not counted directly; they are whatever was neither
# downloaded nor skipped.
print(f"- 失败文件: {total_files - downloaded_files - skipped_files}")
print(f"- 总数据量: {format_size(total_size)}")
print(f"- 保存路径: {os.path.abspath(LOCAL_DIR)}")

在下载完成后，输出最终的下载统计数据：文件总数、成功下载的文件数、跳过的文件数、失败文件数、总数据量和本地保存路径。

主程序

if __name__ == "__main__":
    # Script entry point: run the full download and report interruption or
    # any other error on the console.
    try:
        download_all_files(batch_size=1000, max_workers=16)
    except KeyboardInterrupt:
        print("\n用户中断下载")
    except Exception as exc:
        print(f"\n发生错误: {str(exc)}")