直接上代码,可以增加上传的速度,不然我几百万个文件要传到猴年马月。
实现方式是按遍历文件的顺序取模,把文件分配给不同的线程处理。缺点是上传过程中文件夹内的文件不能有新增或删除,否则分配就全乱了……后面再考虑其他方法吧。
# -*- coding: utf-8 -*-
import oss2
import sys
import os
from urllib import parse
import threading
# SECURITY NOTE: a primary-account AccessKey grants access to every API and is
# high risk; strongly prefer creating a RAM sub-account for API access and
# routine operations (https://ram.console.aliyun.com).
auth = oss2.Auth('xxx', 'xxx')
# Endpoint shown is the Chengdu region; fill in your bucket's actual region.
bucket = oss2.Bucket(auth, 'http://oss-cn-chengdu.aliyuncs.com', 'xxx')
# Local directory tree to upload into the bucket.
rootdir = 'D:/backup/aliyun-oss-python-sdk-master'
# Number of uploader threads; also the modulus used to assign files to threads.
maxThread = 10
def walk_files(path):
    """Recursively collect and return every file path under *path*."""
    collected = []
    for dirpath, _dirnames, filenames in os.walk(path):
        collected.extend(os.path.join(dirpath, name) for name in filenames)
    return collected
# Per-thread counters of files visited so far; index i is owned by thread i.
counter = []
def gci(filepath, i):
    """Recursively walk *filepath* and upload the files assigned to worker *i*.

    Every worker walks the full tree and keeps its own visit counter in
    counter[i]; a file is uploaded by worker *i* only when the running count
    modulo maxThread equals *i* (round-robin assignment). This partitions the
    files correctly only if every worker sees them in the same order, so the
    tree must not be modified while the upload runs.

    :param filepath: directory to walk recursively
    :param i: this worker's index in range(maxThread)
    """
    global counter, maxThread
    for name in os.listdir(filepath):
        full_path = os.path.join(filepath, name)
        if os.path.isdir(full_path):
            gci(full_path, i)
        else:
            counter[i] = counter[i] + 1
            if counter[i] % maxThread == i:
                print(str(counter[i]) + "....thread---" + str(i))
                # NOTE(review): full_path[1:] drops the first character of the
                # local path to build the object key (e.g. 'D:/x' -> ':/x');
                # looks suspicious -- confirm the intended key layout.
                # (Removed dead local: parse.quote(full_path) was computed but
                # never used.)
                result = bucket.put_object_from_file(full_path[1:], full_path)
                print('http status: {0}'.format(result.status))
def percentage(consumed_bytes, total_bytes):
    """Progress callback: print the percentage of bytes uploaded so far."""
    if not total_bytes:
        return
    rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
    print('\r{0}% '.format(rate), end='')
    sys.stdout.flush()
# Multi-thread fan-out -- assumes the directory tree does not change while the
# upload runs (the round-robin assignment depends on a stable walk order).
for i in range(maxThread):
    # Each thread gets its own counter slot, appended before the thread starts.
    counter.append(0)
    t1 =threading.Thread(target=gci,args=(rootdir,i))
    t1.start()
    # NOTE(review): threads are never joined; the script relies on non-daemon
    # threads keeping the process alive until all uploads finish.
今天改进了一下:改为对文件的完整路径做哈希来分配线程,不再依赖遍历顺序。
# -*- coding: utf-8 -*-
import oss2
import sys
import os
from urllib import parse
import threading
import hashlib
# SECURITY NOTE: a primary-account AccessKey grants access to every API and is
# high risk; strongly prefer creating a RAM sub-account for API access and
# routine operations (https://ram.console.aliyun.com).
auth = oss2.Auth('x', 'x')
# Endpoint shown is the Chengdu region; fill in your bucket's actual region.
bucket = oss2.Bucket(auth, 'http://oss-cn-chengdu.aliyuncs.com', 'x')
# Local directory tree to upload into the bucket.
rootdir = 'D:/backup/aliyun-oss-python-sdk-master'
# Number of uploader threads; also the modulus used to assign files to threads.
maxThread = 10
def walk_files(path):
    """Return a flat list of every file path under *path*, recursively."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]
# Per-thread counters of files visited so far; index i is owned by thread i.
counter = []
def simpleHash(path):
    """Deterministic hash of *path*: the sum of its characters' code points.

    The value feeds simpleHash(path) % maxThread to assign each file to
    exactly one uploader thread, so it must stay stable across runs.
    """
    return sum(ord(ch) for ch in path)
def gci(filepath, i):
    """Recursively walk *filepath*, uploading files whose path hash assigns them to worker *i*.

    Every worker walks the full tree; a file is uploaded by worker *i* when
    simpleHash(path) % maxThread == i, so ownership no longer depends on the
    walk order (unlike the round-robin variant).

    :param filepath: directory to walk recursively
    :param i: this worker's index in range(maxThread)
    """
    global counter, maxThread
    for name in os.listdir(filepath):
        full_path = os.path.join(filepath, name)
        if os.path.isdir(full_path):
            gci(full_path, i)
        else:
            # Per-thread count of files *visited* (not uploaded); used only in
            # the progress print below.
            counter[i] = counter[i] + 1
            if simpleHash(full_path) % maxThread == i:
                print(str(counter[i]) + "....thread---" + str(i))
                # NOTE(review): full_path[1:] drops the first character of the
                # local path to build the object key (e.g. 'D:/x' -> ':/x');
                # looks suspicious -- confirm the intended key layout.
                # (Removed dead local: parse.quote(full_path) was computed but
                # never used.)
                result = bucket.put_object_from_file(full_path[1:], full_path)
                print('http status: {0}'.format(result.status))
def percentage(consumed_bytes, total_bytes):
    """Progress callback: print the percentage of bytes uploaded so far."""
    if not total_bytes:
        return
    rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
    print('\r{0}% '.format(rate), end='')
    sys.stdout.flush()
# Multi-thread fan-out -- assumes the directory tree does not change while the
# upload runs.
for i in range(maxThread):
    # Each thread gets its own counter slot, appended before the thread starts.
    counter.append(0)
    t1 =threading.Thread(target=gci,args=(rootdir,i))
    t1.start()
    # NOTE(review): threads are never joined; the script relies on non-daemon
    # threads keeping the process alive until all uploads finish.
#t1 =threading.Thread(target=action,args=(100,))
#gci(rootdir)
#result=oss2.resumable_upload(bucket,'hadoop-2.7.7.tar.gz', 'D:\\backup\\hadoop-2.7.7.tar.gz',progress_callback=percentage)
# HTTP status code.