直接上代码,可以增加上传的速度,不然我几百万个文件要传到猴年马月。
实现方式是按遍历文件的顺序取模,把文件分配给不同的线程处理。缺点是上传过程中文件夹内的文件不能有新增或删除,否则分配就全乱了……后面再考虑其他方法吧。
# -*- coding: utf-8 -*-
import oss2
import sys
import os
from urllib import parse
import threading
# SECURITY NOTE: a primary-account AccessKey grants access to every API and is
# high risk; strongly prefer creating a RAM sub-account for API access and
# routine operations (https://ram.console.aliyun.com).
auth = oss2.Auth('xxx', 'xxx')
# Endpoint shown is the Chengdu region; fill in your bucket's actual region.
bucket = oss2.Bucket(auth, 'http://oss-cn-chengdu.aliyuncs.com', 'xxx')
# Local directory tree to upload into the bucket.
rootdir = 'D:/backup/aliyun-oss-python-sdk-master'
# Number of uploader threads; also the modulus used to assign files to threads.
maxThread = 10
def walk_files(path):
    """Recursively collect and return every file path under *path*."""
    collected = []
    for dirpath, _dirnames, filenames in os.walk(path):
        collected.extend(os.path.join(dirpath, name) for name in filenames)
    return collected
# Per-thread counters of files visited so far; index i is owned by thread i.
counter = []
def gci(filepath, i):
    """Recursively walk *filepath* and upload the files assigned to worker *i*.

    Every worker walks the full tree and keeps its own visit counter in
    counter[i]; a file is uploaded by worker *i* only when the running count
    modulo maxThread equals *i* (round-robin assignment). This partitions the
    files correctly only if every worker sees them in the same order, so the
    tree must not be modified while the upload runs.

    :param filepath: directory to walk recursively
    :param i: this worker's index in range(maxThread)
    """
    global counter, maxThread
    for name in os.listdir(filepath):
        full_path = os.path.join(filepath, name)
        if os.path.isdir(full_path):
            gci(full_path, i)
        else:
            counter[i] = counter[i] + 1
            if counter[i] % maxThread == i:
                print(str(counter[i]) + "....thread---" + str(i))
                # NOTE(review): full_path[1:] drops the first character of the
                # local path to build the object key (e.g. 'D:/x' -> ':/x');
                # looks suspicious -- confirm the intended key layout.
                # (Removed dead local: parse.quote(full_path) was computed but
                # never used.)
                result = bucket.put_object_from_file(full_path[1:], full_path)
                print('http status: {0}'.format(result.status))
def percentage(consumed_bytes, total_bytes):
    """Progress callback: print the percentage of bytes uploaded so far."""
    if not total_bytes:
        return
    rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
    print('\r{0}% '.format(rate), end='')
    sys.stdout.flush()
# Multi-thread fan-out -- assumes the directory tree does not change while the
# upload runs (the round-robin assignment depends on a stable walk order).
for i in range(maxThread):
    # Each thread gets its own counter slot, appended before the thread starts.
    counter.append(0)
    t1 =threading.Thread(target=gci,args=(rootdir,i))
    t1.start()
    # NOTE(review): threads are never joined; the script relies on non-daemon
    # threads keeping the process alive until all uploads finish.
今天改进了一下:改为对文件的完整路径做哈希来分配线程,不再依赖遍历顺序。
# -*- coding: utf-8 -*-
import oss2
import sys
import os
from urllib import parse
import threading
import hashlib
# SECURITY NOTE: a primary-account AccessKey grants access to every API and is
# high risk; strongly prefer creating a RAM sub-account for API access and
# routine operations (https://ram.console.aliyun.com).
auth = oss2.Auth('x', 'x')
# Endpoint shown is the Chengdu region; fill in your bucket's actual region.
bucket = oss2.Bucket(auth, 'http://oss-cn-chengdu.aliyuncs.com', 'x')
# Local directory tree to upload into the bucket.
rootdir = 'D:/backup/aliyun-oss-python-sdk-master'
# Number of uploader threads; also the modulus used to assign files to threads.
maxThread = 10
def walk_files(path):
    """Return a flat list of every file path under *path*, recursively."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]
# Per-thread counters of files visited so far; index i is owned by thread i.
counter = []
def simpleHash(path):
    """Deterministic hash of *path*: the sum of its characters' code points.

    The value feeds simpleHash(path) % maxThread to assign each file to
    exactly one uploader thread, so it must stay stable across runs.
    """
    return sum(ord(ch) for ch in path)
def gci(filepath, i):
    """Recursively walk *filepath*, uploading files whose path hash assigns them to worker *i*.

    Every worker walks the full tree; a file is uploaded by worker *i* when
    simpleHash(path) % maxThread == i, so ownership no longer depends on the
    walk order (unlike the round-robin variant).

    :param filepath: directory to walk recursively
    :param i: this worker's index in range(maxThread)
    """
    global counter, maxThread
    for name in os.listdir(filepath):
        full_path = os.path.join(filepath, name)
        if os.path.isdir(full_path):
            gci(full_path, i)
        else:
            # Per-thread count of files *visited* (not uploaded); used only in
            # the progress print below.
            counter[i] = counter[i] + 1
            if simpleHash(full_path) % maxThread == i:
                print(str(counter[i]) + "....thread---" + str(i))
                # NOTE(review): full_path[1:] drops the first character of the
                # local path to build the object key (e.g. 'D:/x' -> ':/x');
                # looks suspicious -- confirm the intended key layout.
                # (Removed dead local: parse.quote(full_path) was computed but
                # never used.)
                result = bucket.put_object_from_file(full_path[1:], full_path)
                print('http status: {0}'.format(result.status))
def percentage(consumed_bytes, total_bytes):
    """Progress callback: print the percentage of bytes uploaded so far."""
    if not total_bytes:
        return
    rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
    print('\r{0}% '.format(rate), end='')
    sys.stdout.flush()
# Multi-thread fan-out -- assumes the directory tree does not change while the
# upload runs.
for i in range(maxThread):
    # Each thread gets its own counter slot, appended before the thread starts.
    counter.append(0)
    t1 =threading.Thread(target=gci,args=(rootdir,i))
    t1.start()
    # NOTE(review): threads are never joined; the script relies on non-daemon
    # threads keeping the process alive until all uploads finish.
#t1 =threading.Thread(target=action,args=(100,))
#gci(rootdir)
#result=oss2.resumable_upload(bucket,'hadoop-2.7.7.tar.gz', 'D:\\backup\\hadoop-2.7.7.tar.gz',progress_callback=percentage)
# HTTP status code.