#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time: 2020/11/22 10:13
# @Author: zhangmingda
# @File: ks3_multi_thread_for_threadpool.py
# @Software: PyCharm
# Description: upload a local file or directory tree to KS3
import math
import os
import sys

from ks3.connection import Connection
from filechunkio import FileChunkIO
import threadpool


class Ks3Pusher(object):
    """Upload a local file or a directory tree to a KS3 bucket.

    Files below the block threshold are uploaded whole through a thread
    pool; larger files use KS3 multipart upload, with one thread pool
    per file for its parts.
    """

    def __init__(self, ak, sk, bucket_name, host,
                 block_threshold=100,
                 partsize=100,
                 files_thread_count=10,
                 part_thread_num=10):
        """
        :param ak: KS3 access key
        :param sk: KS3 secret key
        :param bucket_name: target bucket name
        :param host: KS3 endpoint host
        :param block_threshold: multipart-upload threshold, in MB
        :param partsize: size of each multipart chunk, in MB
        :param files_thread_count: concurrent whole-file uploads
        :param part_thread_num: concurrent part uploads per file
        """
        self.ak = ak
        self.sk = sk
        self.bucket_name = bucket_name
        self.host = host
        # Sizes are given in MB; keep them in bytes internally.
        self.block_threshold_bytes = block_threshold * 1024 * 1024
        self.partsize = partsize * 1024 * 1024
        self.retry_times = 3
        self.files_thread_count = files_thread_count
        self.part_thread_num = part_thread_num
        self.file_acl = 'private'
        # The connection is created lazily in __initialize(), not here.
def __initialize(self):
conn= Connection(self.ak, self.sk, host=self.host)
self.b=conn.get_bucket(self.bucket_name)def list_dir(self, abs_dir, prefix=''):
file_list=[]for file_or_dir inos.listdir(abs_dir):
sub_relative_path=os.path.join(abs_dir, file_or_dir)#子目录查找文件
ifos.path.isdir(sub_relative_path):
ks3_prefix=os.path.join(prefix, file_or_dir)#print('发现子目录:%s ' % ks3_prefix)
#递归查找最子目录,获取文件列表,最后合并子目录文件
file_list += self.list_dir(sub_relative_path, prefix=ks3_prefix)#文件获取绝对路径和ks3要存的名字
elifos.path.isfile(sub_relative_path):
file_abspath=os.path.abspath(sub_relative_path)
ks3_key=os.path.join(prefix, file_or_dir)
ks3_key= '/'.join(ks3_key.split('\\'))#print('%s is file' % file_abspath)
#列表添加元素
file_list.append(([file_abspath, ks3_key, self.file_acl], None))#print('目录:%s' % abs_dir,file_list)
returnfile_listdef push(self,local_path,prefix='',file_acl='private'):""":param local_path:本地路径
:param prefix: 前缀,或者ks3 key
:param file_acl: 文件权限
:return:"""self.file_acl=file_acl
self.__initialize()
abspath=os.path.abspath(local_path)ifos.path.isfile(abspath):
ks3_key=os.path.join(prefix, os.path.basename(abspath))
ks3_key= '/'.join(ks3_key.split('\\'))if os.stat(abspath).st_size
self.put(abspath, ks3_key=ks3_key, file_acl=file_acl)else:
self.multi_put(abspath, ks3_key=ks3_key, file_acl=file_acl)elifos.path.isdir(abspath):
thread_task_list=[]
all_file_list= self.list_dir(abspath, prefix=prefix)#print(all_file_list)
small_file_list=[]
big_file_list=[]#构建线程池实例
files_pool =threadpool.ThreadPool(self.files_thread_count)#全部添加到任务队列开始处理
for args inall_file_list:#判断是否使用分块上传
if os.stat(args[0][0]).st_size
small_file_list.append(args)else:#print(*args[0])
self.multi_put(*args[0])#big_file_list.append(args)
#初始化任务列表
small_requests =threadpool.makeRequests(self.put, small_file_list)#big_requests = threadpool.makeRequests(self.multi_put, big_file_list) # ***大文件并发,再并发分块。并发分块中全局变量不适合多个文件同时并发***
#全部添加到任务队列开始处理
[files_pool.putRequest(small_req) for small_req insmall_requests]#[files_pool.putRequest(big_req) for big_req in big_requests] # ***大文件并发,再并发分块。并发分块中全局变量不适合多个文件同时并发***
#等待所有子线程任务结束
files_pool.wait()def put(self,file_path,ks3_key,file_acl='private'):for i inrange(self.retry_times):try:
k=self.b.new_key(ks3_key)
ret= k.set_contents_from_filename(file_path, policy=file_acl)if ret and ret.status == 200:ifi:print("%s 重试第%s次上传成功" %(ks3_key,i))break
print("%s 上传成功" %ks3_key)break
else:print("%s RECV code:%s" %(ks3_key,ret.status))exceptException as e:if i+1 >=self.retry_times:print("%s 上传失败, Error: %s" %(ks3_key,e))defupload_part_task(self, mp, file_path, ks3_key, offset, chunk_size, part_num):""":param mp: KS3 会话实例
:param file_path: 本地文件名
:param ks3_key: ks3存储的文件名
:param offset: 起始字节点
:param chunk_size: 块大小
:param part_num: 块儿ID
:param retry_times: 单块失败重试次数
:return:"""cur_task_ret=Falsetry:for i inrange(self.retry_times):try:
with FileChunkIO(file_path,'rb', offset=offset, bytes=chunk_size) as fp:
mp.upload_part_from_file(fp, part_num=part_num)
cur_task_ret=Trueifi:print("%s part -----> %d retry %s times upload success" %(ks3_key, part_num, i))else:print("%s part -----> %d upload success" %(ks3_key, part_num))break
exceptBaseException as e:print("%s part %d upload_id=%s,error=%s" %(
ks3_key, part_num, mp.id, e))if i + 1 >=self.retry_times:print("%s part %d upload fail" %(ks3_key, part_num))raiseeexceptBaseException as e:
cur_task_ret=Falsefinally:return{part_num: cur_task_ret}defget_upload_part_result(self,req,result):""":param req:子线程实例
:param result: 每个线程的返回值
:return: 没必要....没人接收了"""
globalmulti_chunk_result
multi_chunk_result.update(result)def multi_put(self, file_path, ks3_key=None, file_acl="private"):""":param file_path:本地文件路径
:param ks3_key:ks3名称
:param file_acl: 文件权限
:return:"""
#分块任务列表
thread_list =[]#每个块儿的上传结果
globalmulti_chunk_result
multi_chunk_result={}#如果没有给KS3上面的文件命名,就获取原名字
if notks3_key:
ks3_key=os.path.basename(file_path)
f_size=os.stat(file_path).st_size
mp= self.b.initiate_multipart_upload(ks3_key, policy=file_acl)if notmp:raise RuntimeError("%s init multiupload error" %ks3_key)print("%s begin multipart upload,uploadid=%s" %(ks3_key, mp.id))
chunk_size=self.partsize
chunk_count= int(math.ceil(f_size /float(chunk_size)))
pool_args_list=[]try:for i inrange(chunk_count):
offset= chunk_size *i
bs= min(chunk_size, f_size -offset)
part_num= i + 1
#将一个文件划分的所有块儿任务,添加到任务列表
pool_args_list.append(([mp, file_path, ks3_key, offset, bs, part_num], None))#构建线程池实例
pool =threadpool.ThreadPool(self.part_thread_num)#初始化任务列表
requests =threadpool.makeRequests(self.upload_part_task, pool_args_list, self.get_upload_part_result)print('pool.putRequest(req)')#全部添加到任务队列开始处理
[pool.putRequest(req) for req inrequests]#等待所有子线程任务结束
pool.wait()#[multi_chunk_result.update(part_thread.result()) for part_thread in as_completed(thread_list)]
#上传总结
#如果任务数和块儿数对不上,报一下出入
if len(multi_chunk_result) !=chunk_count:raiseRuntimeError("%s part miss,expect=%d,actual=%d" %(ks3_key, chunk_count, len(multi_chunk_result)))#如果任务都完毕,检查是否有失败的块儿
for item inmulti_chunk_result.keys():if notmulti_chunk_result[item]:raise RuntimeError("%s part upload has fail" %ks3_key)#总结都OK,完成上传做合并动作
mp.complete_upload()print("%s multipart upload success" %ks3_key)return "%s multipart upload success" %ks3_keyexceptBaseException as e:print("%s multipart upload fail err:%s" %(ks3_key,e))ifmp:
mp.cancel_upload()raiseeif __name__ == '__main__':#Connect to S3
if __name__ == '__main__':
    # KS3 credentials and target bucket (placeholders).
    ak = 'XXXXXXXXXXXXX'
    sk = 'XXXXXXXXXXXXXXXXXXXXX'
    backet_name = 'XXXXXXXX'
    host = 'ks3-cn-beijing.ksyun.com'
    # host = 'ks3-cn-beijing-internal.ksyun.com'

    # Local file/directory path to upload; guard against a missing argument
    # (sys.argv[1] would raise IndexError otherwise).
    if len(sys.argv) < 2:
        exit("usage: %s <file_or_directory>" % sys.argv[0])
    path_name = sys.argv[1]
    if not os.path.exists(path_name):
        exit("%s not exists" % path_name)

    # KS3 key prefix prepended to every uploaded object.
    prefix = ''
    # Object policy: 'private' or 'public-read'.
    object_policy = 'public-read'
    # Number of concurrent whole-file uploads.
    files_thread_count = 10
    # Number of concurrent part-upload threads per file.
    part_thread_num = 10
    # Multipart-upload threshold, in MB.
    block_threshold = 10
    # Part size in MB, see https://docs.ksyun.com/documents/943
    # (when the total exceeds 5MB, every part but the last must be >= 5MB;
    #  when under 5MB, every part but the last must be >= 100KB;
    #  otherwise KS3 answers with HTTP 413).
    partsize = 10

    kpusher = Ks3Pusher(ak=ak, sk=sk, bucket_name=backet_name, host=host,
                        block_threshold=block_threshold,
                        partsize=partsize,
                        files_thread_count=files_thread_count,
                        part_thread_num=part_thread_num)
    kpusher.push(path_name, prefix=prefix, file_acl=object_policy)