# Delete blob objects in Azure Data Lake Storage Gen2
import os
import time
from retrying import retry
from azure.storage.blob import BlobServiceClient
class DirectoryClient:
    """Directory-style helper around a single Azure Blob Storage container.

    Blob names are treated as '/'-separated paths, so a shared name prefix
    can be listed and deleted as if it were a directory.
    """

    # Blobs sent per delete_blobs() call. The Blob Batch API rejects batches
    # with more than 256 sub-requests, so large deletions must be chunked.
    _DELETE_BATCH_SIZE = 200

    def __init__(self, connection_string, container_name):
        """Create the container client used by every other method.

        Args:
            connection_string: Storage account connection string handed to
                BlobServiceClient.from_connection_string.
            container_name: Name of the container all operations act on.
        """
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    @staticmethod
    def _as_prefix(path):
        """Return path with a trailing '/' unless it is '' or already ends with one."""
        # Compare against '' explicitly (not truthiness) so that a None path
        # still fails loudly instead of silently becoming "list everything".
        if path != '' and not path.endswith('/'):
            return path + '/'
        return path

    def ls_files(self, path, recursive=False):
        """List the blobs found under a path.

        Args:
            path: Directory-like prefix; '' means the container root.
            recursive: When True, blobs in nested sub-paths are included too.

        Returns:
            Blob names relative to path, in listing order.
        """
        prefix = self._as_prefix(path)
        files = []
        for blob in self.client.list_blobs(name_starts_with=prefix):
            relative_path = os.path.relpath(blob.name, prefix)
            if recursive or '/' not in relative_path:
                files.append(relative_path)
        return files

    def ls_dirs(self, path, recursive=False):
        """List the sub-paths found under a path.

        A sub-path is reported only when some listed blob sits directly
        inside it, because it is derived from the parent of each blob name.

        Args:
            path: Directory-like prefix; '' means the container root.
            recursive: When True, nested sub-paths are included too.

        Returns:
            Unique sub-paths relative to path, in first-seen order.
        """
        prefix = self._as_prefix(path)
        dirs = []
        seen = set()  # O(1) duplicate check; `dirs` keeps the listing order
        for blob in self.client.list_blobs(name_starts_with=prefix):
            relative_dir = os.path.dirname(os.path.relpath(blob.name, prefix))
            if not relative_dir or relative_dir in seen:
                continue
            if recursive or '/' not in relative_dir:
                seen.add(relative_dir)
                dirs.append(relative_dir)
        return dirs

    def rm(self, path, recursive=False):
        """Delete a single blob, or everything under a path.

        Args:
            path: Blob name, or a directory-like prefix when recursive is True.
            recursive: When True, delegate to rmdir(path).
        """
        if recursive:
            self.rmdir(path)
            return
        print(f'Deleting {path}')
        self.client.delete_blob(path)

    def rmdir(self, path):
        """Delete every blob under a path, including nested sub-paths.

        Does nothing (and prints nothing) when no blob is found under path.
        Blobs are deleted in consecutive, non-overlapping batches so that no
        blob is submitted for deletion more than once.

        Args:
            path: Directory-like prefix whose contents are removed.
        """
        relative_names = self.ls_files(path, recursive=True)
        if not relative_names:
            return
        prefix = self._as_prefix(path)
        blob_names = [prefix + name for name in relative_names]
        batch_size = self._DELETE_BATCH_SIZE
        for start in range(0, len(blob_names), batch_size):
            self.client.delete_blobs(*blob_names[start:start + batch_size])
        print(prefix + ':blob 删除完成')

    def droptable(self, dbName, tableName):
        """Drop the table dbName.tableName through Spark SQL if it exists.

        Uses a global `spark` object that this class does not define
        (presumably provided by the notebook/runtime -- verify).

        Args:
            dbName: Database name interpolated into the DROP TABLE statement.
            tableName: Table name interpolated into the DROP TABLE statement.
        """
        spark.sql(f'DROP TABLE IF EXISTS {dbName}.{tableName}')
        print(f'{dbName}.{tableName} 删除完成')
@retry(stop_max_attempt_number=20, wait_incrementing_increment=200)
def main():
    """Remove the storage blobs and the Spark table for each listed table.

    For every table name, the blobs under the prefix of the same name in the
    'dwm-storage' container are deleted first, then the table is dropped from
    the 'dwm_dev' database. The function is wrapped by `retry` configured
    with stop_max_attempt_number=20 and wait_incrementing_increment=200.
    """
    # NOTE(review): the storage account key is embedded in this literal;
    # consider loading the connection string from a secret store instead.
    connection_string = 'DefaultEndpointsProtocol=https;AccountName=aalsabddev01e2;AccountKey=xxxxxxxxxxxxxxxxxxx==;EndpointSuffix=core.chinacloudapi.cn'
    storage_container = 'dwm-storage'
    database = 'dwm_dev'
    directory_client = DirectoryClient(
        connection_string=connection_string,
        container_name=storage_container,
    )
    tables_to_drop = [
        'm02_iap_track_logon_evt',
        'm02_iap_track_vist_evt',
        'm02_iap_track_page_clos_evt',
    ]
    for table_name in tables_to_drop:
        # Delete the underlying data before removing the table definition.
        directory_client.rmdir(table_name)
        directory_client.droptable(database, table_name)