import json

import boto3
from botocore.exceptions import ClientError

# basic config
str_jdbc_url = 'jdbc:redshift://xxx.redshift.amazonaws.com.cn:port/database?'
str_iam_role = 'arn:aws-cn:iam::1xxx1:role/dms-access-for-endpoint'
# S3 temp dir used by the Redshift connector below (placeholder, replace with your own bucket/prefix)
str_s3_path = 's3://xxx/temp/'
# get the secret; see the guides in the "secrets manager" module
def get_secret():
    secret_name = 'xxxxx'
    region_name = 'region'

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        # For a list of exceptions thrown, see
        # https://docs.amazonaws.cn/secretsmanager/latest/apireference/API_GetSecretValue.html
        raise e

    secret = get_secret_value_response['SecretString']
    return secret
# the secret is expected to be a JSON string like {"user": "...", "password": "..."},
# so parse it with json.loads instead of eval
dp_info = json.loads(get_secret())
str_jdbc_url = str_jdbc_url + 'user=' + dp_info['user'] + '&password=' + dp_info['password']
# jar library: org.apache.spark:spark-avro_2.12:3.3.0,io.github.spark-redshift-community:spark-redshift_2.12:6.2.0-spark_3.3,com.amazon.redshift:redshift-jdbc42:2.1.0.2
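# a minimal sketch of pulling the packages above in via spark.jars.packages, assuming the
# SparkSession is created here; in a managed notebook the jars are usually attached through
# the job/cluster configuration instead, and the existing `spark` session is reused
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('redshift-query') \
    .config('spark.jars.packages',
            'org.apache.spark:spark-avro_2.12:3.3.0,'
            'io.github.spark-redshift-community:spark-redshift_2.12:6.2.0-spark_3.3,'
            'com.amazon.redshift:redshift-jdbc42:2.1.0.2') \
    .getOrCreate()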
def get_df_in_redshift(query):
    df = spark.read \
        .format('io.github.spark_redshift_community.spark.redshift') \
        .option('url', str_jdbc_url) \
        .option('query', query) \
        .option('aws_iam_role', str_iam_role) \
        .option('tempdir', str_s3_path) \
        .load()
    return df
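# usage sketch (the schema and table names below are placeholders)
df_sample = get_df_in_redshift('SELECT * FROM some_schema.some_table LIMIT 10')
df_sample.show()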
# each Redshift query unloads its result to the S3 tempdir and the connector does NOT delete it, so clean up afterwards
def delete_in_s3_folder(bucket_name, folder_path):
    s3 = boto3.client('s3')
    # note: list_objects_v2 returns at most 1000 keys per call; paginate for larger folders
    objects_to_delete = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)
    if 'Contents' in objects_to_delete:
        keys_to_delete = [{'Key': obj['Key']} for obj in objects_to_delete['Contents']]
        # delete in small batches (delete_objects accepts up to 1000 keys per request)
        while len(keys_to_delete) > 0:
            batch_to_delete = keys_to_delete[:10]
            s3.delete_objects(Bucket=bucket_name, Delete={'Objects': batch_to_delete})
            keys_to_delete = keys_to_delete[10:]
            log_info(f"Deleted batch of {len(batch_to_delete)} objects.")
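# usage sketch: the bucket name and prefix below are placeholders and should match str_s3_path
delete_in_s3_folder('xxx-bucket', 'temp/')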
# alternative: list all files under the temp dir via the Hadoop FileSystem API, then delete them
def list_s3_file(s3_dir_path):
    javaPath = spark._jvm.java.net.URI.create(s3_dir_path)
    hadoopPath = spark._jvm.org.apache.hadoop.fs.Path(s3_dir_path)
    hadoopFileSystem = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
        javaPath, spark._jvm.org.apache.hadoop.conf.Configuration())
    iterator = hadoopFileSystem.listFiles(hadoopPath, True)
    s3_keys = []
    while iterator.hasNext():
        s3_keys.append(iterator.next().getPath().toUri().getRawPath())
    return s3_keys
def delete_s3_file(s3_dir_path, s3_keys):
    try:
        javaPath = spark._jvm.java.net.URI.create(s3_dir_path)
        hadoopFileSystem = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
            javaPath, spark._jvm.org.apache.hadoop.conf.Configuration())
        for i in s3_keys:
            hadoopPath = spark._jvm.org.apache.hadoop.fs.Path(i)
            hadoopFileSystem.delete(hadoopPath, True)
        print(f'Emptied S3 path: {s3_dir_path}')
    except Exception as e:
        print(f"Error deleting objects in {s3_dir_path}: {e}")
delete_s3_file(str_s3_path, list_s3_file(str_s3_path))