import json

import boto3
from botocore.exceptions import ClientError

# basic config
str_jdbc_url = 'jdbc:redshift://xxx.redshift.amazonaws.com.cn:port/database?'
str_iam_role = 'arn:aws-cn:iam::1xxx1:role/dms-access-for-endpoint'
# S3 temp dir used by the Redshift connector below (placeholder, replace with your own bucket/prefix)
str_s3_path = 's3://xxx/temp/'
# get the secret; see the guides in the "secrets manager" module
def get_secret():
    secret_name = 'xxxxx'
    region_name = 'region'

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        # For a list of exceptions thrown, see
        # https://docs.amazonaws.cn/secretsmanager/latest/apireference/API_GetSecretValue.html
        raise e

    secret = get_secret_value_response['SecretString']
    return secret
# the secret is expected to be a JSON string like {"user": "...", "password": "..."},
# so parse it with json.loads instead of eval
dp_info = json.loads(get_secret())
str_jdbc_url = str_jdbc_url + 'user=' + dp_info['user'] + '&password=' + dp_info['password']
# jar library: org.apache.spark:spark-avro_2.12:3.3.0,io.github.spark-redshift-community:spark-redshift_2.12:6.2.0-spark_3.3,com.amazon.redshift:redshift-jdbc42:2.1.0.2
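# a minimal sketch of pulling the packages above in via spark.jars.packages, assuming the
# SparkSession is created here; in a managed notebook the jars are usually attached through
# the job/cluster configuration instead, and the existing `spark` session is reused
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('redshift-query') \
    .config('spark.jars.packages',
            'org.apache.spark:spark-avro_2.12:3.3.0,'
            'io.github.spark-redshift-community:spark-redshift_2.12:6.2.0-spark_3.3,'
            'com.amazon.redshift:redshift-jdbc42:2.1.0.2') \
    .getOrCreate()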
def get_df_in_redshift(query):
    df = spark.read \
        .format('io.github.spark_redshift_community.spark.redshift') \
        .option('url', str_jdbc_url) \
        .option('query', query) \
        .option('aws_iam_role', str_iam_role) \
        .option('tempdir', str_s3_path) \
        .load()
    return df
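# usage sketch (the schema and table names below are placeholders)
df_sample = get_df_in_redshift('SELECT * FROM some_schema.some_table LIMIT 10')
df_sample.show()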
# each Redshift query unloads its result to the S3 tempdir and the connector does NOT delete it, so clean up afterwards
def delete_in_s3_folder(bucket_name, folder_path):
    s3 = boto3.client('s3')
    # note: list_objects_v2 returns at most 1000 keys per call; paginate for larger folders
    objects_to_delete = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)
    if 'Contents' in objects_to_delete:
        keys_to_delete = [{'Key': obj['Key']} for obj in objects_to_delete['Contents']]
        # delete in small batches (delete_objects accepts up to 1000 keys per request)
        while len(keys_to_delete) > 0:
            batch_to_delete = keys_to_delete[:10]
            s3.delete_objects(Bucket=bucket_name, Delete={'Objects': batch_to_delete})
            keys_to_delete = keys_to_delete[10:]
            log_info(f"Deleted batch of {len(batch_to_delete)} objects.")
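# usage sketch: the bucket name and prefix below are placeholders and should match str_s3_path
delete_in_s3_folder('xxx-bucket', 'temp/')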
# alternative: list all files under the temp dir via the Hadoop FileSystem API, then delete them
def list_s3_file(s3_dir_path):
    javaPath = spark._jvm.java.net.URI.create(s3_dir_path)
    hadoopPath = spark._jvm.org.apache.hadoop.fs.Path(s3_dir_path)
    hadoopFileSystem = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
        javaPath, spark._jvm.org.apache.hadoop.conf.Configuration())
    iterator = hadoopFileSystem.listFiles(hadoopPath, True)
    s3_keys = []
    while iterator.hasNext():
        s3_keys.append(iterator.next().getPath().toUri().getRawPath())
    return s3_keys
def delete_s3_file(s3_dir_path, s3_keys):
    try:
        javaPath = spark._jvm.java.net.URI.create(s3_dir_path)
        hadoopFileSystem = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
            javaPath, spark._jvm.org.apache.hadoop.conf.Configuration())
        for i in s3_keys:
            hadoopPath = spark._jvm.org.apache.hadoop.fs.Path(i)
            hadoopFileSystem.delete(hadoopPath, True)
        print(f'Emptied S3 path: {s3_dir_path}')
    except Exception as e:
        print(f"Error deleting objects in {s3_dir_path}: {e}")
delete_s3_file(str_s3_path, list_s3_file(str_s3_path))