from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.utils.dates import days_ago
from airflow.exceptions import AirflowException
import tempfile
import pandas as pd
# Read data from SQL, save it as a CSV file, and upload it to MinIO
def get_data(s3_conn_id, bucket_name, object_key_prefix, pg_conn_id, sql, **context):
    # Unpack the execution date into year/month/day/hour components
    year, month, day, hour, *_ = context['execution_date'].timetuple()
    # Create the Postgres database connection hook
    pg_hook = PostgresHook(postgres_conn_id=pg_conn_id)
    # Get a SQLAlchemy engine instance
    sqlalchemy_engine = pg_hook.get_sqlalchemy_engine()
    try:
        # read_sql loads the query result into a DataFrame
        df = pd.read_sql(sql, sqlalchemy_engine)
        df.info()  # info() prints its summary directly and returns None
        print(df.head())
    except Exception as e:
        raise AirflowException(f"failed to read table into DataFrame: {e}")
    # Write the data to a CSV file and upload it to the S3 server (MinIO)
    with tempfile.NamedTemporaryFile('wb+') as fp:
        temp_filename = fp.name  # temporary file path
        # Object key for the uploaded CSV file
        object_key = (
            f"{object_key_prefix}/task09/"
            f"{year}/{year}-{month:0>2}/"
            f"payment-{year}{month:0>2}{day:0>2}.csv.gz"
        )
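        # For example, with an execution date of 2023-03-15 and the op_kwargs
        # below, this resolves to:
        # de08/dvdrental/task09/2023/2023-03/payment-20230315.csv.gz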
        try:
            # Export the DataFrame to gzip-compressed CSV
            df.to_csv(temp_filename, index=False, compression='gzip')
            # Upload to MinIO
            s3_hook = S3Hook(aws_conn_id=s3_conn_id)
            s3_hook.load_file(
                filename=temp_filename,
                bucket_name=bucket_name,
                key=object_key,
                replace=True,
            )
            print(f'put query data to s3: [{bucket_name}] -> {object_key}, success!')
        except Exception as e:
            raise AirflowException(f"failed to put query data to S3: {e}")
default_args = {
    'owner': 'your-airflow-account',  # placeholder; owner is the developer of the Airflow DAG
}
dag = DAG(
    dag_id="de13_task09",  # the prefix must be the tenant id
    description="dag to read dvdrental database to s3",
    start_date=days_ago(1),
    schedule_interval="0 0 * * *",  # schedule for the job (runs automatically at midnight every day)
    catchup=False,  # False: do not backfill runs for past dates
    max_active_runs=1,
    default_args=default_args,
    access_control={
        'de13': {'can_read', 'can_edit'}  # assign the DAG to a team [tenant id] and grant permissions
    },
    tags=['de08'],
)
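Before the task can run, the connection IDs referenced in op_kwargs further down must exist in Airflow. One way to define them is through AIRFLOW_CONN_* environment variables; the snippet below is a sketch with placeholder hosts and credentials (a MinIO endpoint at http://minio:9000 and a hypothetical dvdrental_pg Postgres connection), not the values from the original setup:

import os

# Placeholder endpoints and credentials -- substitute your own.
# The MinIO connection is an AWS-type connection; the endpoint URL travels in the extras.
os.environ["AIRFLOW_CONN_DE13_MINIO"] = (
    "aws://minio_access_key:minio_secret_key@/?endpoint_url=http%3A%2F%2Fminio%3A9000"
)
# A plain Postgres connection pointing at the dvdrental sample database.
os.environ["AIRFLOW_CONN_DVDRENTAL_PG"] = (
    "postgres://postgres:postgres@postgres-host:5432/dvdrental"
)

With these in place, pg_conn_id would be dvdrental_pg in op_kwargs.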
# SQL statement to execute
sql_stmt = """
SELECT
    first_name || ' ' || last_name AS full_name,
    SUM(amount) AS amount
FROM payment
INNER JOIN customer USING (customer_id)
GROUP BY full_name
ORDER BY amount;
"""
# Task definition
task_get_data = PythonOperator(
    task_id='get_data',
    python_callable=get_data,  # the callable to execute
    # connection settings in Airflow for MinIO and the Postgres database
    op_kwargs={
        's3_conn_id': 'de13_minio',
        'bucket_name': 'your-minio-bucket',  # placeholder: the MinIO bucket name
        'object_key_prefix': 'de08/dvdrental',
        'pg_conn_id': 'your-postgres-conn-id',  # placeholder: the Postgres connection ID in Airflow
        'sql': sql_stmt,
    },
    dag=dag,
)
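For a quick local smoke test without a scheduler, Airflow 2.5+ supports running the DAG directly from the file via dag.test(); a guard like the sketch below can be appended to the script, assuming the connections and bucket above are already configured:

if __name__ == "__main__":
    # Trigger one local DAG run for debugging; requires Airflow 2.5+.
    dag.test()

Alternatively, a single task can be exercised from the CLI with: airflow tasks test de13_task09 get_data 2023-03-15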