Case 1: A beginner DAG example
Write a Python script and place it in the $AIRFLOW_HOME/dags directory.
vim helloworld.py
"""
describe: test HelloWorld schedule
founder:Efron.Shu
Date created:2022-03-22
Modified by:Efron.Shu
Modified date:2022-03-22
##vim /data/efron/schedule/HelloWorld.py
"""
#!/usr/bin/python3
from datetime import datetime, timedelta
from airflow import DAG
from airflow.utils import dates
# from airflow.utils.helpers import chain
# from airflow.operators.bash_operator import BashOperator  # Airflow 1.x path
from airflow.operators.bash import BashOperator
# from airflow.operators.python_operator import PythonOperator  # Airflow 1.x path
from airflow.operators.python import PythonOperator
def default_options():
    default_args = {
        'owner': 'airflow',
        'start_date': dates.days_ago(1),
        'retries': 1,
        'retry_delay': timedelta(seconds=5)
    }
    return default_args
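# Note: the "{{ ds }}" used below is an Airflow Jinja template variable
# (the run's logical date as YYYY-MM-DD); it is rendered by the
# BashOperator at execution time, not when this file is parsed.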
def task1(dag):
    t = "echo " + "{{ ds }}"
    task = BashOperator(
        task_id='MyTask1',
        bash_command=t,
        dag=dag
    )
    print(t)
    return task
def hello_world():
    current_time = str(datetime.today())
    print('hello world at {}'.format(current_time))

def task2(dag):
    # PythonOperator
    task = PythonOperator(
        task_id='MyTask2',
        python_callable=hello_world,
        dag=dag
    )
    return task
def task3(dag):
    t = "date"
    task = BashOperator(
        task_id='MyTask3',
        bash_command=t,
        dag=dag
    )
    return task
with DAG(
    'HelloWorldDag',
    default_args=default_options(),
    schedule_interval='10 1 * * *',
    # schedule_interval='@once',
    catchup=False,
    # max_active_runs=1,  # avoid overlapping runs of the same schedule; if tasks touch a database, the same task running twice in parallel can deadlock
) as d:
    t1 = task1(d)
    t2 = task2(d)
    t3 = task3(d)
    # chain(t1, t2, t3)
    t1 >> t2
    t2 >> t3
# Run the script to check it for errors
python $AIRFLOW_HOME/dags/helloworld.py
# List the DAGs that took effect (Airflow 2.x CLI)
airflow dags list --subdir /opt/bitnami/airflow/dags
# List the tasks in the given DAG
airflow tasks list HelloWorldDag
# Test a single task in the DAG
airflow tasks test HelloWorldDag MyTask2 2022-03-21
Check the result in the Airflow web UI.
You can also run the task manually from the web page to test it.
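As an alternative to the web UI, a manual run of the whole DAG can also be triggered from the command line (Airflow 2.x CLI):
# trigger a manual DAG run
airflow dags trigger HelloWorldDag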
Case 2: A hands-on DAG project
Scheduling scenario for core trade analysis.
Requirements: 1. Implement the scheduling relationships between tasks across the ETL layers.
2. Make a task in the DAG depend on a task of another, external DAG.
10.1 Prepare the task programs to execute
Create the following files as the task programs to run; for now each file simply prints a message (see the sketch after this list).
# Load ODS-layer data
/opt/bitnami/airflow/scripts/ods_load_trade.sh
# Load DIM-layer data
/opt/bitnami/airflow/scripts/dim_load_product_cat.sh
/opt/bitnami/airflow/scripts/dim_load_shop_org.sh
/opt/bitnami/airflow/scripts/dim_load_payment.sh
/opt/bitnami/airflow/scripts/dim_load_product_info.sh
# Load DWD-layer data
/opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh
# Load DWS-layer data
/opt/bitnami/airflow/scripts/dws_load_trade_orders.sh
# Load ADS-layer data
/opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh
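The original does not show the script contents; a minimal placeholder that just echoes its own name and the date it receives (the DAG below passes yesterday's date as $1) might look like this:
#!/bin/bash
# ods_load_trade.sh — placeholder: print which script ran and for which date
echo "ods_load_trade.sh running for date: $1"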
10.2 Prepare the DAG script
Go to $AIRFLOW_HOME/dags
and edit the DAG file (referred to as XXX_POC_schedule.py in its header comments):
vim coretradedag.py
Contents:
"""
describe: ETL POC schedule
founder:Efron.Shu
Date created:2022-03-22
Modified by:Efron.Shu
Modified date:2022-03-22
----81.68.137.242----
vim /data/efron/schedule/ETL_POC_schedule.py
#加载ODS层数据
/opt/bitnami/airflow/scripts/ods_load_trade.sh
#加载DIM层数据
/opt/bitnami/airflow/scripts/dim_load_product_cat.sh
/opt/bitnami/airflow/scripts/dim_load_shop_org.sh
/opt/bitnami/airflow/scripts/dim_load_payment.sh
/opt/bitnami/airflow/scripts/dim_load_product_info.sh
#加载DWD层数据
/opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh
#加载DWS层数据
/opt/bitnami/airflow/scripts/dws_load_trade_orders.sh
#加载ADS层数据
/opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh
"""
#!/usr/bin/python3
from datetime import timedelta
import datetime
# from datetime import datetime
from airflow import DAG
# from airflow.models import DAG
from airflow.utils import dates
# from airflow.operators.bash_operator import BashOperator  # Airflow 1.x path
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
# from airflow.operators.sensors import ExternalTaskSensor  # Airflow 1.x path
# from airflow.sensors.external_task_sensor import ExternalTaskSensor  # deprecated path
from airflow.sensors.external_task import ExternalTaskSensor
# Default arguments for the DAG
default_args = {
    'owner': 'airflow',
    'start_date': dates.days_ago(1),
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
# Define the DAG
coretradedag = DAG(
    'coretrade',
    default_args=default_args,
    description='core trade analyze',
    schedule_interval='30 1 * * *',
    # schedule_interval='@once',
    # max_active_runs=1,
)
today = datetime.date.today()
oneday = timedelta(days=1)
yesterday = (today - oneday).strftime("%Y-%m-%d")
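# Note: `yesterday` is evaluated when the scheduler parses this file, not per
# DAG run. An alternative (not in the original) would be Airflow templating,
# e.g. bash_command='sh ... {{ macros.ds_add(ds, -1) }}', rendered per run.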
odstask = BashOperator(
    task_id='ods_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/ods_load_trade.sh ' + yesterday,
    dag=coretradedag
)
dimtask1 = BashOperator(
    task_id='dimtask_product_cat',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_product_cat.sh ' + yesterday,
    dag=coretradedag
)
dimtask2 = BashOperator(
    task_id='dimtask_shop_org',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_shop_org.sh ' + yesterday,
    dag=coretradedag
)
dimtask3 = BashOperator(
    task_id='dimtask_payment',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_payment.sh ' + yesterday,
    dag=coretradedag
)
dimtask4 = BashOperator(
    task_id='dimtask_product_info',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_product_info.sh ' + yesterday,
    dag=coretradedag
)
dwdtask = BashOperator(
    task_id='dwd_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh ' + yesterday,
    dag=coretradedag
)
dwstask = BashOperator(
    task_id='dws_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dws_load_trade_orders.sh ' + yesterday,
    dag=coretradedag
)
adstask = BashOperator(
    task_id='ads_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh ' + yesterday,
    dag=coretradedag
)
'''
Sensor modes: poke, reschedule; smart sensors.
DAG dependency mechanisms:
    TriggerDagRunOperator
    ExternalTaskSensor
    (cross-DAG Dependencies view added in Airflow 2.1)
'''
dag1_check_task = ExternalTaskSensor(
    # id of this task; this is what shows up in the Airflow task list
    task_id='dag1_check_task',
    # id of the DAG we depend on
    external_dag_id='HelloWorldDag',
    # id of the task within that DAG we depend on
    external_task_id='MyTask1',
    # allowed states of the external task; the default is ['success']
    allowed_states=['success'],
    # time offset from this run's schedule to the external task's run:
    # coretrade runs at 01:30 and HelloWorldDag at 01:10, so look 20 minutes back
    execution_delta=timedelta(minutes=20),
    # execution_date_fn=lambda dt: dt + timedelta(minutes=5),
    timeout=300,  # fail (and retry) if the external task has not reached the expected state after 300 seconds
    # poke_interval=60,  # check every minute
    mode='reschedule',  # default is 'poke'; 'reschedule' releases the worker slot between checks, saving resources while waiting
    check_existence=True,  # verify the external task exists; stop waiting immediately if it does not
    dag=coretradedag
)
'''
trigger_next_dag = TriggerDagRunOperator(
    trigger_dag_id='Download_Stock_Price',
    task_id='download_prices',
    execution_date='{{ ds }}',
    wait_for_completion=False
)
'''
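# Note: enabling the TriggerDagRunOperator sketch above would also need
# (Airflow 2.x): from airflow.operators.trigger_dagrun import TriggerDagRunOperator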
# print(odstask)
dag1_check_task >> adstask
odstask >> dimtask1
odstask >> dimtask2
odstask >> dimtask3
odstask >> dimtask4
odstask >> dwdtask
# dwdtask >> adstask
adstask.set_upstream(dwdtask)  # equivalent to dwdtask >> adstask
dwdtask >> dwstask
dimtask1 >> dwstask
dimtask2 >> dwstask
dimtask3 >> dwstask
dimtask4 >> dwstask
Note: when a task's depends_on_past parameter is set to True, it is only triggered if its run in the previous schedule interval succeeded.
10.3 Check the deployed DAG
Check from the command line (Airflow 2.x CLI):
airflow dags list
airflow tasks list coretrade --tree
airflow tasks list tutorial --tree
Tree view in the web UI
Graph view in the web UI
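The DAG structure can also be rendered from the command line (Airflow 2.x; image output assumes Graphviz is installed):
# print the DAG as Graphviz DOT, or save it as an image
airflow dags show coretrade
airflow dags show coretrade --save coretrade.png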