Airflow DAG Project in Practice

Case 1: A Getting-Started DAG

Write a Python script and place it in the $AIRFLOW_HOME/dags directory:

vim helloworld.py

"""

describe: test HelloWorld schedule

founder:Efron.Shu

Date created:2022-03-22

Modified by:Efron.Shu

Modified date:2022-03-22

##vim /data/efron/schedule/HelloWorld.py

"""

#!/usr/bin/python3

from jinja2 import Template

from datetime import datetime,timedelta

from airflow import DAG

from airflow.utils import dates

# from airflow.utils.helper import chain

# from airflow.operators.bash_operatorimport BashOperator

from airflow.operators.bash importBashOperator

# from airflow.operators.python_operatorimport PythonOperator

from airflow.operators.python importPythonOperator

def default_options():
    default_args = {
        'owner': 'airflow',
        'start_date': dates.days_ago(1),
        'retries': 1,
        'retry_delay': timedelta(seconds=5),
    }
    return default_args

def task1(dag):
    # "{{ ds }}" is a Jinja macro that Airflow renders to the
    # execution date (YYYY-MM-DD) when the task runs
    t = "echo " + "{{ ds }}"
    task = BashOperator(
        task_id='MyTask1',
        bash_command=t,
        dag=dag,
    )
    print(t)
    return task

def hello_world():
    current_time = str(datetime.today())
    print('hello world at {}'.format(current_time))

def task2(dag):
    # PythonOperator runs a Python callable
    task = PythonOperator(
        task_id='MyTask2',
        python_callable=hello_world,
        dag=dag,
    )
    return task

def task3(dag):
    t = "date"
    task = BashOperator(
        task_id='MyTask3',
        bash_command=t,
        dag=dag,
    )
    return task

with DAG(
    'HelloWorldDag',
    default_args=default_options(),
    schedule_interval='10 1 * * *',  # run daily at 01:10
    # schedule_interval='@once',
    catchup=False,
    # max_active_runs=1,  # prevents overlapping runs; if tasks touch a database,
    #                     # two instances of the same task running at once can deadlock
) as d:
    t1 = task1(d)
    t2 = task2(d)
    t3 = task3(d)
    # chain(t1, t2, t3) is equivalent to the two lines below
    t1 >> t2
    t2 >> t3

# Run the script to check for syntax errors
python $AIRFLOW_HOME/dags/helloworld.py

# List the DAGs that have been picked up (Airflow 2.x syntax)
airflow dags list --subdir /opt/bitnami/airflow/dags

# List the tasks in a given DAG
airflow tasks list HelloWorldDag

# Test a single task in the DAG for a given execution date
airflow tasks test HelloWorldDag MyTask2 2022-03-21

Check the result in the web UI.

You can also run a task manually from the web UI to test it.
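
A full DAG run can also be triggered from the command line (Airflow 2.x):

airflow dags trigger HelloWorldDag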

Case 2: A Real-World DAG Project

This case covers a core trade analysis scheduling scenario.

Requirements:
1. Implement the scheduling relationships between tasks across the ETL layers.
2. Make a task in this DAG depend on a task in an external DAG.

10.1 Prepare the task scripts

Create the following files to stand in for the real task programs; for now each file simply prints a line of output, as sketched after the list below.

# Load ODS-layer data
/opt/bitnami/airflow/scripts/ods_load_trade.sh

# Load DIM-layer data
/opt/bitnami/airflow/scripts/dim_load_product_cat.sh
/opt/bitnami/airflow/scripts/dim_load_shop_org.sh
/opt/bitnami/airflow/scripts/dim_load_payment.sh
/opt/bitnami/airflow/scripts/dim_load_product_info.sh

# Load DWD-layer data
/opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh

# Load DWS-layer data
/opt/bitnami/airflow/scripts/dws_load_trade_orders.sh

# Load ADS-layer data
/opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh
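
Each DAG task defined below passes yesterday's date as the script's first argument, so a minimal placeholder body only needs to echo it (a sketch, using ods_load_trade.sh as the example; the real load logic would replace the echo):

#!/bin/bash
# Placeholder: print the layer name and the date argument ($1)
# supplied by the Airflow task.
echo "ods_load_trade: loading trade data for date $1"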

10.2 Prepare the DAG script

Go to $AIRFLOW_HOME/dags and create the DAG file (called XXX_POC_schedule.py in the original notes; here saved as coretradedag.py):

vim coretradedag.py

The content is as follows:

"""

describe: ETL POC schedule

founder:Efron.Shu

Date created:2022-03-22

Modified by:Efron.Shu

Modified date:2022-03-22

----81.68.137.242----

vim /data/efron/schedule/ETL_POC_schedule.py

#加载ODS层数据

/opt/bitnami/airflow/scripts/ods_load_trade.sh

#加载DIM层数据

/opt/bitnami/airflow/scripts/dim_load_product_cat.sh

/opt/bitnami/airflow/scripts/dim_load_shop_org.sh

/opt/bitnami/airflow/scripts/dim_load_payment.sh

/opt/bitnami/airflow/scripts/dim_load_product_info.sh

#加载DWD层数据

/opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh

#加载DWS层数据

/opt/bitnami/airflow/scripts/dws_load_trade_orders.sh

#加载ADS层数据

/opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh

"""

#!/usr/bin/python3

from datetime import timedelta

import datetime

# from datetime import datetime

from airflow import DAG

# from airflow.models import DAG

from airflow.utils import dates

# from airflow.operators.bash_operatorimport BashOperator

from airflow.operators.bash importBashOperator

from airflow.utils.dates import days_ago

# from airflow.operators.sensors importExternalTaskSensor

# from airflow.sensors.external_task_sensorimport ExternalTaskSensor

from airflow.sensors.external_task importExternalTaskSensor

# Default arguments for the DAG
default_args = {
    'owner': 'airflow',
    'start_date': dates.days_ago(1),
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# Define the DAG
coretradedag = DAG(
    'coretrade',
    default_args=default_args,
    description='core trade analyze',
    schedule_interval='30 1 * * *',  # run daily at 01:30
    # schedule_interval='@once',
    # max_active_runs=1,
)

# Compute yesterday's date as YYYY-MM-DD; it is passed to every load script
today = datetime.date.today()
oneday = timedelta(days=1)
yesterday = (today - oneday).strftime("%Y-%m-%d")

odstask = BashOperator(
    task_id='ods_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/ods_load_trade.sh ' + yesterday,
    dag=coretradedag
)
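
Note that yesterday is computed when the scheduler parses the file, not per run, so a backfilled run would still receive the current date. A sketch of an alternative that relies on Airflow's templating instead (for this daily schedule, the {{ ds }} macro already resolves to the previous day's logical date; the task_id here is hypothetical, for illustration only):

# Sketch: let Jinja supply the date at run time instead of at parse time,
# so backfills receive their own logical date.
odstask_templated = BashOperator(
    task_id='ods_load_data_templated',  # hypothetical task_id
    bash_command='sh /opt/bitnami/airflow/scripts/ods_load_trade.sh {{ ds }}',
    dag=coretradedag,
)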

dimtask1 = BashOperator(
    task_id='dimtask_product_cat',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_product_cat.sh ' + yesterday,
    dag=coretradedag
)

dimtask2 = BashOperator(
    task_id='dimtask_shop_org',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_shop_org.sh ' + yesterday,
    dag=coretradedag
)

dimtask3 = BashOperator(
    task_id='dimtask_payment',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_payment.sh ' + yesterday,
    dag=coretradedag
)

dimtask4 = BashOperator(
    task_id='dimtask_product_info',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dim_load_product_info.sh ' + yesterday,
    dag=coretradedag
)

dwdtask = BashOperator(
    task_id='dwd_load_data',
    depends_on_past=False,
    # note the space before the concatenated date argument
    bash_command='sh /opt/bitnami/airflow/scripts/dwd_load_trade_orders.sh ' + yesterday,
    dag=coretradedag
)

dwstask = BashOperator(
    task_id='dws_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/dws_load_trade_orders.sh ' + yesterday,
    dag=coretradedag
)

adstask = BashOperator(
    task_id='ads_load_data',
    depends_on_past=False,
    bash_command='sh /opt/bitnami/airflow/scripts/ads_load_trade_order_analysis.sh ' + yesterday,
    dag=coretradedag
)

'''
Sensor modes: poke (the default) holds the worker slot while waiting;
reschedule releases the slot between checks; smart sensors batch many
sensor checks into a few worker processes.

Cross-DAG dependencies can be built with:
  TriggerDagRunOperator - the upstream DAG pushes a trigger downstream
  ExternalTaskSensor    - the downstream DAG waits for an upstream task
The DAG Dependencies view was added in Airflow 2.1.
'''

dag1_check_task = ExternalTaskSensor(
    # id of this task, shown in the Airflow UI task list
    task_id='dag1_check_task',
    # id of the external DAG to depend on
    external_dag_id='HelloWorldDag',
    # id of the task inside the external DAG to wait for
    external_task_id='MyTask1',
    # allowed states of the external task; the default is ['success']
    allowed_states=['success'],
    # offset between this DAG's execution time and the external task's:
    # coretrade runs at 01:30 and HelloWorldDag at 01:10, so a delta of
    # 20 minutes maps each run to the matching HelloWorldDag run
    execution_delta=timedelta(minutes=20),
    # execution_date_fn=lambda dt: dt + timedelta(minutes=5),
    # if the external task has not reached the expected state within
    # 300 seconds, raise an exception and enter retry
    timeout=300,
    # poke_interval=60,  # check every minute
    # reschedule mode frees the worker slot between checks,
    # saving resources compared with the default poke mode
    mode='reschedule',
    # fail fast if the external task does not exist at all
    check_existence=True,
    dag=coretradedag
)

'''
# from airflow.operators.trigger_dagrun import TriggerDagRunOperator
trigger_next_dag = TriggerDagRunOperator(
    trigger_dag_id="Download_Stock_Price",
    task_id='download_prices',
    execution_date="{{ ds }}",
    wait_for_completion=False
)
'''

# print(odstask)
dag1_check_task >> adstask
odstask >> dimtask1
odstask >> dimtask2
odstask >> dimtask3
odstask >> dimtask4
odstask >> dwdtask
# dwdtask >> adstask
adstask.set_upstream(dwdtask)  # same as dwdtask >> adstask
dwdtask >> dwstask
dimtask1 >> dwstask
dimtask2 >> dwstask
dimtask3 >> dwstask
dimtask4 >> dwstask
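
The same wiring can be written more compactly, since >> accepts lists on either side (an equivalent sketch):

odstask >> [dimtask1, dimtask2, dimtask3, dimtask4, dwdtask]
[dimtask1, dimtask2, dimtask3, dimtask4, dwdtask] >> dwstask
dwdtask >> adstask
dag1_check_task >> adstask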

Note: when depends_on_past is set to True on a task, an instance of that task is scheduled only if the same task succeeded in the previous DAG run. A minimal sketch follows.
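
# Sketch (hypothetical task, for illustration): with depends_on_past=True,
# today's instance waits for yesterday's instance of this task to succeed.
strict_task = BashOperator(
    task_id='strict_load',
    depends_on_past=True,
    bash_command='echo strict load',
    dag=coretradedag,
)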

10.3 Check the deployed DAG

Check from the command line (Airflow 2.x syntax):

airflow dags list

airflow tasks list coretrade --tree

airflow tasks list tutorial --tree

Tree view in the web UI.

Graph view in the web UI.
