1.安装
-
版本
airflow 1.10.6
python 3.6 -
安装
- 安装python3.6, 创建虚拟环境
[root@ip-172-27-0-4 upload]#yum install python36
[root@ip-172-27-0-4 upload]# virtualenv -p /usr/bin/python3 ./air-flow
[root@ip-172-27-0-4 upload]# source ./air-flow/bin/activate - 安装依赖包
[root@ip-172-27-0-4 upload] yum install gcc
[root@ip-172-27-0-4 upload] yum install python36-devel - 安装airflow
pip install apache-airflow  (注意: PyPI 包名是 apache-airflow, 直接 pip install airflow 会安装失败)
2.初始化
----- (optional)
export AIRFLOW_HOME=/home/upload/airflow_home
----- install from pypi using pip
pip install apache-airflow
----- initialize the database
airflow initdb
----- start the web server, default port is 8080
airflow webserver -p 8080
----- start the scheduler
airflow scheduler
3. Hello World
- AIRFLOW_HOME 创建一个dags 目录 (注意此时的操作都是在 python virtualenv下面执行)
(air-flow) [root@ip-172-27-0-4 airflow_home]# mkdir dags
- dags 编辑hello_world.py
# -*- coding: utf-8 -*-
"""A minimal "Hello World" DAG: date_task -> (sleep_task, hello_task)."""
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta

# -----------------------------------------------------------------------------
# These args get passed on to each operator; you can override them on a
# per-task basis during operator initialization.
default_args = {
    'owner': 'airflow',  # fixed: original was missing the closing quote
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# -----------------------------------------------------------------------------
# DAG definition: scheduled to run once per day.
dag = DAG(
    'example_hello_world_dag',
    default_args=default_args,
    description='my first DAG',
    schedule_interval=timedelta(days=1))

# -----------------------------------------------------------------------------
# First operator: print the current date via a shell command.
date_operator = BashOperator(
    task_id='date_task',
    bash_command='date',
    dag=dag)

# -----------------------------------------------------------------------------
# Second operator: sleep for 5 seconds.
sleep_operator = BashOperator(
    task_id='sleep_task',
    depends_on_past=False,
    bash_command='sleep 5',
    dag=dag)

# -----------------------------------------------------------------------------
# Third operator: a Python callable that returns a greeting
# (the return value is stored as an XCom by the PythonOperator).
def print_hello():
    return 'Hello world!'

hello_operator = PythonOperator(
    task_id='hello_task',
    python_callable=print_hello,
    dag=dag)

# -----------------------------------------------------------------------------
# Dependencies: sleep_task and hello_task both run after date_task.
sleep_operator.set_upstream(date_operator)
hello_operator.set_upstream(date_operator)
-
通过命令查看hello_example 中的dag
(air-flow) [root@ip-172-27-0-4 dags]# airflow list_tasks example_hello_world_dag
[2019-12-15 11:10:03,960] {__init__.py:51} INFO - Using executor SequentialExecutor
[2019-12-15 11:10:03,961] {dagbag.py:92} INFO - Filling up the DagBag from /home/upload/airflow_home/dags
date_task
hello_task
sleep_task -
启动scheduler
(air-flow) [root@ip-172-27-0-4 dags]# airflow scheduler
如果同时打开webUI 和scheduler会报错:
ERROR - Cannot use more than 1 thread when using sqlite. Setting parallelism to 1
但是不会影响 scheduler 的运行
-
启动DAG
-
在界面中能看到加入的dag的运行状态