Scheduled jobs in crontab are hard to monitor day to day, so we decided to switch to a proper scheduling framework.
1. Install dependencies
# avoid storing connection passwords in plain text
pip3 install cryptography
pip3 install paramiko
# AttributeError: module 'enum' has no attribute 'IntFlag'
pip3 uninstall enum34
pip3 install celery
pip3 install redis
pip3 install dask
# build prerequisite for mysqlclient
yum install mysql-devel
pip3 install mysqlclient
pip3 install apache-airflow
# keep the log volume down
cd /usr/local/lib/python3.7/site-packages/airflow
vim settings.py
# LOGGING_LEVEL = logging.INFO
LOGGING_LEVEL = logging.WARN
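With the packages in place, a quick import check confirms everything installed cleanly before going further (a minimal sketch; the module names follow the pip installs above):

# check_deps.py - sanity-check the dependencies installed above
import cryptography, paramiko, celery, redis, dask
import MySQLdb        # provided by the mysqlclient package
import airflow
print("airflow version:", airflow.__version__)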
2. Configure environment variables
# vim /etc/profile
# set the airflow working directory; by default it lives under the current user's home directory (~/airflow)
export AIRFLOW_HOME=/usr/local/airflow
# source /etc/profile
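To confirm the variable is visible in a fresh shell, a trivial Python check (assuming the export above):

import os
print(os.environ.get("AIRFLOW_HOME"))   # expect /usr/local/airflow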
3. Initialize airflow for the first time
airflow initdb
# inspect the generated files
cd /usr/local/airflow
ls
airflow.cfg airflow.db logs unittests.cfg
4. Configure MySQL (create the airflow database and grant the airflow user access to it)
CREATE DATABASE airflow;
# note: if you choose a different password here, update sql_alchemy_conn in airflow.cfg to match (and broker_url / result_backend if you also change the redis password)
GRANT ALL PRIVILEGES ON airflow.* TO airflow@'localhost' IDENTIFIED BY 'airflow_123';
FLUSH PRIVILEGES;
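A short connectivity check verifies the grant actually works (a sketch, using the mysqlclient driver from step 1 and the credentials above):

import MySQLdb
conn = MySQLdb.connect(host="127.0.0.1", user="airflow",
                       passwd="airflow_123", db="airflow", charset="utf8")
cur = conn.cursor()
cur.execute("SELECT DATABASE(), CURRENT_USER()")
print(cur.fetchone())   # expect ('airflow', 'airflow@localhost')
conn.close()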
5. Update the MySQL configuration
# until this variable is set, airflow fails against MySQL with:
Exception: Global variable explicit_defaults_for_timestamp needs to be on (1) for mysql
vim /etc/my.cnf
[mysqld]
explicit_defaults_for_timestamp=1
# restart mysql
service mysqld restart
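After the restart, the new value can be confirmed from any client; a sketch with the same driver:

import MySQLdb
conn = MySQLdb.connect(host="127.0.0.1", user="airflow",
                       passwd="airflow_123", db="airflow")
cur = conn.cursor()
cur.execute("SHOW VARIABLES LIKE 'explicit_defaults_for_timestamp'")
print(cur.fetchone())   # expect ('explicit_defaults_for_timestamp', 'ON')
conn.close()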
6. Edit the airflow configuration
vim $AIRFLOW_HOME/airflow.cfg
# change the default timezone, otherwise the web UI will not show China local time
# default_timezone = utc
default_timezone = Asia/Shanghai
# switch the executor type
executor = CeleryExecutor
# do not load the example DAGs
load_examples = False
# do not let runs of the same DAG overlap; for ETL, linear execution is easier to control. If a step needs batch work, parallelize inside the task with threads or processes rather than modelling it in the DAG
max_active_runs_per_dag = 1
# maximum number of task instances allowed to run concurrently per DAG; the server's CPU core count is a common choice, and the default of 16 is also fine
dag_concurrency = 16
# maximum task concurrency per Celery worker; with CeleryExecutor tasks run inside the worker's processes, and this sets how many of them each `airflow worker` starts
worker_concurrency = 16
# database connection; MySQL is the usual backend for running Airflow
# a pymysql URL here previously kept the scheduler from running in the background, so use the mysqlclient driver (plain mysql://)
sql_alchemy_conn = mysql://airflow:airflow_123@127.0.0.1:3306/airflow?charset=utf8
# the redis password here is also airflow_123
# Celery broker; in the default configuration the two redis settings point at two separate redis databases
broker_url = redis://:airflow_123@127.0.0.1:6379/0
# Celery result backend; see the note above about the two redis databases
result_backend = redis://:airflow_123@127.0.0.1:6379/1
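Before restarting anything, it is worth reading the values back the way Airflow itself will, and pinging the Celery broker (a sketch; section/key names as in the airflow.cfg edits above):

from airflow.configuration import conf
import redis

print(conf.get("core", "executor"))           # CeleryExecutor
print(conf.get("core", "default_timezone"))   # Asia/Shanghai
r = redis.Redis(host="127.0.0.1", port=6379, db=0, password="airflow_123")
print(r.ping())                               # True if the broker is reachable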
7. Switch displayed times to China time
cd /usr/local/lib/python3.7/site-packages/airflow
cd utils/
vim timezone.py
# change 1
# add the following below the line utc = pendulum.timezone('UTC') (line 27)
from airflow import configuration as conf
try:
    tz = conf.get("core", "default_timezone")
    if tz == "system":
        utc = pendulum.local_timezone()
    else:
        utc = pendulum.timezone(tz)
except Exception:
    pass
# change 2: in the same file, adjust the body of utcnow()
def utcnow():
    """
    Get the current date and time in UTC
    :return:
    """
    # pendulum utcnow() is not used as that sets a TimezoneInfo object
    # instead of a Timezone. This is not pickable and also creates issues
    # when using replace()
    # d = dt.datetime.utcnow()
    # changed to:
    d = dt.datetime.now()
    d = d.replace(tzinfo=utc)
    return d
# change 3
vim sqlalchemy.py
# add the following below the line utc = pendulum.timezone('UTC') (line 37)
from airflow import configuration as conf
try:
    tz = conf.get("core", "default_timezone")
    if tz == "system":
        utc = pendulum.local_timezone()
    else:
        utc = pendulum.timezone(tz)
except Exception:
    pass
# change 4
# comment out the following line in airflow/utils/sqlalchemy.py (line 124)
# cursor.execute("SET time_zone = '+00:00'")
# change 5
# in airflow/www/templates/admin/master.html (line 31), replace the old line
# var UTCseconds = (x.getTime() + x.getTimezoneOffset()*60*1000);
var UTCseconds = x.getTime();
# change 6: in the same file, replace the old timeFormat
#"timeFormat":"H:i:s %UTC%",
"timeFormat":"H:i:s",
8. Re-initialize airflow
airflow initdb
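This time the metadata tables should land in MySQL rather than in the local airflow.db; a quick sketch to confirm:

import MySQLdb
conn = MySQLdb.connect(host="127.0.0.1", user="airflow",
                       passwd="airflow_123", db="airflow")
cur = conn.cursor()
cur.execute("SHOW TABLES")
tables = {row[0] for row in cur.fetchall()}
print("dag" in tables, "task_instance" in tables)   # expect True True
conn.close()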
9. Run. With CeleryExecutor, three daemons must be started, in this order:
airflow webserver -D
airflow scheduler -D
airflow worker -D
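With all three daemons up, a minimal DAG dropped into $AIRFLOW_HOME/dags exercises the whole chain; the filename and dag_id below are illustrative:

# /usr/local/airflow/dags/hello_dag.py - smoke-test DAG (illustrative)
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    "owner": "airflow",
    "start_date": datetime(2019, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

dag = DAG("hello_dag", default_args=default_args, schedule_interval="@daily")

hello = BashOperator(task_id="say_hello", bash_command="echo hello", dag=dag)

If the scheduler and worker cooperate, the task shows up and succeeds in the web UI.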
10. Verify
10.1 The scheduler does not appear to be running. Last heartbeat was received 45 seconds ago.
The DAGs list may not update, and new tasks will not be scheduled.
Fix:
The scheduler is not up; check its logs. A quick status check is sketched below.
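With -D, the scheduler daemon writes its pid file and logs under AIRFLOW_HOME (airflow-scheduler.pid / airflow-scheduler.err by default; adjust the names if you overrode them). A small sketch to check whether the process is actually alive:

import os

home = os.environ.get("AIRFLOW_HOME", os.path.expanduser("~/airflow"))
pid_file = os.path.join(home, "airflow-scheduler.pid")
if os.path.exists(pid_file):
    pid = int(open(pid_file).read().strip())
    try:
        os.kill(pid, 0)   # signal 0 only tests that the process exists
        print("scheduler running, pid", pid)
    except OSError:
        print("stale pid file - read airflow-scheduler.err for the crash reason")
else:
    print("no pid file - the scheduler was never daemonized")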
10.2 AttributeError: module 'enum' has no attribute 'IntFlag'
pip3 uninstall enum34
10.3 The web UI comes up (http://<host>:8080 by default), confirming the installation succeeded.