cli解析参数
airflow.py:
# Entry point of airflow.py: parse the CLI arguments and dispatch to the
# handler function registered for the chosen sub-command.
if __name__ == '__main__':
    # With Kerberos security enabled, expose the credential cache and the
    # keytab location to the underlying Kerberos libraries via env vars.
    if configuration.get("core", "security") == 'kerberos':
        os.environ['KRB5CCNAME'] = configuration.get('kerberos', 'ccache')
        os.environ['KRB5_KTNAME'] = configuration.get('kerberos', 'keytab')
    # Build the argument parser from the CLI command factory.
    parser = CLIFactory.get_parser()
    # Parse the command line.
    args = parser.parse_args()
    # Run the handler bound to the parsed sub-command.
    args.func(args)
接收的 CLI 命令先在 args 中根据给定命令匹配到指定的 Arg(tuple 数组),再以 Arg 名称去 subparsers 中匹配到相应的执行函数 func。
scheduler()
#cli.py 调度器的执行入口
def scheduler(args):
    """CLI entry point for the Airflow scheduler.

    Builds a ``SchedulerJob`` from the parsed CLI arguments and runs it,
    either detached as a daemon or in the foreground with signal handlers
    installed.
    """
    print(settings.HEADER)
    # Construct the scheduler job from the CLI arguments.
    job = jobs.SchedulerJob(
        dag_id=args.dag_id,
        subdir=process_subdir(args.subdir),
        run_duration=args.run_duration,
        num_runs=args.num_runs,
        do_pickle=args.do_pickle)

    if args.daemon:
        # Resolve pid file, stdout/stderr targets and log file location.
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file)
        log_handle = setup_logging(log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        # DaemonContext detaches the process; the pid is recorded in a
        # lock file and the log handle is kept open across the fork.
        daemon_ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            files_preserve=[log_handle],
            stdout=stdout,
            stderr=stderr,
        )
        with daemon_ctx:
            # Run the scheduler inside the daemonized process.
            job.run()

        stdout.close()
        stderr.close()
    else:
        # Foreground mode: install handlers so SIGINT/SIGTERM stop the
        # scheduler and SIGQUIT dumps diagnostic state.
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        job.run()
Dag
BaseDag和BaseDagBag
# 有向无环图
class BaseDag(object):
    """Abstract base DAG that both SimpleDag and DAG inherit from.

    Declares the minimal read-only interface a DAG implementation must
    provide; every member simply raises ``NotImplementedError``.
    """
    # NOTE(review): py2-style ABC marker, kept verbatim for compatibility
    # (it has no effect under Python 3).
    __metaclass__ = ABCMeta

    @abstractproperty
    def dag_id(self):
        """Identifier of this DAG."""
        raise NotImplementedError()

    @abstractproperty
    def task_ids(self):
        """IDs of the tasks contained in this DAG."""
        raise NotImplementedError()

    @abstractproperty
    def full_filepath(self):
        """Path of the file this DAG was loaded from."""
        raise NotImplementedError()

    @abstractmethod
    def concurrency(self):
        """
        :return: maximum number of tasks that can run simultaneously from
            this DAG
        :rtype: int
        """
        raise NotImplementedError()

    @abstractmethod
    def is_paused(self):
        """Whether this DAG is currently paused."""
        raise NotImplementedError()

    @abstractmethod
    def pickle_id(self):
        """ID of the pickled (serialized) representation of this DAG."""
        raise NotImplementedError
# BaseDagBag表示有向无环图的组合
class BaseDagBag(object):
    """Abstract collection of DAGs that SimpleDagBag and DagBag inherit.

    Defines the lookup interface over a set of DAGs; every member simply
    raises ``NotImplementedError``.
    """

    @abstractproperty
    def dag_ids(self):
        """IDs of the DAGs held in this bag."""
        raise NotImplementedError()

    @abstractmethod
    def get_dag(self, dag_id):
        """Return the DAG identified by ``dag_id``."""
        raise NotImplementedError()
Dag 见代码注释
Operator算子
BaseOperator
算子本身可以是一个dag图,运行时会占用一定资源,以虚拟的 pool_slot 和物理的 resources 表示
pool_slots是含有若干个slot,当并行计算的任务占用全部资源时,则报错
resources表示对于实际资源的需求,比如cpu和内存
超过 SLA 时限后算子仍然没有完成,则报警
触发规则
trigger_rule【all_success | all_failed | all_done | one_success】表示对于上游算子 的依赖;
depends_on_past 表示是否依赖于上次算子运行是否成功
wait_for_downstream表示当且仅当上一次 dagrun 的下游算子完成之后,才能触发该次算子运行
算子与算子之间,不能直接通信
通过 XCom 机制或第三方存储间接传递数据。
算子的具体逻辑实现在execute()中
BashOperator
class BashOperator(BaseOperator):
    """Execute a Bash command.

    The command is written to a temporary script file which is removed
    after execution.  When ``xcom_push`` is True, the last line written
    to stdout is returned from :meth:`execute` and therefore pushed to
    XCom.

    :param bash_command: the command, set of commands, or reference to a
        bash script to execute
    :param xcom_push: if True, the last line of stdout is returned and
        made available to downstream tasks via XCom
    :param env: if not None, a mapping defining the environment variables
        for the subprocess (replacing, not augmenting, the current
        process environment)
    :param output_encoding: encoding used to decode the subprocess output
    """
    template_fields = ('bash_command', 'env')
    template_ext = ('.sh', '.bash',)
    ui_color = '#f0ede4'

    @apply_defaults
    def __init__(
            self,
            bash_command,
            xcom_push=False,
            env=None,
            output_encoding='utf-8',
            *args, **kwargs):
        super(BashOperator, self).__init__(*args, **kwargs)
        self.bash_command = bash_command
        self.env = env
        self.xcom_push_flag = xcom_push
        self.output_encoding = output_encoding

    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        bash_command = self.bash_command
        self.log.info("Tmp dir root location: \n %s", gettempdir())
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
                # Materialize the command as a script inside the temp dir.
                f.write(bytes(bash_command, 'utf_8'))
                f.flush()
                fname = f.name
                # BUGFIX: NamedTemporaryFile.name is already the absolute
                # path of the script; prepending tmp_dir again produced a
                # doubled path in the log message.
                script_location = fname
                self.log.info(
                    "Temporary script location: %s",
                    script_location
                )

                def pre_exec():
                    # Restore default signal dispositions and start a new
                    # session so the whole process group can be signalled
                    # later from on_kill().
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info("Running command: %s", bash_command)
                # Run the script; stderr is merged into stdout so a single
                # stream is read into the task log below.
                sp = Popen(
                    ['bash', fname],
                    stdout=PIPE, stderr=STDOUT,
                    cwd=tmp_dir, env=self.env,
                    preexec_fn=pre_exec)
                # Keep a handle so on_kill() can signal the process group.
                self.sp = sp

                self.log.info("Output:")
                line = ''
                for line in iter(sp.stdout.readline, b''):
                    line = line.decode(self.output_encoding).strip()
                    self.log.info(line)
                sp.wait()
                self.log.info(
                    "Command exited with return code %s",
                    sp.returncode
                )
                if sp.returncode:
                    raise AirflowException("Bash command failed")
        # The last decoded stdout line becomes the XCom value when enabled.
        if self.xcom_push_flag:
            return line

    def on_kill(self):
        """Send SIGTERM to the process group started by execute()."""
        self.log.info('Sending SIGTERM signal to bash process group')
        os.killpg(os.getpgid(self.sp.pid), signal.SIGTERM)