LocalTaskJob: the local Job
LocalTaskJob is also built on BaseJob; run_job.run ends up calling the subclass's _execute method:
It serves exactly one task_instance at a time.
Its core attribute is the task_runner; to run the task, the task runner launches a new process.
To terminate the job, you can either change the task instance's state in the database, or change the state of the LocalTaskJob itself.
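As a rough usage sketch (the variable ti is assumed to be an existing TaskInstance, and the import path is the Airflow 1.x one), constructing a LocalTaskJob and calling the inherited run() is what eventually triggers the _execute shown below:

# Hypothetical illustration of driving a LocalTaskJob directly.
from airflow.jobs import LocalTaskJob

job = LocalTaskJob(
    task_instance=ti,        # the single TaskInstance this job serves
    ignore_ti_state=False,   # respect the state currently recorded in the DB
    pool=None,
)
job.run()                    # BaseJob.run -> LocalTaskJob._execute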
class LocalTaskJob(BaseJob):
    __mapper_args__ = {
        'polymorphic_identity': 'LocalTaskJob'
    }

    def __init__(
            self,
            task_instance,
            ignore_all_deps=False,
            ignore_depends_on_past=False,
            ignore_task_deps=False,
            ignore_ti_state=False,
            mark_success=False,
            pickle_id=None,
            pool=None,
            *args, **kwargs):
        self.task_instance = task_instance
        self.ignore_all_deps = ignore_all_deps
        self.ignore_depends_on_past = ignore_depends_on_past
        self.ignore_task_deps = ignore_task_deps
        self.ignore_ti_state = ignore_ti_state
        self.pool = pool
        self.pickle_id = pickle_id
        self.mark_success = mark_success

        # The terminating flag is used so that a job doesn't try to
        # terminate multiple times.
        self.terminating = False

        super(LocalTaskJob, self).__init__(*args, **kwargs)

    def _execute(self):
        # Obtain the task runner for this job.
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Kill-signal handler."""
            self.log.error("Killing subprocess")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")
        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            # Start the inner task, e.g. via BashTaskRunner.
            self.task_runner.start()

            last_heartbeat_time = time.time()
            heartbeat_time_limit = conf.getint('scheduler',
                                               'scheduler_zombie_task_threshold')

            # The parent polls in a loop: it checks the runner's return code
            # and periodically sends heartbeats.
            while True:
                # Monitor the task to see if it's done (may block).
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s", return_code)
                    return

                # Periodically heartbeat so that the scheduler doesn't think this
                # is a zombie (may block).
                try:
                    self.heartbeat()
                    last_heartbeat_time = time.time()
                except OperationalError:
                    Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                    self.log.exception(
                        "Exception while trying to heartbeat! Sleeping for %s seconds",
                        self.heartrate
                    )
                    time.sleep(self.heartrate)

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                time_since_last_heartbeat = time.time() - last_heartbeat_time
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                    self.log.error("Heartbeat time limit exceeded!")
                    raise AirflowException("Time since last heartbeat({:.2f}s) "
                                           "exceeded limit ({}s)."
                                           .format(time_since_last_heartbeat,
                                                   heartbeat_time_limit))
        finally:
            self.on_kill()
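Stripped of the Airflow specifics, the supervision logic in _execute is a plain poll-plus-heartbeat loop. Here is a minimal standalone sketch of the same pattern; the command, the 60-second limit, and send_heartbeat are all made up for illustration:

# Minimal sketch of the poll-and-heartbeat supervision pattern used above.
import subprocess
import time

def send_heartbeat():
    # Placeholder: a real job would update its heartbeat timestamp in the
    # DB here, which can fail (e.g. with an OperationalError).
    pass

proc = subprocess.Popen(['sleep', '5'])   # stands in for task_runner.start()
last_heartbeat = time.time()
HEARTBEAT_LIMIT = 60                      # stands in for scheduler_zombie_task_threshold

while True:
    rc = proc.poll()                      # stands in for task_runner.return_code()
    if rc is not None:
        print("task exited with return code", rc)
        break
    try:
        send_heartbeat()
        last_heartbeat = time.time()
    except Exception:
        pass                              # keep polling; the timeout below is the backstop
    if time.time() - last_heartbeat > HEARTBEAT_LIMIT:
        proc.kill()
        raise RuntimeError("heartbeat limit exceeded")
    time.sleep(1)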
_TASK_RUNNER = configuration.get('core', 'TASK_RUNNER')

def get_task_runner(local_task_job):
    """
    Get the task runner that can be used to run the given job.
    The local task runner is selected via the TASK_RUNNER config field.
    """
    if _TASK_RUNNER == "BashTaskRunner":
        return BashTaskRunner(local_task_job)
    elif _TASK_RUNNER == "CgroupTaskRunner":
        from airflow.contrib.task_runner.cgroup_task_runner import CgroupTaskRunner
        return CgroupTaskRunner(local_task_job)
    else:
        raise AirflowException("Unknown task runner type {}".format(_TASK_RUNNER))
BashTaskRunner
In _execute, the task_runner is started asynchronously; the loop above then monitors the task's return code and sends heartbeats.
class BaseTaskRunner(LoggingMixin):
    """
    Runs Airflow task instances by invoking the `airflow run` command with raw
    mode enabled in a subprocess.

    Mechanism: build an `airflow run` command line, then launch a new process
    to execute it.
    """
    def __init__(self, local_task_job):
        super(BaseTaskRunner, self).__init__(local_task_job.task_instance)
        self._task_instance = local_task_job.task_instance

        popen_prepend = []
        cfg_path = None

        # Determine which user the task should run as.
        if self._task_instance.run_as_user:
            self.run_as_user = self._task_instance.run_as_user
        else:
            try:
                self.run_as_user = conf.get('core', 'default_impersonation')
            except conf.AirflowConfigException:
                self.run_as_user = None

        # Add sudo commands to change user if we need to. Needed to handle SubDagOperator
        # case using a SequentialExecutor. This sets up the sudo prefix and a
        # config file readable by the impersonated user.
        if self.run_as_user and (self.run_as_user != getpass.getuser()):
            self.log.debug("Planning to run as the %s user", self.run_as_user)
            cfg_dict = conf.as_dict(display_sensitive=True)
            cfg_subset = {
                'core': cfg_dict.get('core', {}),
                'smtp': cfg_dict.get('smtp', {}),
                'scheduler': cfg_dict.get('scheduler', {}),
                'webserver': cfg_dict.get('webserver', {}),
            }
            temp_fd, cfg_path = mkstemp()

            # Give ownership of the file to the impersonated user; only they
            # can read and write it.
            subprocess.call(
                ['sudo', 'chown', self.run_as_user, cfg_path]
            )
            subprocess.call(
                ['sudo', 'chmod', '600', cfg_path]
            )
            with os.fdopen(temp_fd, 'w') as temp_file:
                json.dump(cfg_subset, temp_file)

            popen_prepend = ['sudo', '-H', '-u', self.run_as_user]

        self._cfg_path = cfg_path

        # The command that will be run later, e.g.:
        # airflow run <dag_id> <task_id> <execution_date> --local --pool <pool> -sd <python_file>
        # airflow run <dag_id> <task_id> <execution_date> --raw --job_id <job_id> --pool <pool> -sd <python_file>
        self._command = popen_prepend + self._task_instance.command_as_list(
            raw=True,
            pickle_id=local_task_job.pickle_id,
            mark_success=local_task_job.mark_success,
            job_id=local_task_job.id,
            pool=local_task_job.pool,
            cfg_path=cfg_path,
        )
        self.process = None
    def _read_task_logs(self, stream):
        # Forward the subprocess's output into the task log, line by line.
        while True:
            line = stream.readline()
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            if len(line) == 0:
                break
            self.log.info('Subtask: %s', line.rstrip('\n'))
    def run_command(self, run_with, join_args=False):
        """
        Run the task command: start a subprocess, stream its logs, and return
        the process handle.

        :param run_with: list of tokens to run the task command with
            E.g. ['bash', '-c']
        :type run_with: list
        :param join_args: whether to concatenate the list of command tokens
            E.g. ['airflow', 'run'] vs ['airflow run']
        :type join_args: bool
        :return: the process that was run
        :rtype: subprocess.Popen
        """
        cmd = [" ".join(self._command)] if join_args else self._command
        full_cmd = run_with + cmd
        self.log.info('Running: %s', full_cmd)
        proc = subprocess.Popen(
            full_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        )

        # Start daemon thread to read subprocess logging output
        log_reader = threading.Thread(
            target=self._read_task_logs,
            args=(proc.stdout,),
        )
        log_reader.daemon = True
        log_reader.start()
        return proc
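To make join_args concrete, here is a tiny standalone demo (the command tokens are made up) of the two shapes the final subprocess command can take:

# Made-up tokens; shows run_with + cmd with and without join_args.
cmd_tokens = ['airflow', 'run', 'example_dag', 'print_date',
              '2019-01-01T00:00:00', '--raw']
print(['bash', '-c'] + [' '.join(cmd_tokens)])  # join_args=True: one shell string
print(['bash', '-c'] + cmd_tokens)              # join_args=False: separate tokens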
class BashTaskRunner(BaseTaskRunner):
    """
    Runs the raw Airflow task by invoking it through the Bash shell.
    """
    def __init__(self, local_task_job):
        super(BashTaskRunner, self).__init__(local_task_job)

    def start(self):
        # Entry point: launch the task subprocess.
        self.process = self.run_command(['bash', '-c'], join_args=True)

    def return_code(self):
        # Non-blocking poll of the subprocess's exit status.
        return self.process.poll()

    def terminate(self):
        # Kill the whole process tree rooted at the subprocess.
        if self.process and psutil.pid_exists(self.process.pid):
            kill_process_tree(self.log, self.process.pid)

    def on_finish(self):
        # Cleanup callback, delegated to the base class.
        super(BashTaskRunner, self).on_finish()
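terminate delegates to Airflow's kill_process_tree helper. Conceptually it does something like the following psutil-based sketch; this is a simplification, not the actual helper, which also escalates from SIGTERM to SIGKILL with timeouts:

# Simplified sketch of killing a process tree with psutil.
import psutil

def kill_tree(pid):
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    children = root.children(recursive=True)  # collect all descendants
    for proc in children + [root]:
        try:
            proc.terminate()                   # send SIGTERM
        except psutil.NoSuchProcess:
            pass
    psutil.wait_procs(children + [root], timeout=5)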
CgroupTaskRunner
class CgroupTaskRunner(BaseTaskRunner):
Runs the task within a bounded resource environment (a cgroup). This kind of task runner provides resource isolation, so the task is not affected by other tasks on the same machine.
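As a rough illustration of the underlying cgroup-v1 mechanism (the path and the 512 MB limit are made up, and the real CgroupTaskRunner manages cgroups through a library rather than writing these files directly):

# Hand-rolled sketch of cgroup-v1 memory limiting, for illustration only.
import os

def run_in_memory_cgroup(pid, name, limit_bytes=512 * 1024 * 1024):
    cg = os.path.join('/sys/fs/cgroup/memory', name)
    os.makedirs(cg, exist_ok=True)
    with open(os.path.join(cg, 'memory.limit_in_bytes'), 'w') as f:
        f.write(str(limit_bytes))
    # Moving the pid into the cgroup makes the limit apply to that
    # process and everything it forks.
    with open(os.path.join(cg, 'tasks'), 'w') as f:
        f.write(str(pid))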
BackfillJob: the backfill Job
A backfill job is mainly used to run tasks for past dates. Concretely, it creates a number of DagRuns and their TaskInstances, sends the task instances to the executor, sends heartbeats along the way to stay in touch with the outside world, and monitors overall progress by watching state in the database.
See the code for details.
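Backfills are usually triggered from the CLI; for example (the dag id and date range are made up):

# -s/-e are the start and end dates of the interval to backfill.
airflow backfill example_dag -s 2019-01-01 -e 2019-01-07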