We begin with the _execute method of the SchedulerJob class in jobs.py:
def _execute(self):
    self.log.info("Starting the scheduler")

    # DAGs can be pickled for easier remote execution by some executors
    pickle_dags = False
    if self.do_pickle and self.executor.__class__ not in \
            (executors.LocalExecutor, executors.SequentialExecutor):
        pickle_dags = True

    self.log.info("Running execute loop for %s seconds", self.run_duration)
    self.log.info("Processing each file at most %s times", self.num_runs)

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self.subdir)
    known_file_paths = list_py_file_paths(self.subdir)
    self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)

    # This factory returns the class that actually performs the DAG parsing
    def processor_factory(file_path, zombies):
        return DagFileProcessor(file_path,
                                pickle_dags,
                                self.dag_ids,
                                zombies)

    # When using sqlite, we do not use async_mode
    # so the scheduler job and DAG parser don't access the DB at the same time.
    async_mode = not self.using_sqlite

    # DagFileProcessorAgent is the entry point for DAG parsing
    self.processor_agent = DagFileProcessorAgent(self.subdir,
                                                 known_file_paths,
                                                 self.num_runs,
                                                 processor_factory,
                                                 async_mode)

    try:
        self._execute_helper()
    finally:
        self.processor_agent.end()
        self.log.info("Exited execute loop")
Next, the start() method of the DagFileProcessorAgent class:
"""
Launch DagFileProcessorManager processor and start DAG parsing loop in manager.
"""
self._process = self._launch_process(self._dag_directory,
self._file_paths,
self._max_runs,
self._processor_factory,
self._child_signal_conn,
self._stat_queue,
self._result_queue,
self._async_mode)
self.log.info("Launched DagFileProcessorManager with pid: {}"
.format(self._process.pid))
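start() itself is thin; the interesting part is what it wires together: a signal Pipe plus stat/result Queues connecting the parent (the agent) to the child (the manager) process. A minimal sketch of that plumbing, with purely illustrative names:

import multiprocessing

def manager_loop(signal_conn, result_queue):
    # The child: do one unit of work per pass, and exit when told to.
    while True:
        if signal_conn.poll(0.1):             # short wait for a control signal
            if signal_conn.recv() == "TERMINATE":
                break
        result_queue.put("one unit of parsing work")

if __name__ == "__main__":
    parent_conn, child_conn = multiprocessing.Pipe()
    results = multiprocessing.Queue()
    p = multiprocessing.Process(target=manager_loop,
                                args=(child_conn, results),
                                name="manager")
    p.start()
    print(results.get())                      # parent consumes one result,
    parent_conn.send("TERMINATE")             # then asks the child to stop
    p.join()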
Then the _launch_process method of DagFileProcessorAgent:
@staticmethod
def _launch_process(dag_directory,
                    file_paths,
                    max_runs,
                    processor_factory,
                    signal_conn,
                    _stat_queue,
                    result_queue,
                    async_mode):
    def helper():
        # Reload configurations and settings to avoid collision with parent process.
        # Because this process may need custom configurations that cannot be shared,
        # e.g. RotatingFileHandler. And it can cause connection corruption if we
        # do not recreate the SQLA connection pool.
        os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER'] = 'True'
        # Replicating the behavior of how logging module was loaded
        # in logging_config.py
        reload_module(import_module(logging_class_path.rsplit('.', 1)[0]))
        reload_module(airflow.settings)
        del os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER']
        processor_manager = DagFileProcessorManager(dag_directory,
                                                    file_paths,
                                                    max_runs,
                                                    processor_factory,
                                                    signal_conn,
                                                    _stat_queue,
                                                    result_queue,
                                                    async_mode)
        processor_manager.start()

    p = multiprocessing.Process(target=helper,
                                args=(),
                                name="DagFileProcessorManager")
    p.start()
    return p
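The subtle part of helper() is the reload: a forked child inherits the parent's already-imported modules, so re-executing a settings module's top-level code is how the child gets its own loggers and a fresh SQLAlchemy connection pool. The self-contained sketch below demonstrates the mechanism; it fabricates a throwaway fake_settings module on the fly and is not Airflow code:

import multiprocessing
import os
import sys
import tempfile
from importlib import reload as reload_module

# Create a tiny module whose import-time setup depends on an env var,
# mirroring how helper() flips CONFIG_PROCESSOR_MANAGER_LOGGER before reloading.
tmpdir = tempfile.mkdtemp()
with open(os.path.join(tmpdir, "fake_settings.py"), "w") as f:
    f.write("import os\nMODE = os.environ.get('CHILD_MODE', 'parent')\n")
sys.path.insert(0, tmpdir)

import fake_settings                     # first import: MODE == 'parent'

def child():
    os.environ["CHILD_MODE"] = "child"
    reload_module(fake_settings)         # re-run the module body in this process
    del os.environ["CHILD_MODE"]
    print("child sees MODE =", fake_settings.MODE)          # 'child'

if __name__ == "__main__":
    p = multiprocessing.Process(target=child, name="reload-demo")
    p.start()
    p.join()
    print("parent still sees MODE =", fake_settings.MODE)   # 'parent'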
Finally, the start_in_async method of the DagFileProcessorManager class:
def start_in_async(self):
    """
    Parse DAG files repeatedly in a standalone loop.
    """
    while True:
        loop_start_time = time.time()

        if self._signal_conn.poll():
            agent_signal = self._signal_conn.recv()
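The shape of this loop is: stamp the iteration start time, drain any pending control signal from the agent's end of the pipe, act on it, then do one round of parsing. A standalone sketch of that poll/recv control loop (the DagParsingSignal values here are illustrative stand-ins for Airflow's own enum):

import enum
import multiprocessing
import time

class DagParsingSignal(enum.Enum):
    TERMINATE_MANAGER = "terminate"
    END_MANAGER = "end"

def manager_loop(signal_conn):
    while True:
        loop_start_time = time.time()
        if signal_conn.poll():                 # any control message pending?
            agent_signal = signal_conn.recv()
            if agent_signal == DagParsingSignal.TERMINATE_MANAGER:
                break                          # stop parsing and exit the loop
        # ... one round of DAG file parsing would happen here ...
        elapsed = time.time() - loop_start_time
        time.sleep(max(0, 0.5 - elapsed))      # pace the loop at ~0.5s per pass

if __name__ == "__main__":
    parent_conn, child_conn = multiprocessing.Pipe()
    p = multiprocessing.Process(target=manager_loop, args=(child_conn,),
                                name="manager-loop-demo")
    p.start()
    time.sleep(1)                              # let the loop run a couple of passes
    parent_conn.send(DagParsingSignal.TERMINATE_MANAGER)
    p.join()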