We begin with the _execute method of the SchedulerJob class in jobs.py:
def _execute(self):
    self.log.info("Starting the scheduler")

    # DAGs can be pickled for easier remote execution by some executors
    pickle_dags = False
    if self.do_pickle and self.executor.__class__ not in \
            (executors.LocalExecutor, executors.SequentialExecutor):
        pickle_dags = True

    self.log.info("Running execute loop for %s seconds", self.run_duration)
    self.log.info("Processing each file at most %s times", self.num_runs)

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self.subdir)
    known_file_paths = list_py_file_paths(self.subdir)
    self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)

    # This factory returns the class that actually performs the DAG parsing
    def processor_factory(file_path, zombies):
        return DagFileProcessor(file_path,
                                pickle_dags,
                                self.dag_ids,
                                zombies)

    # When using sqlite, we do not use async_mode
    # so the scheduler job and DAG parser don't access the DB at the same time.
    async_mode = not self.using_sqlite

    # DagFileProcessorAgent is the entry point for DAG parsing
    self.processor_agent = DagFileProcessorAgent(self.subdir,
                                                 known_file_paths,
                                                 self.num_runs,
                                                 processor_factory,
                                                 async_mode)

    try:
        self._execute_helper()
    finally:
        self.processor_agent.end()
        self.log.info("Exited execute loop")
Next, the start() method of the DagFileProcessorAgent class:
"""
Launch DagFileProcessorManager processor and start DAG parsing loop in manager.
"""
self._process = self._launch_process(self._dag_directory,
self._file_paths,
self._max_runs,
self._processor_factory,
self._child_signal_conn,
self._stat_queue,
self._result_queue,
self._async_mode)
self.log.info("Launched DagFileProcessorManager with pid: {}"
.format(self._process.pid))
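start() itself is thin; the interesting part is what it wires together: a signal Pipe plus stat/result Queues connecting the parent (the agent) to the child (the manager) process. A minimal sketch of that plumbing, with purely illustrative names:

import multiprocessing

def manager_loop(signal_conn, result_queue):
    # The child: do one unit of work per pass, and exit when told to.
    while True:
        if signal_conn.poll(0.1):             # short wait for a control signal
            if signal_conn.recv() == "TERMINATE":
                break
        result_queue.put("one unit of parsing work")

if __name__ == "__main__":
    parent_conn, child_conn = multiprocessing.Pipe()
    results = multiprocessing.Queue()
    p = multiprocessing.Process(target=manager_loop,
                                args=(child_conn, results),
                                name="manager")
    p.start()
    print(results.get())                      # parent consumes one result,
    parent_conn.send("TERMINATE")             # then asks the child to stop
    p.join()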
Then the _launch_process method of DagFileProcessorAgent:
@staticmethod
def _launch_process(dag_directory,
                    file_paths,
                    max_runs,
                    processor_factory,
                    signal_conn,
                    _stat_queue,
                    result_queue,
                    async_mode):
    def helper():
        # Reload configurations and settings to avoid collision with parent process.
        # Because this process may need custom configurations that cannot be shared,
        # e.g. RotatingFileHandler. And it can cause connection corruption if we
        # do not recreate the SQLA connection pool.
        os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER'] = 'True'
        # Replicating the behavior of how logging module was loaded
        # in logging_config.py
        reload_module(import_module(logging_class_path.rsplit('.', 1)[0]))
        reload_module(airflow.settings)
        del os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER']
        processor_manager = DagFileProcessorManager(dag_directory,
                                                    file_paths,
                                                    max_runs,
                                                    processor_factory,
                                                    signal_conn,
                                                    _stat_queue,
                                                    result_queue,
                                                    async_mode)
        processor_manager.start()

    p = multiprocessing.Process(target=helper,
                                args=(),
                                name="DagFileProcessorManager")
    p.start()
    return p
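The subtle part of helper() is the reload: a forked child inherits the parent's already-imported modules, so re-executing a settings module's top-level code is how the child gets its own loggers and a fresh SQLAlchemy connection pool. The self-contained sketch below demonstrates the mechanism; it fabricates a throwaway fake_settings module on the fly and is not Airflow code:

import multiprocessing
import os
import sys
import tempfile
from importlib import reload as reload_module

# Create a tiny module whose import-time setup depends on an env var,
# mirroring how helper() flips CONFIG_PROCESSOR_MANAGER_LOGGER before reloading.
tmpdir = tempfile.mkdtemp()
with open(os.path.join(tmpdir, "fake_settings.py"), "w") as f:
    f.write("import os\nMODE = os.environ.get('CHILD_MODE', 'parent')\n")
sys.path.insert(0, tmpdir)

import fake_settings                     # first import: MODE == 'parent'

def child():
    os.environ["CHILD_MODE"] = "child"
    reload_module(fake_settings)         # re-run the module body in this process
    del os.environ["CHILD_MODE"]
    print("child sees MODE =", fake_settings.MODE)          # 'child'

if __name__ == "__main__":
    p = multiprocessing.Process(target=child, name="reload-demo")
    p.start()
    p.join()
    print("parent still sees MODE =", fake_settings.MODE)   # 'parent'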
Finally, the start_in_async method of the DagFileProcessorManager class:
def start_in_async(self):
    """
    Parse DAG files repeatedly in a standalone loop.
    """
    while True:
        loop_start_time = time.time()

        if self._signal_conn.poll():
            agent_signal = self._signal_conn.recv()
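The shape of this loop is: stamp the iteration start time, drain any pending control signal from the agent's end of the pipe, act on it, then do one round of parsing. A standalone sketch of that poll/recv control loop (the DagParsingSignal values here are illustrative stand-ins for Airflow's own enum):

import enum
import multiprocessing
import time

class DagParsingSignal(enum.Enum):
    TERMINATE_MANAGER = "terminate"
    END_MANAGER = "end"

def manager_loop(signal_conn):
    while True:
        loop_start_time = time.time()
        if signal_conn.poll():                 # any control message pending?
            agent_signal = signal_conn.recv()
            if agent_signal == DagParsingSignal.TERMINATE_MANAGER:
                break                          # stop parsing and exit the loop
        # ... one round of DAG file parsing would happen here ...
        elapsed = time.time() - loop_start_time
        time.sleep(max(0, 0.5 - elapsed))      # pace the loop at ~0.5s per pass

if __name__ == "__main__":
    parent_conn, child_conn = multiprocessing.Pipe()
    p = multiprocessing.Process(target=manager_loop, args=(child_conn,),
                                name="manager-loop-demo")
    p.start()
    time.sleep(1)                              # let the loop run a couple of passes
    parent_conn.send(DagParsingSignal.TERMINATE_MANAGER)
    p.join()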