openmetadata提取MySQL数据库元数据源码流程剖析

本文链接：https://blog.csdn.net/jy739761380/article/details/145571216

1. 总体描述

openmetadata提取元数据的逻辑都在ingestion模块中，我们在本地启动ingestion模块，来查看分析提取MySQL数据库元数据的逻辑，同时理解openmetadata系统中代码设计的艺术。

2.代码整体流程图

3. 代码解析描述

上述流程中最总要的是逻辑谁source.run(),下面我们详细讲解一下他的逻辑。

Source类源码预览：

class Source(IterStep, ABC):
    """
    Abstract source implementation. The workflow will run
    its next_record and pass them to the next step.
    """

    metadata: OpenMetadata
    connection_obj: Any
    service_connection: Any

    @abstractmethod
    def prepare(self):
        pass

    @abstractmethod
    def test_connection(self) -> None:
        pass

    @property
    def name(self) -> str:
        return "Source"

    @calculate_execution_time_generator(context="Source")
    def run(self) -> Iterable[Optional[Entity]]:
        yield from super().run()

Soure类是一个父类，真正处理数据库服务的类是DatabaseServiceSource，查看一下它的源码

class DatabaseServiceSource(
    TopologyRunnerMixin, Source, ABC
):  # pylint: disable=too-many-public-methods
    """
    Base class for Database Services.
    It implements the topology and context.
    """

    source_config: DatabaseServiceMetadataPipeline
    config: WorkflowSource
    database_source_state: Set = set()
    stored_procedure_source_state: Set = set()
    # Big union of types we want to fetch dynamically
    service_connection: DatabaseConnection.model_fields["config"].annotation

    # When processing the database, the source will update the inspector if needed
    inspector: Inspector

    topology = DatabaseServiceTopology()
    context = TopologyContextManager(topology)

    @property
    def name(self) -> str:
        return self.service_connection.type.name

    def prepare(self):
        """By default, there is no preparation needed"""

    def get_services(self) -> Iterable[WorkflowSource]:
        yield self.config

    def yield_create_request_database_service(
        self, config: WorkflowSource
    ) -> Iterable[Either[CreateDatabaseServiceRequest]]:
        yield Either(
            right=self.metadata.get_create_service_from_source(
                entity=DatabaseService, config=config
            )
        )

    @abstractmethod
    def get_database_names(self) -> Iterable[str]:
        """
        Prepares the database name to be sent to stage.
        Filtering happens here.
        """

    @abstractmethod
    def get_database_schema_names(self) -> Iterable[str]:
        """
        Prepares the database schema name to be sent to stage.
        Filtering happens here.
        """

    @abstractmethod
    def get_tables_name_and_type(self) -> Optional[Iterable[Tuple[str, TableType]]]:
        """
        Prepares the table name to be sent to stage.
        Filtering happens here.
        """

    @abstractmethod
    def yield_database(
        self, database_name: str
    ) -> Iterable[Either[CreateDatabaseRequest]]:
        """
        From topology.
        Prepare a database request and pass it to the sink.

        Also, update the self.inspector value to the current db.
        """

    @abstractmethod
    def yield_database_schema(
        self, schema_name: str
    ) -> Iterable[Either[CreateDatabaseSchemaRequest]]:
        """
        From topology.
        Prepare a database request and pass it to the sink.

        Also, update the self.inspector value to the current db.
        """

    @abstractmethod
    def yield_tag(
        self, schema_name: str
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each schema
        """

    def yield_database_tag(
        self, database_name: str
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each database
        """

    def yield_table_tags(
        self, table_name_and_type: Tuple[str, TableType]
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each table
        """

    def yield_table_tag_details(
        self, table_name_and_type: Tuple[str, TableType]
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each table
        """
        if self.source_config.includeTags:
            yield from self.yield_table_tags(table_name_and_type) or []

    def yield_database_schema_tag_details(
        self, schema_name: str
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each schema
        """
        if self.source_config.includeTags:
            yield from self.yield_tag(schema_name) or []

    def yield_database_tag_details(
        self, database_name: str
    ) -> Iterable[Either[OMetaTagAndClassification]]:
        """
        From topology. To be run for each database
        """
        if self.source_config.includeTags:
            yield from self.yield_database_tag(database_name) or []

    def update_table_constraints(
        self,
        table_name,
        schema_name,
        db_name,
        table_constraints: List[TableConstraint],
        foreign_columns: [],
        columns,
    ) -> List[TableConstraint]:
        """
        process the table constraints of all tables
        transform SQLAlchemy returned foreign_columns into list of TableConstraint.
        """

    @abstractmethod
    def yield_table(
        self, table_name_and_type: Tuple[str, TableType]
    ) -> Iterable[Either[CreateTableRequest]]:
        """
        From topology.
        Prepare a table request and pass it to the sink.

        Also, update the self.inspector value to the current db.
        """

    @abstractmethod
    def get_stored_procedures(self) -> Iterable[Any]:
        """List stored procedures to process"""

    @abstractmethod
    def yield_stored_procedure(
        self, stored_procedure: Any
    ) -> Iterable[Either[CreateStoredProcedureRequest]]:
        """Process the stored procedure information"""

    def get_raw_database_schema_names(self) -> Iterable[str]:
        """
        fetch all schema names without any filtering.
        """
        yield from self.get_database_schema_names()

    def get_tag_by_fqn(self, entity_fqn: str) -> Optional[List[TagLabel]]:
        """
        Pick up the tags registered in the context
        searching by entity FQN
        """

        tag_labels = []
        for tag_and_category in self.context.get().tags or []:
            if tag_and_category.fqn and tag_and_category.fqn.root == entity_fqn:
                tag_label = get_tag_label(
                    metadata=self.metadata,
                    tag_name=tag_and_category.tag_request.name.root,
                    classification_name=tag_and_category.classification_request.name.root,
                )
                if tag_label:
                    tag_labels.append(tag_label)
        return tag_labels or None

    def get_database_tag_labels(self, database_name: str) -> Optional[List[TagLabel]]:
        """
        Method to get schema tags
        This will only get executed if the tags context
        is properly informed
        """
        database_fqn = fqn.build(
            self.metadata,
            entity_type=Database,
            service_name=self.context.get().database_service,
            database_name=database_name,
        )
        return self.get_tag_by_fqn(entity_fqn=database_fqn)

    def get_schema_tag_labels(self, schema_name: str) -> Optional[List[TagLabel]]:
        """
        Method to get schema tags
        This will only get executed if the tags context
        is properly informed
        """
        schema_fqn = fqn.build(
            self.metadata,
            entity_type=DatabaseSchema,
            service_name=self.context.get().database_service,
            database_name=self.context.get().database,
            schema_name=schema_name,
        )
        return self.get_tag_by_fqn(entity_fqn=schema_fqn)

    @calculate_execution_time()
    def get_tag_labels(self, table_name: str) -> Optional[List[TagLabel]]:
        """
        This will only get executed if the tags context
        is properly informed
        """
        table_fqn = fqn.build(
            self.metadata,
            entity_type=Table,
            service_name=self.context.get().database_service,
            database_name=self.context.get().database,
            schema_name=self.context.get().database_schema,
            table_name=table_name,
            skip_es_search=True,
        )
        return self.get_tag_by_fqn(entity_fqn=table_fqn)

    def get_column_tag_labels(
        self, table_name: str, column: dict
    ) -> Optional[List[TagLabel]]:
        """
        This will only get executed if the tags context
        is properly informed
        """
        col_fqn = fqn.build(
            self.metadata,
            entity_type=Column,
            service_name=self.context.get().database_service,
            database_name=self.context.get().database,
            schema_name=self.context.get().database_schema,
            table_name=table_name,
            column_name=column["name"],
        )
        return self.get_tag_by_fqn(entity_fqn=col_fqn)

    @calculate_execution_time()
    def register_record(self, table_request: CreateTableRequest) -> None:
        """
        Mark the table record as scanned and update the database_source_state
        """
        table_fqn = fqn.build(
            self.metadata,
            entity_type=Table,
            service_name=self.context.get().database_service,
            database_name=self.context.get().database,
            schema_name=self.context.get().database_schema,
            table_name=table_request.name.root,
            skip_es_search=True,
        )

        self.database_source_state.add(table_fqn)

    def register_record_stored_proc_request(
        self, stored_proc_request: CreateStoredProcedureRequest
    ) -> None:
        """
        Mark the table record as scanned and update the database_source_state
        """
        table_fqn = fqn.build(
            self.metadata,
            entity_type=StoredProcedure,
            service_name=self.context.get().database_service,
            database_name=self.context.get().database,
            schema_name=self.context.get().database_schema,
            procedure_name=stored_proc_request.name.root,
        )

        self.stored_procedure_source_state.add(table_fqn)

    def _get_filtered_schema_names(
        self, return_fqn: bool = False, add_to_status: bool = True
    ) -> Iterable[str]:
        for schema_name in self.get_raw_database_schema_names():
            schema_fqn = fqn.build(
                self.metadata,
                entity_type=DatabaseSchema,
                service_name=self.context.get().database_service,
                database_name=self.context.get().database,
                schema_name=schema_name,
            )
            if filter_by_schema(
                self.source_config.schemaFilterPattern,
                schema_fqn if self.source_config.useFqnForFiltering else schema_name,
            ):
                if add_to_status:
                    self.status.filter(schema_fqn, "Schema Filtered Out")
                continue
            yield schema_fqn if return_fqn else schema_name

    @calculate_execution_time()
    def get_owner_ref(self, table_name: str) -> Optional[EntityReferenceList]:
        """
        Method to process the table owners
        """
        try:
            if self.source_config.includeOwners and hasattr(
                self.inspector, "get_table_owner"
            ):
                owner_name = self.inspector.get_table_owner(
                    connection=self.connection,  # pylint: disable=no-member
                    table_name=table_name,
                    schema=self.context.get().database_schema,
                )
                owner_ref = self.metadata.get_reference_by_name(
                    name=owner_name, is_owner=True
                )
                return owner_ref
        except Exception as exc:
            logger.debug(traceback.format_exc())
            logger.warning(f"Error processing owner for table {table_name}: {exc}")
        return None

    def mark_tables_as_deleted(self):
        """
        Use the current inspector to mark tables as deleted
        """
        if not self.context.get().__dict__.get("database"):
            raise ValueError(
                "No Database found in the context. We cannot run the table deletion."
            )

        if self.source_config.markDeletedTables:
            logger.info(
                f"Mark Deleted Tables set to True. Processing database [{self.context.get().database}]"
            )
            schema_fqn_list = self._get_filtered_schema_names(
                return_fqn=True, add_to_status=False
            )

            for schema_fqn in schema_fqn_list:
                yield from delete_entity_from_source(
                    metadata=self.metadata,
                    entity_type=Table,
                    entity_source_state=self.database_source_state,
                    mark_deleted_entity=self.source_config.markDeletedTables,
                    params={"database": schema_fqn},
                )

    def mark_stored_procedures_as_deleted(self):
        """
        Use the current inspector to mark Stored Procedures as deleted
        """
        if self.source_config.markDeletedStoredProcedures:
            logger.info(
                f"Mark Deleted Stored Procedures Processing database [{self.context.get().database}]"
            )

            schema_fqn_list = self._get_filtered_schema_names(
                return_fqn=True, add_to_status=False
            )

            for schema_fqn in schema_fqn_list:
                yield from delete_entity_from_source(
                    metadata=self.metadata,
                    entity_type=StoredProcedure,
                    entity_source_state=self.stored_procedure_source_state,
                    mark_deleted_entity=self.source_config.markDeletedStoredProcedures,
                    params={"databaseSchema": schema_fqn},
                )

    def yield_life_cycle_data(self, _) -> Iterable[Either[OMetaLifeCycleData]]:
        """
        Get the life cycle data of the table
        """

    def yield_external_table_lineage(self) -> Iterable[Either[AddLineageRequest]]:
        """
        Process external table lineage
        """

    def yield_table_constraints(self) -> Iterable[Either[AddLineageRequest]]:
        """
        Process remaining table constraints by patching the table
        """

    def test_connection(self) -> None:
        test_connection_fn = get_test_connection_fn(self.service_connection)
        result = test_connection_fn(
            self.metadata, self.connection_obj, self.service_connection
        )
        raise_test_connection_exception(result)

其实source.run()执行的是一个有向无环图，它的拓扑定义在DatabaseServiceTopology类，源码如下

class DatabaseServiceTopology(ServiceTopology):
    """
    Defines the hierarchy in Database Services.
    service -> db -> schema -> table.

    We could have a topology validator. We can only consume
    data that has been produced by any parent node.
    """

    root: Annotated[
        TopologyNode, Field(description="Root node for the topology")
    ] = TopologyNode(
        producer="get_services",
        stages=[
            NodeStage(
                type_=DatabaseService,
                context="database_service",
                processor="yield_create_request_database_service",
                overwrite=False,
                must_return=True,
                cache_entities=True,
            ),
        ],
        children=["database"],
        post_process=[
            "yield_external_table_lineage",
            "yield_table_constraints",
        ],
    )
    database: Annotated[
        TopologyNode, Field(description="Database Node")
    ] = TopologyNode(
        producer="get_database_names",
        stages=[
            NodeStage(
                type_=OMetaTagAndClassification,
                context="tags",
                processor="yield_database_tag_details",
                nullable=True,
                store_all_in_context=True,
            ),
            NodeStage(
                type_=Database,
                context="database",
                processor="yield_database",
                consumer=["database_service"],
                cache_entities=True,
                use_cache=True,
            ),
        ],
        children=["databaseSchema"],
    )
    databaseSchema: Annotated[
        TopologyNode, Field(description="Database Schema Node")
    ] = TopologyNode(
        producer="get_database_schema_names",
        stages=[
            NodeStage(
                type_=OMetaTagAndClassification,
                context="tags",
                processor="yield_database_schema_tag_details",
                nullable=True,
                store_all_in_context=True,
            ),
            NodeStage(
                type_=DatabaseSchema,
                context="database_schema",
                processor="yield_database_schema",
                consumer=["database_service", "database"],
                cache_entities=True,
                use_cache=True,
            ),
        ],
        children=["table", "stored_procedure"],
        post_process=["mark_tables_as_deleted", "mark_stored_procedures_as_deleted"],
        threads=True,
    )
    table: Annotated[
        TopologyNode, Field(description="Main table processing logic")
    ] = TopologyNode(
        producer="get_tables_name_and_type",
        stages=[
            NodeStage(
                type_=OMetaTagAndClassification,
                context="tags",
                processor="yield_table_tag_details",
                nullable=True,
                store_all_in_context=True,
            ),
            NodeStage(
                type_=Table,
                context="table",
                processor="yield_table",
                consumer=["database_service", "database", "database_schema"],
                use_cache=True,
            ),
            NodeStage(
                type_=OMetaLifeCycleData,
                processor="yield_life_cycle_data",
                nullable=True,
            ),
        ],
    )
    stored_procedure: Annotated[
        TopologyNode, Field(description="Stored Procedure Node")
    ] = TopologyNode(
        producer="get_stored_procedures",
        stages=[
            NodeStage(
                type_=StoredProcedure,
                context="stored_procedures",
                processor="yield_stored_procedure",
                consumer=["database_service", "database", "database_schema"],
                store_all_in_context=True,
                store_fqn=True,
                use_cache=True,
            ),
        ],
    )

这个类中定义了数据库相关获取元数据相关执行顺序及逻辑，类里面的每个属性相当于图中的每个节点，里面的processor，就是具体处理逻辑的方法。