apache-atlas-hive-bridge Source Code Analysis

Hive Metadata Types

public enum HiveDataTypes {
    // Enums
    HIVE_OBJECT_TYPE,
    HIVE_PRINCIPAL_TYPE,
    HIVE_RESOURCE_TYPE,

    // Structs
    HIVE_SERDE,
    HIVE_ORDER,
    HIVE_RESOURCEURI,

    // Classes
    HIVE_DB,
    HIVE_STORAGEDESC,
    HIVE_TABLE,
    HIVE_COLUMN,
    HIVE_PARTITION,
    HIVE_INDEX,
    HIVE_ROLE,
    HIVE_TYPE,
    HIVE_PROCESS,
    HIVE_COLUMN_LINEAGE,
    HIVE_PROCESS_EXECUTION,
    // HIVE_VIEW,
    ;
    public String getName() {
        return name().toLowerCase();
    }
}
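
Since getName() simply lower-cases the constant's name, each enum constant maps one-to-one onto an Atlas type name. A quick illustration:

// HIVE_DB -> "hive_db", HIVE_TABLE -> "hive_table", HIVE_PROCESS -> "hive_process"
String typeName = HiveDataTypes.HIVE_TABLE.getName();   // "hive_table"

// this is how entities are typed throughout the bridge:
AtlasEntity tableEntity = new AtlasEntity(HiveDataTypes.HIVE_TABLE.getName());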

Metadata Collection

Hive metadata collection consists of two parts:

1. Batch collection: HiveMetaStoreBridge
2. Real-time change collection: HiveHook

This article analyzes the batch-collection path first. The overall flow of a HiveMetaStoreBridge batch import is shown below:

[Figure: HiveMetaStoreBridge batch import flow]

In the registerTable step, the storage descriptor (sd), Hive columns, and partition keys are processed: when the table entity is registered, the storage-descriptor, column, and partition-key entities are registered along with it.

HiveMetaStoreBridge constructor

public HiveMetaStoreBridge(Configuration atlasProperties, HiveConf hiveConf, AtlasClientV2 atlasClientV2) throws Exception {
    // metadata namespace
    this.metadataNamespace          = getMetadataNamespace(atlasProperties);
    // Hive client
    this.hiveClient                 = Hive.get(hiveConf);
    // Atlas client
    this.atlasClientV2              = atlasClientV2;
    // whether to convert HDFS paths to lower case
    this.convertHdfsPathToLowerCase = atlasProperties.getBoolean(HDFS_PATH_CONVERT_TO_LOWER_CASE, false);
}
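
Both values come from atlas-application.properties. A sketch of the relevant entries, with key names assumed from the constants used above and illustrative values:

# read by getMetadataNamespace(...) to qualify entity names (assumed key name)
atlas.metadata.namespace=primary
# switch behind HDFS_PATH_CONVERT_TO_LOWER_CASE (assumed key name)
atlas.hook.hive.hdfs_path.convert_to_lowercase=false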

HiveMetaStoreBridge#main

public static void main(String[] args) {
//...
    try {
        // 1. Preparation: parse the command-line arguments
        Options options = new Options();
        options.addOption("d", "database", true, "Database name");
        options.addOption("t", "table", true, "Table name");
        options.addOption("f", "filename", true, "Filename");
        options.addOption("failOnError", false, "failOnError");

        CommandLine   cmd              = new BasicParser().parse(options, args);
        boolean       failOnError      = cmd.hasOption("failOnError");
        // database to import
        String        databaseToImport = cmd.getOptionValue("d");
        // table to import
        String        tableToImport    = cmd.getOptionValue("t");
        // file listing the databases/tables to import
        String        fileToImport     = cmd.getOptionValue("f");
        // load the Atlas configuration
        Configuration atlasConf        = ApplicationProperties.get();
        String[]      atlasEndpoint    = atlasConf.getStringArray(ATLAS_ENDPOINT);

        if (atlasEndpoint == null || atlasEndpoint.length == 0) {
            atlasEndpoint = new String[] { DEFAULT_ATLAS_URL };
        }

        // initialize the AtlasClientV2 client
        if (!AuthenticationUtil.isKerberosAuthenticationEnabled()) {
            String[] basicAuthUsernamePassword = AuthenticationUtil.getBasicAuthenticationInput();
            atlasClientV2 = new AtlasClientV2(atlasEndpoint, basicAuthUsernamePassword);
        } else {
            UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
            atlasClientV2 = new AtlasClientV2(ugi, ugi.getShortUserName(), atlasEndpoint);
        }

        // 2. Create the HiveMetaStoreBridge
        HiveMetaStoreBridge hiveMetaStoreBridge = new HiveMetaStoreBridge(atlasConf, new HiveConf(), atlasClientV2);

        // 3. Import
        if (StringUtils.isNotEmpty(fileToImport)) {

            File f = new File(fileToImport);

            if (f.exists() && f.canRead()) {
                BufferedReader br   = new BufferedReader(new FileReader(f));
                String         line = null;

                while((line = br.readLine()) != null) {
                    String val[] = line.split(":");
                    if (ArrayUtils.isNotEmpty(val)) {
                        databaseToImport = val[0];
                        if (val.length > 1) {
                            tableToImport = val[1];
                        } else {
                            tableToImport = "";
                        }
                        // import the database/table pair read from the file
                        hiveMetaStoreBridge.importHiveMetadata(databaseToImport, tableToImport, failOnError);
                    }
                }


                exitCode = EXIT_CODE_SUCCESS;
            } else {
                LOG.error("Failed to read the input file: " + fileToImport);
            }
        } else {
            // import the database/table specified on the command line
            hiveMetaStoreBridge.importHiveMetadata(databaseToImport, tableToImport, failOnError);

            exitCode = EXIT_CODE_SUCCESS;
        }
    } catch(ParseException e) {
       //...
    } catch(Exception e) {
        LOG.error("Import failed", e);
    } finally {
       //...
    }
    System.exit(exitCode);
}
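
When -f is used, each line of the input file is split on ":" into a database name and an optional table name, as the loop above shows. A hypothetical input file (database and table names are illustrative):

sales
finance:ledger

The first line imports every table in the sales database; the second imports only the ledger table from the finance database.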

HiveMetaStoreBridge#importDatabases

importDatabases is invoked from importHiveMetadata (the entry point called in main above) and performs the actual import of Hive database and table metadata.

private void importDatabases(boolean failOnError, String databaseToImport, String tableToImport) throws Exception {
    List<String> databaseNames = null;

    // determine which databases to import
    if (StringUtils.isEmpty(databaseToImport) && StringUtils.isEmpty(tableToImport)) {
        //when both database and table to import are empty, import all
        databaseNames = hiveClient.getAllDatabases();
    } else if (StringUtils.isEmpty(databaseToImport) && StringUtils.isNotEmpty(tableToImport)) {
        //when database is empty and table is not, then check table has database name in it and import that db and table
        if (isTableWithDatabaseName(tableToImport)) {
            String val[] = tableToImport.split("\\.");
            if (val.length > 1) {
                databaseToImport = val[0];
                tableToImport = val[1];
            }
            databaseNames = hiveClient.getDatabasesByPattern(databaseToImport);
        } else {
            databaseNames = hiveClient.getAllDatabases();
        }
    } else {
        //when database to import has some value then, import that db and all table under it.
        databaseNames = hiveClient.getDatabasesByPattern(databaseToImport);
    }

    if (!CollectionUtils.isEmpty(databaseNames)) {
        LOG.info("Found {} databases", databaseNames.size());

        // import each database and the tables under it
        for (String databaseName : databaseNames) {
            // register the database entity
            AtlasEntityWithExtInfo dbEntity = registerDatabase(databaseName);

            // import its tables
            if (dbEntity != null) {
                importTables(dbEntity.getEntity(), databaseName, tableToImport, failOnError);
            }
        }
    } else {
        LOG.info("No database found");
    }
}
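
The middle branch relies on isTableWithDatabaseName to decide whether the -t argument already carries a database name. A minimal sketch of what that check amounts to (illustrative, not the verbatim source):

// a table argument such as "sales.orders" embeds its database name,
// so importDatabases can split it into database and table parts
private static boolean isTableWithDatabaseName(String tableToImport) {
    return tableToImport.contains(".");
}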

HiveMetaStoreBridge#registerDatabase

The registerDatabase method registers the database entity in Atlas; toDbEntity builds the database entity object. If the database is already registered, the existing entity is updated instead.

private AtlasEntityWithExtInfo registerDatabase(String databaseName) throws Exception {

    AtlasEntityWithExtInfo ret = null;
    Database               db  = hiveClient.getDatabase(databaseName);
    if (db != null) {
        ret = findDatabase(metadataNamespace, databaseName);

        if (ret == null) {
            // toDbEntity builds the database entity and sets its attributes;
            // registerInstance creates the new entity in Atlas
            ret = registerInstance(new AtlasEntityWithExtInfo(toDbEntity(db)));
        } else {
            LOG.info("Database {} is already registered - id={}. Updating it.", databaseName, ret.getEntity().getGuid());

            ret.setEntity(toDbEntity(db, ret.getEntity()));

            // updateInstance updates the existing entity
            updateInstance(ret);
        }
    }
    return ret;
}
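
Whether findDatabase locates an existing entity hinges on the qualified name. In the Atlas Hive model the hive_db qualified name follows the pattern <dbName>@<metadataNamespace>; a sketch of the lookup key (names illustrative):

// e.g. database "sales" in metadata namespace "primary" -> "sales@primary"
String dbQualifiedName = String.format("%s@%s", databaseName.toLowerCase(), metadataNamespace);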

HiveMetaStoreBridge#importTables

Resolves the list of table names in the database and delegates each one to importTable.

private int importTables(AtlasEntity dbEntity, String databaseName, String tblName, final boolean failOnError) throws Exception {
    int tablesImported = 0;
    final List<String> tableNames;

    // resolve the list of table names
    if (StringUtils.isEmpty(tblName)) {
        tableNames = hiveClient.getAllTables(databaseName);
    } else {
        tableNames = hiveClient.getTablesByPattern(databaseName, tblName);
    }

    if(!CollectionUtils.isEmpty(tableNames)) {
        LOG.info("Found {} tables to import in database {}", tableNames.size(), databaseName);


        try {
            for (String tableName : tableNames) {
// 导入表,委托给importTable
                int imported = importTable(dbEntity, databaseName, tableName, failOnError);
                tablesImported += imported;
            }
        } finally {
            if (tablesImported == tableNames.size()) {
                LOG.info("Successfully imported {} tables from database {}", tablesImported, databaseName);
            } else {
                LOG.error("Imported {} of {} tables from database {}. Please check logs for errors during import", tablesImported, tableNames.size(), databaseName);
            }
        }
    } else {
        LOG.info("No tables to import in database {}", databaseName);
    }

    return tablesImported;
}
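
Note that when a table name is supplied, hiveClient.getTablesByPattern is used, so -t may be a Hive SHOW TABLES-style pattern rather than an exact name. A hypothetical call (names illustrative):

// matches orders, orders_2023, orders_archive, ... in database "sales"
List<String> tableNames = hiveClient.getTablesByPattern("sales", "orders*");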

HiveMetaStoreBridge#importTable

The importTable method registers the table entity and also handles the extra logic needed for external tables.

@VisibleForTesting
public int importTable(AtlasEntity dbEntity, String databaseName, String tableName, final boolean failOnError) throws Exception {

    try {
        Table                  table       = hiveClient.getTable(databaseName, tableName);

        // register the table entity (this calls registerInstance internally)
        AtlasEntityWithExtInfo tableEntity = registerTable(dbEntity, table);

        if (table.getTableType() == TableType.EXTERNAL_TABLE) {
            // extra handling for external tables: register a hive_process
            // entity that links the table's HDFS path to the table itself

            // getTableProcessQualifiedName also accounts for temporary tables
            String                 processQualifiedName = getTableProcessQualifiedName(metadataNamespace, table);
            AtlasEntityWithExtInfo processEntity        = findProcessEntity(processQualifiedName);

            if (processEntity == null) {
                // the process entity does not exist yet
                String      tableLocation = isConvertHdfsPathToLowerCase() ? lower(table.getDataLocation().toString()) : table.getDataLocation().toString();
                String      query         = getCreateTableString(table, tableLocation);

                // entity for the table's HDFS path
                AtlasEntity pathInst      = toHdfsPathEntity(tableLocation);
                AtlasEntity tableInst     = tableEntity.getEntity();

                // the HIVE_PROCESS entity
                AtlasEntity processInst   = new AtlasEntity(HiveDataTypes.HIVE_PROCESS.getName());
                long        now           = System.currentTimeMillis();

                // set the attributes of processInst
                processInst.setAttribute(ATTRIBUTE_QUALIFIED_NAME, processQualifiedName);
                processInst.setAttribute(ATTRIBUTE_NAME, query);
                processInst.setAttribute(ATTRIBUTE_CLUSTER_NAME, metadataNamespace);
                // inputs relationship: the HDFS path feeds the process
                processInst.setRelationshipAttribute(ATTRIBUTE_INPUTS, Collections.singletonList(AtlasTypeUtil.getAtlasRelatedObjectId(pathInst, RELATIONSHIP_DATASET_PROCESS_INPUTS)));
                // outputs relationship: the process produces the table
                processInst.setRelationshipAttribute(ATTRIBUTE_OUTPUTS, Collections.singletonList(AtlasTypeUtil.getAtlasRelatedObjectId(tableInst, RELATIONSHIP_PROCESS_DATASET_OUTPUTS)));
                processInst.setAttribute(ATTRIBUTE_USER_NAME, table.getOwner());
                processInst.setAttribute(ATTRIBUTE_START_TIME, now);
                processInst.setAttribute(ATTRIBUTE_END_TIME, now);
                processInst.setAttribute(ATTRIBUTE_OPERATION_TYPE, "CREATETABLE");
                processInst.setAttribute(ATTRIBUTE_QUERY_TEXT, query);
                processInst.setAttribute(ATTRIBUTE_QUERY_ID, query);
                processInst.setAttribute(ATTRIBUTE_QUERY_PLAN, "{}");
                processInst.setAttribute(ATTRIBUTE_RECENT_QUERIES, Collections.singletonList(query));

                AtlasEntitiesWithExtInfo createTableProcess = new AtlasEntitiesWithExtInfo();

                // bundle the process and HDFS-path entities
                createTableProcess.addEntity(processInst);
                createTableProcess.addEntity(pathInst);

                // register the create-table process instances
                registerInstances(createTableProcess);

            } else {
                LOG.info("Process {} is already registered", processQualifiedName);
            }
        }

        return 1;
    } catch (Exception e) {
        LOG.error("Import failed for hive_table {}", tableName, e);
        if (failOnError) {
            throw e;
        }
        return 0;
    }
}
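
The process qualified name combines the table's qualified name with the table create time, so dropping and recreating an external table yields a distinct lineage process; temporary tables additionally get a session-specific table qualified name. A minimal sketch under those assumptions (illustrative, not the verbatim source):

// e.g. "sales.orders@primary:1700000000" (name and timestamp illustrative)
static String getTableProcessQualifiedName(String metadataNamespace, Table table) {
    String tableQualifiedName = getTableQualifiedName(metadataNamespace, table); // temp tables get a session suffix here
    long   createdTime        = BaseHiveEvent.getTableCreateTime(table);

    return tableQualifiedName + ":" + createdTime;
}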

HiveMetaStoreBridge#registerTable

Registers the table entity: creates it if it is not yet in Atlas, otherwise updates the existing entity.

private AtlasEntityWithExtInfo registerTable(AtlasEntity dbEntity, Table table) throws AtlasHookException {
    try {
        AtlasEntityWithExtInfo ret;
        AtlasEntityWithExtInfo tableEntity = findTableEntity(table);

        if (tableEntity == null) {
            // not yet registered: build the table entity and create it
            tableEntity = toTableEntity(dbEntity, table);

            ret = registerInstance(tableEntity);
        } else {
            LOG.info("Table {}.{} is already registered with id {}. Updating entity.", table.getDbName(), table.getTableName(), tableEntity.getEntity().getGuid());

            // already registered: rebuild the entity and update it in place
            ret = toTableEntity(dbEntity, table, tableEntity);

            updateInstance(ret);
        }

        return ret;
    } catch (Exception e) {
        throw new AtlasHookException("HiveMetaStoreBridge.registerTable() failed.", e);
    }
}

HiveMetaStoreBridge#toTableEntity

The toTableEntity method assembles the table entity: it sets the table's own attributes and its relationship attribute to the database, wires up the associations with the storage descriptor, partition keys, and columns, and returns an AtlasEntityWithExtInfo object.


private AtlasEntityWithExtInfo toTableEntity(AtlasEntity database, final Table hiveTable, AtlasEntityWithExtInfo table) throws AtlasHookException {
    if (table == null) {
        table = new AtlasEntityWithExtInfo(new AtlasEntity(HiveDataTypes.HIVE_TABLE.getName()));
    }

    AtlasEntity tableEntity        = table.getEntity();
    String      tableQualifiedName = getTableQualifiedName(metadataNamespace, hiveTable);
    long        createTime         = BaseHiveEvent.getTableCreateTime(hiveTable);
    long        lastAccessTime     = hiveTable.getLastAccessTime() > 0 ? hiveTable.getLastAccessTime() : createTime;

    // relationship attribute linking tableEntity to the database entity
    tableEntity.setRelationshipAttribute(ATTRIBUTE_DB, AtlasTypeUtil.getAtlasRelatedObjectId(database, RELATIONSHIP_HIVE_TABLE_DB));

    // remaining attributes of tableEntity
    tableEntity.setAttribute(ATTRIBUTE_QUALIFIED_NAME, tableQualifiedName);
    tableEntity.setAttribute(ATTRIBUTE_NAME, hiveTable.getTableName().toLowerCase());
    tableEntity.setAttribute(ATTRIBUTE_OWNER, hiveTable.getOwner());
//...

    // build the storage-descriptor entity
    AtlasEntity       sdEntity = toStorageDescEntity(hiveTable.getSd(), tableQualifiedName, getStorageDescQFName(tableQualifiedName), AtlasTypeUtil.getObjectId(tableEntity));

    // build partition-key column entities (RELATIONSHIP_HIVE_TABLE_PART_KEYS)
    List<AtlasEntity> partKeys = toColumns(hiveTable.getPartitionKeys(), tableEntity, RELATIONSHIP_HIVE_TABLE_PART_KEYS);

    // build column entities (RELATIONSHIP_HIVE_TABLE_COLUMNS)
    List<AtlasEntity> columns  = toColumns(hiveTable.getCols(), tableEntity, RELATIONSHIP_HIVE_TABLE_COLUMNS);

    // set the relationship attributes on the table entity
    tableEntity.setRelationshipAttribute(ATTRIBUTE_STORAGEDESC, AtlasTypeUtil.getAtlasRelatedObjectId(sdEntity, RELATIONSHIP_HIVE_TABLE_STORAGE_DESC));
    tableEntity.setRelationshipAttribute(ATTRIBUTE_PARTITION_KEYS, AtlasTypeUtil.getAtlasRelatedObjectIds(partKeys, RELATIONSHIP_HIVE_TABLE_PART_KEYS));
    tableEntity.setRelationshipAttribute(ATTRIBUTE_COLUMNS, AtlasTypeUtil.getAtlasRelatedObjectIds(columns, RELATIONSHIP_HIVE_TABLE_COLUMNS));

    // attach the database entity as a referred entity
    table.addReferredEntity(database);
    // attach the storage-descriptor entity
    table.addReferredEntity(sdEntity);

    if (partKeys != null) {
        for (AtlasEntity partKey : partKeys) {
            // attach each partition-key entity
            table.addReferredEntity(partKey);
        }
    }

    if (columns != null) {
        for (AtlasEntity column : columns) {
            // attach each column entity
            table.addReferredEntity(column);
        }
    }
    table.setEntity(tableEntity);
    return table;
}
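
toColumns is what turns Hive FieldSchema objects into hive_column entities for both the partition keys and the regular columns. A plausible sketch of its shape (illustrative, not the verbatim source; the "table" relationship attribute and the getColumnQualifiedName convention are assumptions based on the Atlas Hive model):

// build one hive_column entity per field, linked back to the table
private List<AtlasEntity> toColumns(List<FieldSchema> schemaList, AtlasEntity table, String relationshipType) {
    List<AtlasEntity> ret      = new ArrayList<>();
    int               position = 0;

    for (FieldSchema fs : schemaList) {
        AtlasEntity column = new AtlasEntity(HiveDataTypes.HIVE_COLUMN.getName());

        // link the column to its table with the given relationship type
        column.setRelationshipAttribute("table", AtlasTypeUtil.getAtlasRelatedObjectId(table, relationshipType));

        // assumed convention: "<db>.<table>.<column>@<namespace>", e.g. "sales.orders.order_id@primary"
        column.setAttribute(ATTRIBUTE_QUALIFIED_NAME, getColumnQualifiedName((String) table.getAttribute(ATTRIBUTE_QUALIFIED_NAME), fs.getName()));
        column.setAttribute(ATTRIBUTE_NAME, fs.getName());
        column.setAttribute("type", fs.getType());        // hive_column "type" attribute
        column.setAttribute("position", position++);      // ordinal position within the table

        ret.add(column);
    }

    return ret;
}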
