Flink获取维表数据实时join

派大星的海洋ku

已于 2024-03-29 10:34:58 修改

阅读量3.7k

点赞数 1

分类专栏： flink 文章标签： flink 大数据 java mysql hive 数据仓库

于 2023-07-17 09:52:24 首次发布

本文链接：https://blog.csdn.net/weixin_44378305/article/details/131758702

版权

flink 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

Flink获取维表数据实时join

前言
需求
实现
总结

前言

数仓的dim层用于存放业务的维表数据，业务数据一般存放在关系型数据库。维表可用Flink CDC、Maxwell、Canal等方式，通过监听Mysql binlog的方式实现增量同步。

需求

Mysql里有两张表，表结构如下

CREATE TABLE `user_account` (
  `id` int NOT NULL AUTO_INCREMENT,
  `uid` bigint NOT NULL,
  `parent_uid` bigint NOT NULL COMMENT '主帐号uid',
  `acc_name` varchar(64)  NOT NULL COMMENT '帐号',
  `pass` varchar(64) NOT NULL DEFAULT '' COMMENT '密码',
  `phone` varchar(64) NOT NULL DEFAULT '' COMMENT '手机号码',
  `email` varchar(64) NOT NULL DEFAULT '' COMMENT '邮箱',
  `acc_type` tinyint unsigned NOT NULL DEFAULT '0' COMMENT '0:企业 1：个人',
  PRIMARY KEY (`id`),
  UNIQUE KEY `uk_uid` (`uid`),
  UNIQUE KEY `uk_acc` (`acc_name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 COMMENT='用户表';


CREATE TABLE `user_corp` (
  `id` int NOT NULL AUTO_INCREMENT,
  `acc_uid` bigint NOT NULL COMMENT '帐号uID',
  `corp_name` varchar(128) NOT NULL DEFAULT '' COMMENT '公司名称',
  `corp_tel` varchar(64) NOT NULL DEFAULT '' COMMENT '固定电话',
  PRIMARY KEY (`id`),
  UNIQUE KEY `uk_acc_uid` (`acc_uid`),
  UNIQUE KEY `uk_corp_name` (`corp_name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 COMMENT='公司表';

-- user_account里有两种用户，通过acc_type来区分（0:企业 1：个人）
-- user_corp里存放的是公司的信息

-- 在业务上，个人用户注册后，user_acc会新增一个用户，acc_type标识为1，user_corp无变化。
-- 企业用户注册后，user_acc会新增一个用户，acc_type标识为0，user_corp也会新增一个用户，acc_uid即为user_account的parent_uid。

现将ua 和 uc两表通过 ua.parent_qid 和 uc.acc_qid 关联，得到详细的用户信息表。表结构如下：

CREATE TABLE dim.dim_user_account_info
(
    `uid` String,
    `acc_uid` String,
    `acc_name` String,
    `phone` Nullable(String),
    `email` Nullable(String),
    `corp_name` Nullable(String),
    `corp_tel` Nullable(String),
    `acc_type` Int32,
    `acc_source` Nullable(String)
)

若为个人用户，则企业相关信息赋空

实现

public static void main(String[] args) throws Exception {

        // TODO 1. 环境准备
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(12);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        tableEnv.getConfig().setLocalTimeZone(ZoneId.of("GMT+8"));

        // TODO 2. cdc获取mysql维表
        SingleOutputStreamOperator<String> cdcStream = env.fromSource(
                getMysqlCDCSource(readValue("mysql.host"), readValue("mysql.database"), readValue("mysql.table"),
                        readValue("mysql.username"), readValue("mysql.password")),
                WatermarkStrategy.noWatermarks(),
                "MysqlConfigStream");

        // TODO 3. 侧输出流
        OutputTag<UserAccount> userAccountOutputTag = new OutputTag<UserAccount>("userAccountOutputTag") {
        };
        OutputTag<UserCorp> userCorpOutputTag = new OutputTag<UserCorp>("userCorpOutputTag") {
        };

        // TODO 4. 解析数据、获取侧输出流
        SingleOutputStreamOperator<String> processOutputStreamOperator = cdcStream.process(
                new ParseProcessFunction(
                        userCorpOutputTag,
                        userAccountOutputTag))
                        .uid("ParseProcessFunction_uid").name("ParseProcessFunction");
        DataStream<UserAccount> userAccountDataStream = processOutputStreamOperator.getSideOutput(userAccountOutputTag);
        DataStream<UserCorp> userCorpDataStream = processOutputStreamOperator.getSideOutput(userCorpOutputTag);


        // TODO 5. 将 userAccount 和 userCorp 转化为为 Flink SQL 表
        Table userAccount = tableEnv.fromDataStream(userAccountDataStream);
        Table userCorp = tableEnv.fromDataStream(userCorpDataStream);
        tableEnv.createTemporaryView("userAccount", userAccount);
        tableEnv.createTemporaryView("userCorp", userCorp);

        // TODO 6. 获取用户数据和企业数据并union
        Table userAccountInfo = tableEnv.sqlQuery("SELECT " +
                "   uid, " +
                "   parent_uid as account_uid, " +
                "   acc_name, " +
                "   phone, " +
                "   email, " +
                "   '' as corp_name, " +
                "   '' as corp_tel, " +
                "   acc_type, " +
                "   acc_source " +
                "FROM (SELECT * FROM userAccount WHERE account_type = 1) " +
                "UNION ALL " +
                "SELECT " +
                "   ua.uid as uid, " +
                "   ua.parent_uid as acc_uid, " +
                "   ua.acc_name as acc_name, " +
                "   ua.phone as phone, " +
                "   ua.email as email, " +
                "   uc.corp_name as corp_name, " +
                "   uc.corp_tel as corp_tel, " +
                "   ua.acc_type as acc_type, " +
                "   ua.acc_source as acc_source " +
                "FROM (SELECT * FROM userAccount WHERE acc_type = 0) ua " +
                "   JOIN userCorp uc " +
                "       ON ua.parent_uid = uc.account_uid");

        tableEnv.createTemporaryView("userAccountInfo", userAccountInfo);

        // TODO 7. 将数据转化成流并输出至目标数据库
        DataStream<UserAccountInfo> userAccountInfoDataStream = tableEnv.toAppendStream(userAccountInfo, UserAccountInfo.class);

        userAccountInfoDataStream.addSink(new UserAccountInfoSinkFunction(Constants.PARRAL))
                .uid("UserAccountInfoSink_uid").name("UserAccountInfoSink");
       
        env.execute("DimJob");
    }

总结

其实这就是个常规的left join，即 ua left join uc on ua.parent_sid = uc.account_sid。
但uc里没有个人用户，关联后个人用户的企业信息为空。flink sql在转化成流这一步会报空指针异常。所以只能先将个人用户和企业用户区分，后使用union合并。
这sql join的方式我觉得最简单明了，还有更高级点的使用api的interval join，通过指定一个时间间隔，在这间隔内新增的数据会join。interval join比较复杂，尝试之后没有得到想要的结果