前言
数仓的dim层用于存放业务的维表数据,业务数据一般存放在关系型数据库。维表可用Flink CDC、Maxwell、Canal等方式,通过监听Mysql binlog的方式实现增量同步。
需求
Mysql里有两张表,表结构如下
CREATE TABLE `user_account` (
`id` int NOT NULL AUTO_INCREMENT,
`uid` bigint NOT NULL,
`parent_uid` bigint NOT NULL COMMENT '主帐号uid',
`acc_name` varchar(64) NOT NULL COMMENT '帐号',
`pass` varchar(64) NOT NULL DEFAULT '' COMMENT '密码',
`phone` varchar(64) NOT NULL DEFAULT '' COMMENT '手机号码',
`email` varchar(64) NOT NULL DEFAULT '' COMMENT '邮箱',
`acc_type` tinyint unsigned NOT NULL DEFAULT '0' COMMENT '0:企业 1:个人',
PRIMARY KEY (`id`),
UNIQUE KEY `uk_uid` (`uid`),
UNIQUE KEY `uk_acc` (`acc_name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 COMMENT='用户表';
CREATE TABLE `user_corp` (
`id` int NOT NULL AUTO_INCREMENT,
`acc_uid` bigint NOT NULL COMMENT '帐号uID',
`corp_name` varchar(128) NOT NULL DEFAULT '' COMMENT '公司名称',
`corp_tel` varchar(64) NOT NULL DEFAULT '' COMMENT '固定电话',
PRIMARY KEY (`id`),
UNIQUE KEY `uk_acc_uid` (`acc_uid`),
UNIQUE KEY `uk_corp_name` (`corp_name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 COMMENT='公司表';
-- user_account里有两种用户,通过acc_type来区分(0:企业 1:个人)
-- user_corp里存放的是公司的信息
-- 在业务上,个人用户注册后,user_acc会新增一个用户,acc_type标识为1,user_corp无变化。
-- 企业用户注册后,user_acc会新增一个用户,acc_type标识为0,user_corp也会新增一个用户,acc_uid即为user_account的parent_uid。
现将ua 和 uc两表通过 ua.parent_qid 和 uc.acc_qid 关联,得到详细的用户信息表。表结构如下:
CREATE TABLE dim.dim_user_account_info
(
`uid` String,
`acc_uid` String,
`acc_name` String,
`phone` Nullable(String),
`email` Nullable(String),
`corp_name` Nullable(String),
`corp_tel` Nullable(String),
`acc_type` Int32,
`acc_source` Nullable(String)
)
若为个人用户,则企业相关信息赋空
实现
public static void main(String[] args) throws Exception {
// TODO 1. 环境准备
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(12);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
tableEnv.getConfig().setLocalTimeZone(ZoneId.of("GMT+8"));
// TODO 2. cdc获取mysql维表
SingleOutputStreamOperator<String> cdcStream = env.fromSource(
getMysqlCDCSource(readValue("mysql.host"), readValue("mysql.database"), readValue("mysql.table"),
readValue("mysql.username"), readValue("mysql.password")),
WatermarkStrategy.noWatermarks(),
"MysqlConfigStream");
// TODO 3. 侧输出流
OutputTag<UserAccount> userAccountOutputTag = new OutputTag<UserAccount>("userAccountOutputTag") {
};
OutputTag<UserCorp> userCorpOutputTag = new OutputTag<UserCorp>("userCorpOutputTag") {
};
// TODO 4. 解析数据、获取侧输出流
SingleOutputStreamOperator<String> processOutputStreamOperator = cdcStream.process(
new ParseProcessFunction(
userCorpOutputTag,
userAccountOutputTag))
.uid("ParseProcessFunction_uid").name("ParseProcessFunction");
DataStream<UserAccount> userAccountDataStream = processOutputStreamOperator.getSideOutput(userAccountOutputTag);
DataStream<UserCorp> userCorpDataStream = processOutputStreamOperator.getSideOutput(userCorpOutputTag);
// TODO 5. 将 userAccount 和 userCorp 转化为为 Flink SQL 表
Table userAccount = tableEnv.fromDataStream(userAccountDataStream);
Table userCorp = tableEnv.fromDataStream(userCorpDataStream);
tableEnv.createTemporaryView("userAccount", userAccount);
tableEnv.createTemporaryView("userCorp", userCorp);
// TODO 6. 获取用户数据和企业数据并union
Table userAccountInfo = tableEnv.sqlQuery("SELECT " +
" uid, " +
" parent_uid as account_uid, " +
" acc_name, " +
" phone, " +
" email, " +
" '' as corp_name, " +
" '' as corp_tel, " +
" acc_type, " +
" acc_source " +
"FROM (SELECT * FROM userAccount WHERE account_type = 1) " +
"UNION ALL " +
"SELECT " +
" ua.uid as uid, " +
" ua.parent_uid as acc_uid, " +
" ua.acc_name as acc_name, " +
" ua.phone as phone, " +
" ua.email as email, " +
" uc.corp_name as corp_name, " +
" uc.corp_tel as corp_tel, " +
" ua.acc_type as acc_type, " +
" ua.acc_source as acc_source " +
"FROM (SELECT * FROM userAccount WHERE acc_type = 0) ua " +
" JOIN userCorp uc " +
" ON ua.parent_uid = uc.account_uid");
tableEnv.createTemporaryView("userAccountInfo", userAccountInfo);
// TODO 7. 将数据转化成流并输出至目标数据库
DataStream<UserAccountInfo> userAccountInfoDataStream = tableEnv.toAppendStream(userAccountInfo, UserAccountInfo.class);
userAccountInfoDataStream.addSink(new UserAccountInfoSinkFunction(Constants.PARRAL))
.uid("UserAccountInfoSink_uid").name("UserAccountInfoSink");
env.execute("DimJob");
}
总结
其实这就是个常规的left join,即 ua left join uc on ua.parent_sid = uc.account_sid。
但uc里没有个人用户,关联后个人用户的企业信息为空。flink sql在转化成流这一步会报空指针异常。所以只能先将个人用户和企业用户区分,后使用union合并。
这sql join的方式我觉得最简单明了,还有更高级点的使用api的interval join,通过指定一个时间间隔,在这间隔内新增的数据会join。interval join比较复杂,尝试之后没有得到想要的结果