hudi cdc导入模板
写入方式
CDC 数据同步
CDC 数据保存了完整的数据库变更,当前可通过两种途径将数据导入 hudi:
第一种:通过 cdc-connector 直接对接 DB 的 binlog 将数据导入 hudi,优点是不依赖消息队列,缺点是对 db server 造成压力。
第二种:对接 cdc format 消费 kafka 数据导入 hudi,优点是可扩展性强,缺点是依赖 kafka。
注意:如果上游数据无法保证顺序,需要指定 write.precombine.field 字段。
1)准备MySQL表
(1)MySQL开启binlog
(2)建表
-- Source table in MySQL; binlog changes on this table feed the CDC pipeline.
CREATE TABLE stu3 (
    id        INT UNSIGNED AUTO_INCREMENT PRIMARY KEY COMMENT '自增id',
    name      VARCHAR(20) NOT NULL COMMENT '学生名字',
    school    VARCHAR(20) NOT NULL COMMENT '学校名字',
    nickname  VARCHAR(20) NOT NULL COMMENT '学生小名',
    age       INT         NOT NULL COMMENT '学生年龄',
    class_num INT         NOT NULL COMMENT '班级人数',
    phone     BIGINT      NOT NULL COMMENT '电话号码',
    email     VARCHAR(64)          COMMENT '家庭网络邮箱',
    ip        VARCHAR(32)          COMMENT 'IP地址'
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
2)添加flinkCDC的依赖
上传flink连接器依赖到flink的lib目录下:
3)flink读取mysql binlog并写入kafka
(1)创建MySQL表
-- Flink source table reading the MySQL binlog of gmall.stu3 via the
-- mysql-cdc connector. MySQL's INT UNSIGNED auto-increment id maps to BIGINT.
CREATE TABLE stu3_binlog (
    id        BIGINT NOT NULL,
    name      STRING,
    school    STRING,
    nickname  STRING,
    age       INT    NOT NULL,
    class_num INT    NOT NULL,
    phone     BIGINT NOT NULL,
    email     STRING,
    ip        STRING,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector'     = 'mysql-cdc',
    'hostname'      = 'hdp1',
    'port'          = '3306',
    'username'      = 'root',
    'password'      = '',
    'database-name' = 'gmall',
    'table-name'    = 'stu3'
);
(2)创建Kafka表
-- Kafka sink table: upsert-kafka keys messages by the primary key (id),
-- so later changes for the same row compact against earlier ones.
CREATE TABLE stu3_binlog_sink_kafka (
    id        BIGINT NOT NULL,
    name      STRING,
    school    STRING,
    nickname  STRING,
    age       INT    NOT NULL,
    class_num INT    NOT NULL,
    phone     BIGINT NOT NULL,
    email     STRING,
    ip        STRING,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector'                    = 'upsert-kafka',
    'topic'                        = 'cdc_mysql_stu3_sink',
    'properties.zookeeper.connect' = 'hdp1:2181',
    'properties.bootstrap.servers' = 'hdp1:9092',
    'key.format'                   = 'json',
    'value.format'                 = 'json'
);
(3)将mysql binlog日志写入kafka
-- Stream MySQL binlog changes into Kafka.
-- Explicit column lists (instead of SELECT *) so a schema change in either
-- table fails fast at planning time rather than silently misaligning columns.
INSERT INTO stu3_binlog_sink_kafka (id, name, school, nickname, age, class_num, phone, email, ip)
SELECT id, name, school, nickname, age, class_num, phone, email, ip
FROM stu3_binlog;
4)flink读取kafka数据并写入hudi数据湖
(1)创建kafka源表
-- Kafka source table consuming the CDC topic from the earliest offset,
-- decoding each message value as plain JSON.
CREATE TABLE stu3_binlog_source_kafka (
    id        BIGINT NOT NULL,
    name      STRING,
    school    STRING,
    nickname  STRING,
    age       INT    NOT NULL,
    class_num INT    NOT NULL,
    phone     BIGINT NOT NULL,
    email     STRING,
    ip        STRING
) WITH (
    'connector'                    = 'kafka',
    'topic'                        = 'cdc_mysql_stu3_sink',
    'properties.bootstrap.servers' = 'hdp1:9092',
    'format'                       = 'json',
    'scan.startup.mode'            = 'earliest-offset',
    'properties.group.id'          = 'testGroup'
);
(2)创建hudi目标表
-- Hudi target table (MERGE_ON_READ), partitioned by school.
CREATE TABLE stu3_binlog_sink_hudi (
    id        BIGINT NOT NULL,
    name      STRING,
    `school`  STRING,
    nickname  STRING,
    age       INT    NOT NULL,
    class_num INT    NOT NULL,
    phone     BIGINT NOT NULL,
    email     STRING,
    ip        STRING,
    PRIMARY KEY (id) NOT ENFORCED
)
PARTITIONED BY (`school`)
WITH (
    'connector'  = 'hudi',
    'path'       = 'hdfs://hdp1:8020/tmp/hudi_flink/stu3_binlog_sink_hudi',
    'table.type' = 'MERGE_ON_READ',
    -- FIX: the option key is 'write.operation'; the original 'write.option'
    -- is not a recognized Hudi Flink option and was silently ignored.
    -- NOTE(review): for a CDC stream with updates/deletes, 'upsert' (the
    -- Hudi default) is usually required instead of 'insert' — confirm intent.
    'write.operation' = 'insert',
    -- NOTE(review): precombine field is also the partition field here;
    -- 'school' rarely orders conflicting versions of the same id — verify.
    'write.precombine.field' = 'school'
);
(3)将kafka数据写入到hudi中
-- Stream Kafka CDC records into the Hudi table.
-- Explicit column lists (instead of SELECT *) so schema drift between the
-- Kafka source and the Hudi sink surfaces as a planning error, not data skew.
INSERT INTO stu3_binlog_sink_hudi (id, name, `school`, nickname, age, class_num, phone, email, ip)
SELECT id, name, school, nickname, age, class_num, phone, email, ip
FROM stu3_binlog_source_kafka;