Flink CDC: MySQL Dimension Table Join
Reference
https://ververica.github.io/flink-cdc-connectors/release-2.1/content/connectors/mysql-cdc.html
Background
Dimension table joins are unavoidable in both real-time and offline scenarios.
Pain point: dimension table updates
·Offline this is manageable: after the lookup table is updated, simply rerun the business logic.
·In streaming/real-time scenarios, however, a lookup-table update means some records fail to match. Before CDC existed, we might write unmatched records to an "unrecognized" partition and run an extra offline job to join them afterwards, which is unfriendly to real-time processing.
Simulated scenario
A stream of student records: student(id, name, age, clazz_id, clazz_name (to be filled in by the join)).
The lookup table lives in MySQL as clazz_table.
The job is written with the DataStream API.
·Join to fill in clazz_name.
·Insert into / delete from clazz_table to simulate lookup-table updates.
·Emit the result (print during verification).
pom file
Prune anything you don't need.
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<flink.version>1.13.1</flink.version>
<scala.version>2.11</scala.version>
<!-- <hive.version>2.1.1</hive.version>-->
<hive.version>2.1.1-cdh6.1.1</hive.version>
<!-- <hadoop.version>3.0.0</hadoop.version>-->
<hadoop.version>3.0.0-cdh6.1.1</hadoop.version>
<log4j.version>2.8.2</log4j.version>
<fastjson.version>1.2.7</fastjson.version>
</properties>
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.19</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
<!-- Apache Flink dependencies -->
<!-- Scopes are left at compile here so the job can run from the IDE; switch them to provided before packaging for a cluster. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.version}</artifactId>
<version>${flink.version}</version>
<exclusions>
<exclusion>
<artifactId>commons-compress</artifactId>
<groupId>org.apache.commons</groupId>
</exclusion>
</exclusions>
<!-- <scope>provided</scope>-->
</dependency>
<!-- Flink SQL dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-runtime-blink -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime-blink_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.networknt/kafka-common -->
<!-- https://mvnrepository.com/artifact/com.datamountaineer/kafka-connect-common -->
<dependency>
<groupId>com.datamountaineer</groupId>
<artifactId>kafka-connect-common</artifactId>
<version>1.1.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.8.0</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<!-- Add connector dependencies here. They must be in the default scope (compile). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-connector-kafka_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hbase-2.2_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_${scala.version}</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<exclusions>
<exclusion>
<artifactId>calcite-avatica</artifactId>
<groupId>org.apache.calcite</groupId>
</exclusion>
<exclusion>
<artifactId>calcite-core</artifactId>
<groupId>org.apache.calcite</groupId>
</exclusion>
<exclusion>
<artifactId>calcite-linq4j</artifactId>
<groupId>org.apache.calcite</groupId>
</exclusion>
<exclusion>
<artifactId>commons-compress</artifactId>
<groupId>org.apache.commons</groupId>
</exclusion>
</exclusions>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<artifactId>commons-compress</artifactId>
<groupId>org.apache.commons</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-orc_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-base</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<!-- Add logging framework, to produce console output when running in the IDE.-->
<!-- These dependencies are excluded from the application JAR by default.-->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.16</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>com.ververica</groupId>
<!-- add the dependency matching your database -->
<artifactId>flink-connector-mysql-cdc</artifactId>
<!-- the dependency is available only for stable releases. -->
<version>2.1.0</version>
</dependency>
</dependencies>
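Note: flink-connector-mysql-cdc is the DataStream-API artifact used in this post; a pure SQL job would instead pull in the shaded fat jar flink-sql-connector-mysql-cdc, as described in the connector documentation linked above.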
Code
Custom data source
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

import java.io.Serializable;

@Data
@NoArgsConstructor
@AllArgsConstructor
@ToString
public class Student implements Serializable {
    private String id;
    private String name;
    private String age;
    private String clazz;     // clazz_id; the JSON key emitted by StudentSource must also be "clazz" so fastjson can populate this field
    private String clazzName; // filled in by the broadcast join
}
StudentSource
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
/**
* @Author: GW00234399
* @Description: sth todo
*/
public class StudentSource extends RichSourceFunction<String> {
    private volatile boolean isRunning; // volatile: cancel() is called from a different thread
    private Random random;
    private String str;
    private List<String> clazzList;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        random = new Random();
        str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        clazzList = Arrays.asList("1", "2", "3", "4", "5");
        isRunning = true;
    }

    @Override
    public void run(SourceContext<String> sourceContext) throws Exception {
        while (isRunning) {
            final int id = random.nextInt(9999);
            // random 10-character name
            StringBuilder nameSb = new StringBuilder();
            for (int j = 0; j < 10; j++) {
                nameSb.append(str.charAt(random.nextInt(str.length())));
            }
            String age = random.nextInt(30) + "";
            String clazzNum = clazzList.get(random.nextInt(clazzList.size()));
            // the JSON key "clazz" must match the field name in Student for fastjson to map it
            sourceContext.collect(String.format("{\"id\":\"%s\"," +
                    "\"name\":\"%s\"," +
                    "\"age\":\"%s\"," +
                    "\"clazz\":\"%s\"," +
                    "\"clazzName\":\"%s\"," +
                    "\"uploadTime\":\"%s\"" +
                    "}", id + "", nameSb.toString(), age, clazzNum, "", System.currentTimeMillis()));
            // throttle for debugging
            TimeUnit.SECONDS.sleep(1);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
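Before wiring the source into the job, it is worth sanity-checking that fastjson maps the emitted JSON onto the Student POJO. A minimal throwaway check (the class name StudentParseCheck is hypothetical, not part of the original post):

import com.alibaba.fastjson.JSONObject;
import com.gwm.entity.Student;

public class StudentParseCheck {
    public static void main(String[] args) {
        String json = "{\"id\":\"1\",\"name\":\"abc\",\"age\":\"20\"," +
                "\"clazz\":\"3\",\"clazzName\":\"\",\"uploadTime\":\"0\"}";
        // "clazz" matches Student.clazz; "uploadTime" has no matching field and fastjson drops it silently
        Student student = JSONObject.parseObject(json, Student.class);
        System.out.println(student); // Student(id=1, name=abc, age=20, clazz=3, clazzName=)
    }
}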
Business code
The lookup table here is small, so it is kept in broadcast state; for a large table, consider a multi-level cache instead (a sketch follows the job code below).
import com.alibaba.fastjson.JSONObject;
import com.gwm.entity.Student;
import com.gwm.utils.MyDeseUtil;
import com.gwm.utils.StudentSource;
import com.ververica.cdc.connectors.mysql.source.MySqlSource;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;

import java.util.Properties;
/**
* @Author: Jhon_yh
* @Description: sth todo
*/
public class FlinkCDCMysql {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        /*
         * FAQ: https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(%E4%B8%AD%E6%96%87)
         * Q: With CDC 2.x, only the full snapshot is read and no incremental (binlog) data arrives. Why?
         * A: Checkpointing must be enabled; in SQL: SET 'execution.checkpointing.interval' = '3s';
         */
        env.enableCheckpointing(3000);

        // StudentSource emits JSON strings; map them to Student POJOs with fastjson
        final SingleOutputStreamOperator<Student> studentDS = env
                .addSource(new StudentSource())
                .map(new MapFunction<String, Student>() {
                    @Override
                    public Student map(String value) throws Exception {
                        return JSONObject.parseObject(value, Student.class);
                    }
                });

        Properties debeziumProperties = new Properties();
        debeziumProperties.put("snapshot.locking.mode", "none");
        final MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
                .hostname("") // fill in your MySQL host
                .port(3306)
                .databaseList("test") // set captured database
                .tableList("test.clazz_table") // set captured table
                .username("root")
                .password("123456")
                // .deserializer(new StringDebeziumDeserializationSchema()) // default option: SourceRecord#toString
                .deserializer(new MyDeseUtil()) // converts SourceRecord to a trimmed JSON string
                .debeziumProperties(debeziumProperties)
                .build();
        final DataStreamSource<String> dimSource = env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "mysql_source");
        final BroadcastStream<String> dimBc = dimSource.broadcast(
                new MapStateDescriptor<>("dim_broadcast", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO));

        final SingleOutputStreamOperator<Student> processDs = studentDS.connect(dimBc)
                .process(new BroadcastProcessFunction<Student, String, Student>() {
            MapStateDescriptor<String, String> mapStateDescriptor;

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                // must describe the same state that was used to create the broadcast stream
                mapStateDescriptor = new MapStateDescriptor<>("dim_broadcast",
                        BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
            }

            @Override
            public void processElement(Student student, ReadOnlyContext ctx, Collector<Student> out) throws Exception {
                final ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
                if (broadcastState.contains(student.getClazz())) {
                    student.setClazzName(broadcastState.get(student.getClazz()));
                } else {
                    student.setClazzName("未识别"); // "unrecognized": no matching dim row (yet)
                }
                out.collect(student);
            }

            @Override
            public void processBroadcastElement(String value, Context ctx, Collector<Student> out) throws Exception {
                final BroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
                final JSONObject jsonObject = JSONObject.parseObject(value);
                // sample records produced by MyDeseUtil:
                // {"before":{"name":"dawang","id":"7","age":"22"},"dbName":"test","after":{},"operation":"DELETE","tableName":"student"}
                // {"before":{},"dbName":"test","after":{"name":"dawang","id":"7","age":"33"},"operation":"READ","tableName":"student"}
                // {"before":{"name":"dawang","id":"7","age":"22"},"dbName":"test","after":{"name":"dawang","id":"7","age":"44"},"operation":"UPDATE","tableName":"student"}
                final JSONObject after = jsonObject.getJSONObject("after");
                final JSONObject before = jsonObject.getJSONObject("before");
                final String beforeClazzId = before.getString("clazz_id");
                final String clazzId = after.getString("clazz_id");
                final String clazzName = after.getString("clazz_name");
                final String operation = jsonObject.getString("operation");
                switch (operation) {
                    case "DELETE":
                        System.out.println("removing: " + beforeClazzId);
                        broadcastState.remove(beforeClazzId);
                        break;
                    case "READ":   // initial snapshot
                    case "UPDATE":
                    default:       // CREATE (insert) also lands here
                        broadcastState.put(clazzId, clazzName);
                }
            }
        });

        processDs.print("process: ");
        // dimSource.print("dim");
        // studentDS.print("student: ");
        env.execute("student");
    }
}
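Broadcast state keeps the whole lookup table in memory on every task, which stops scaling at some point. As a rough illustration of the multi-level cache mentioned above, here is a minimal sketch (not from the original post; the class name, JDBC URL, and credentials are placeholders): level 1 is a process-local LRU map, level 3 is MySQL via plain JDBC, and a distributed cache such as Redis would slot in between as level 2.

import com.gwm.entity.Student;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.LinkedHashMap;
import java.util.Map;

public class CachedClazzLookup extends RichMapFunction<Student, Student> {
    private transient Map<String, String> localCache; // level 1: per-task LRU
    private transient Connection connection;          // level 3: MySQL

    @Override
    public void open(Configuration parameters) throws Exception {
        // access-ordered LinkedHashMap doubling as a bounded LRU cache
        localCache = new LinkedHashMap<String, String>(1024, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
                return size() > 10_000; // cap the cache
            }
        };
        connection = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/test", "root", "123456"); // placeholders
    }

    @Override
    public Student map(Student student) throws Exception {
        String clazzName = localCache.get(student.getClazz());
        if (clazzName == null) {
            // level 2 (e.g. Redis) would be consulted here before hitting MySQL
            try (PreparedStatement ps = connection.prepareStatement(
                    "SELECT clazz_name FROM clazz_table WHERE clazz_id = ?")) {
                ps.setString(1, student.getClazz());
                try (ResultSet rs = ps.executeQuery()) {
                    clazzName = rs.next() ? rs.getString(1) : "未识别";
                }
            }
            localCache.put(student.getClazz(), clazzName);
        }
        student.setClazzName(clazzName);
        return student;
    }

    @Override
    public void close() throws Exception {
        if (connection != null) {
            connection.close();
        }
    }
}

One caveat with this sketch: it also caches misses ("未识别"), so a dimension row that arrives later stays invisible until the cached entry is evicted; a real implementation would give misses a short TTL, or combine the cache with the CDC stream above to invalidate entries.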
Custom deserialization
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import io.debezium.data.Envelope;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

import java.util.List;
/**
* @Author: Jhon_yh
* @Description: sth todo
*/
public class MyDeseUtil implements DebeziumDeserializationSchema<String> {
    @Override
    public void deserialize(SourceRecord record, Collector<String> out) throws Exception {
        JSONObject jsonObject = new JSONObject();
        // topic format: mysql_binlog_source.test.student
        final String topic = record.topic();
        final String[] splits = topic.split("\\.");
        String dbName = splits[1];
        String tableName = splits[2];
        jsonObject.put("dbName", dbName);
        jsonObject.put("tableName", tableName);
        final Struct value = (Struct) record.value();
        final JSONObject before = supplyJSONObject(value, "before");
        final JSONObject after = supplyJSONObject(value, "after");
        jsonObject.put("before", before);
        jsonObject.put("after", after);
        // get the operation type (READ / CREATE / UPDATE / DELETE)
        final Envelope.Operation operation = Envelope.operationFor(record);
        jsonObject.put("operation", operation.name());
        out.collect(jsonObject.toJSONString());
    }

    @Override
    public TypeInformation<String> getProducedType() {
        return BasicTypeInfo.STRING_TYPE_INFO;
    }

    private JSONObject supplyJSONObject(Struct struct, String structName) {
        JSONObject jsonObject = new JSONObject();
        final Struct udfStruct = struct.getStruct(structName);
        if (null != udfStruct) {
            final List<Field> fields = udfStruct.schema().fields();
            // NOTE: getString assumes every captured column is string-typed, which holds for clazz_table
            fields.forEach(field -> jsonObject.put(field.name(), udfStruct.getString(field.name())));
        }
        return jsonObject;
    }
}
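For reference, flink-connector-mysql-cdc 2.x also ships com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema, which emits the complete Debezium change record as JSON; the custom schema above exists only to trim the record down to dbName/tableName/before/after/operation.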
table_sql
CREATE TABLE `clazz_table` (
`clazz_id` varchar(11) NOT NULL,
`clazz_name` varchar(255) NOT NULL,
PRIMARY KEY (`clazz_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
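To reproduce the run below, the table can be seeded with classes 1–4 only, so that clazz_id 5 initially has no match (these seed rows are illustrative, not from the original post):

INSERT INTO clazz_table VALUES
('1', '一班'),
('2', '二班'),
('3', '三班'),
('4', '四班');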
Run results (screenshots)
clazz_id 5 has no match at first, so its clazzName prints as 未识别.
Then add the row for clazz_id 5 and delete it again to watch the broadcast state update.
MySQL statements:
INSERT INTO clazz_table VALUES ('5', '五班');
DELETE FROM clazz_table WHERE clazz_id = '5';
-- Test the UPDATE path yourself, e.g.:
-- TODO test: UPDATE clazz_table SET clazz_name = 'x班' WHERE clazz_id = '5';