Background
Canal can deliver data to the message queue in two formats: JSON, and a binary format of serialized protocol objects.
JSON is straightforward to parse. The binary format requires a custom UDTF that deserializes the bytes and parses them into JSON (FlatMessage) form; a second SQL pass then turns that payload into proper columns and splits each message into independent rows (see the sketch below).
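As a rough sketch of how the two passes fit together (the function name canal_parse, the source table canal_binary_source, and its msg BYTES column are illustrative assumptions, not names taken from this article), the UDTF defined below could be registered and invoked through a LATERAL TABLE join:

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class CanalParseJobSketch {
    public static void main(String[] args) {
        TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());

        // Register the UDTF under an illustrative name (assumed, not prescribed by this article).
        tEnv.createTemporarySystemFunction("canal_parse", CanalParse.class);

        // First pass: deserialize each binary msg into one row per FlatMessage.
        // A second pass would then unnest the `data` array into individual records.
        tEnv.executeSql(
                "SELECT t.`database`, t.`table`, t.`type`, t.`data` " +
                "FROM canal_binary_source, LATERAL TABLE(canal_parse(msg)) AS t").print();
    }
}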
Custom UDTF
Add the Canal dependencies
<dependency>
    <groupId>com.alibaba.otter</groupId>
    <artifactId>canal.client</artifactId>
    <version>1.1.6</version>
</dependency>
<dependency>
    <groupId>com.alibaba.otter</groupId>
    <artifactId>canal.protocol</artifactId>
    <version>1.1.6</version>
</dependency>
Write the custom UDTF
This UDTF converts a Canal binary msg into Canal FlatMessage objects.
import com.alibaba.otter.canal.client.CanalMessageDeserializer;
import com.alibaba.otter.canal.protocol.CanalEntry;
import com.alibaba.otter.canal.protocol.FlatMessage;
import com.alibaba.otter.canal.protocol.Message;
import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
 * Parses a Canal msg.
 * Because some Canal field names are SQL keywords, the field names must be quoted with backticks.
 */
@FunctionHint(output = @DataTypeHint("ROW<" +
        "`database` STRING," +
        "`table` STRING," +
        "`pkNames` ARRAY<STRING>," +
        "`isDdl` BOOLEAN," +
        "`type` STRING," +
        "`es` BIGINT," +
        "`ts` BIGINT," +
        "`sql` STRING," +
        "`sqlType` MAP<STRING,INTEGER>," +
        "`mysqlType` MAP<STRING,STRING>," +
        "`data` ARRAY<MAP<STRING,STRING>>," +
        "`old` ARRAY<MAP<STRING,STRING>>" +
        ">"))
public class CanalParse extends TableFunction<Row> {

    private static final Logger LOG = LoggerFactory.getLogger(CanalParse.class);
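    /**
     * Deserializes the raw Canal binary payload into a protocol Message and
     * flattens it into FlatMessage objects before emitting result rows.
     */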
    public void eval(byte[] msg) {
        try {
            Message deserializer = CanalMessageDeserializer.deserializer(msg);
            List<FlatMessage> flatMessages = parseMessage(deserializer);