需求:一直以来使用代码提交作业的方式较为繁琐、上手较慢,期望能够像阿里 Blink 那样,以简单的 SQL 方式提交作业实现实时处理。此处使用 Flink 自带的 SQL 客户端进行测试和调试,并以简单的 demo 作为笔录,方便测试和应用。
kafka作为source源
- 单独建一个目录,将如下jar包上传
flink-json-1.12.0.jar
flink-sql-connector-kafka_2.11-1.12.0.jar
flink-connector-jdbc_2.11-1.12.0.jar
mysql-connector-java-5.1.38.jar
- 启动flink
bin/start-cluster.sh
- 启动sql-client
bin/sql-client.sh embedded -l /opt/soft/flink-1.12.0/sql_lib/
- 三种table展示类型,可自行设置
SET execution.result-mode=table;
SET execution.result-mode=tableau;
SET execution.result-mode=changelog;
简单类型json处理
- 创建topic,并控制台上传数据
kafka-topics.sh --create --zookeeper jzy1:2181 --replication-factor 1 --partitions 1 --topic user_behavior
kafka-console-producer.sh --broker-list jzy1:9092 --topic user_behavior
# 以下消息请一条一条发送,注意不要带多余空格或不可见字符,否则数据不规范会导致解析失败
{"user_id":1004080,"item_id":2258662,"category_id":79451,"behavior":"pv","ts":"2017-11-24T23:47:47Z"}
{"user_id":100814,"item_id":5071478,"category_id":1107469,"behavior":"pv","ts":"2017-11-24T23:47:47Z"}
{"user_id":114321,"item_id":4306269,"category_id":4756105,"behavior":"pv","ts":"2017-11-24T23:47:48Z"}
- 创建源表
-- Kafka source table: user clickstream events in flat JSON.
-- proctime is a processing-time attribute; the watermark lets events
-- arrive up to 5 seconds late before ts-based windows fire.
CREATE TABLE user_behavior (
    user_id     BIGINT,
    item_id     BIGINT,
    category_id BIGINT,
    behavior    STRING,
    ts          TIMESTAMP(3),
    proctime AS PROCTIME(),
    WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
) WITH (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'user_behavior',
    'connector.startup-mode' = 'earliest-offset',
    'connector.properties.zookeeper.connect' = '192.168.56.21:2181',
    'connector.properties.bootstrap.servers' = '192.168.56.21:9092',
    'format.type' = 'json'
);
- 查询
select * from user_behavior
结果如下所示
复杂类型json处理
- 创建topic,并控制台上传数据
kafka-topics.sh --create --zookeeper jzy1:2181 --replication-factor 1 --partitions 1 --topic moreJson_test
kafka-console-producer.sh --broker-list jzy1:9092 --topic moreJson_test
{"data":[{"id":"1","name":"za","age":"15"}],"database":"mydemo","es":1619610228000,"id":1,"isDdl":false,"mysqlType":{"id":"int(11)","name":"varchar(16)","age":"int(11)"},"old":null,"pkNames":["id"],"sql":"","sqlType":{"id":4,"name":12,"age":4},"table":"students","ts":1619451364936,"type":"INSERT"}
{"data":[{"id":"1","name":"za","age":"15"}],"database":"mydemo","es":1619611522000,"id":1,"isDdl":false,"mysqlType":{"id":"int(11)","name":"varchar(16)","age":"int(11)"},"old":null,"pkNames":["id"],"sql":"","sqlType":{"id":4,"name":12,"age":4},"table":"students","ts":1619451364936,"type":"DELETE"}
{"data":[{"id":"1","name":"sdf","age":"12"}],"database":"mydemo","es":1619612224000,"id":1,"isDdl":false,"mysqlType":{"id":"int(11)","name":"varchar(16)","age":"int(11)"},"old":null,"pkNames":["id"],"sql":"","sqlType":{"id":4,"name":12,"age":4},"table":"students","ts":1619451364936,"type":"INSERT"}
- 我们只需要 data 中的 id、name、age,以及 database(数据库名)、ts(时间戳)、type(操作类型)这几个字段
创建源表如下
-- Kafka source table for canal-style nested JSON.
-- Only the fields we need are declared; the JSON format maps the "data"
-- array of row objects onto ARRAY<ROW<...>>.
-- Fixes: added the terminating ';' (sql-client does not execute a
-- statement without it) and backtick-quoted `database` to match the
-- existing quoting of `type` and avoid keyword clashes.
CREATE TABLE moreJson_test (
    data ARRAY<ROW<id STRING, name STRING, age STRING>>,
    `database` STRING,
    ts BIGINT,          -- epoch milliseconds, e.g. 1619451364936
    `type` STRING,      -- INSERT / DELETE / ... from canal
    proctime AS PROCTIME()
) WITH (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'moreJson_test',
    'connector.startup-mode' = 'earliest-offset',
    'connector.properties.zookeeper.connect' = '192.168.56.21:2181',
    'connector.properties.bootstrap.servers' = '192.168.56.21:9092',
    'format.type' = 'json'
);
查看对应内容
select data[1].id as id,data[1].name as name,data[1].age as age,database,ts,`type`,proctime from moreJson_test
more
以上json为canal监控mysql的输出内容,较为简单只有Array类型,如下列举其他类型
- Row<Array <Row<>>>类型
"data": {
"one": [{
"content_type": "web adress",
"url": "https://baidu.com"
}],
"two": [{
"content_type": "web adress",
"url": " https://taobao.com"
}]
}
对应
data ROW<one ARRAY<ROW<content_type string,url string>>,two ARRAY<ROW<content_type string,url string>>>
sql 解析
select data.one[1].content_type,data.one[1].url,data.two[1].content_type,data.two[1].url
- map类型
"doublemap": {
"inner_map": {
"key": "content"
}
}
对应
doublemap Map<STRING,Map<STRING,STRING>>
sql解析
select doublemap['inner_map']['key']
从kafka处理并发送到新的kafka
-- 创建源流数据
-- Source stream: same user_behavior Kafka topic as above.
-- Declares a processing-time column plus a 5-second bounded
-- out-of-orderness watermark on the event-time column ts.
CREATE TABLE user_behavior (
    user_id     BIGINT,
    item_id     BIGINT,
    category_id BIGINT,
    behavior    STRING,
    ts          TIMESTAMP(3),
    proctime AS PROCTIME(),
    WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
) WITH (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'user_behavior',
    'connector.startup-mode' = 'earliest-offset',
    'connector.properties.zookeeper.connect' = '192.168.56.21:2181',
    'connector.properties.bootstrap.servers' = '192.168.56.21:9092',
    'format.type' = 'json'
);
-- 创建输出 topic无需手动创建
-- Kafka sink table (the topic is created automatically on first write).
-- Fix: dropped 'connector.startup-mode' — it is a scan/source-only
-- option and has no meaning on a table that is only written to.
CREATE TABLE outputKafka (
    user_id     BIGINT,
    item_id     BIGINT,
    category_id BIGINT,
    behavior    STRING,
    ts          TIMESTAMP(3)
) WITH (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'outputKafka',
    'connector.properties.zookeeper.connect' = '192.168.56.21:2181',
    'connector.properties.bootstrap.servers' = '192.168.56.21:9092',
    'format.type' = 'json'
);
-- 插入
-- Submits a continuous streaming job that copies the source stream
-- into the sink topic; the job keeps running after the client exits.
INSERT INTO outputKafka
SELECT
    user_id,
    item_id,
    category_id,
    behavior,
    ts
FROM user_behavior;
连接mysql
-- JDBC table over MySQL mydemo.userinfos.
-- Fix: 'username varchar' — in Flink SQL a bare VARCHAR means
-- VARCHAR(1), which truncates/rejects real usernames; use STRING
-- (i.e. VARCHAR with maximum length) instead.
-- Also added the terminating ';' so sql-client executes the statement.
CREATE TABLE userinfos (
    userid   INT,
    username STRING,
    birthday DATE
) WITH (
    'connector.type' = 'jdbc',
    'connector.url' = 'jdbc:mysql://192.168.56.21:3306/mydemo',
    'connector.table' = 'userinfos',
    'connector.username' = 'canal',
    'connector.password' = 'canal',
    'connector.write.flush.max-rows' = '1'  -- default is 5000
);
extra
flink sql client 带来了极大的便利。在探索客户端功能的时候发现:所创建的源表、输出表以及 database 库,在退出客户端后并不会被保存;只有执行 insert 语句之后,job 任务才会提交到后台运行,此时退出客户端任务依旧继续运行,但表和库依然不会保存。所以在编排流程的时候,尽量将建表语句在其他地方做好备份,防止退出后一场空。
ps: 以上以sql client作为基本测试demo,便于了解flink sql的基本语法,若需要类似的平台开发,可以参考streamx。