双流join的三种方式: regular join: 该Join方式需要去保留两个流的状态,持续性地保留并且不会去做清除; 两边的数据对于对方的流都是所有可见的,所以数据就需要持续性的存在State里面,而State不能存的过大,因此这个场景的只适合有界数据流 regular join支持update的和append的流 interval join: Interval Join根据右流相对左流偏移的时间区间(interval)作为关联窗口,在偏移区间窗口中完成join操作 同时支持processing time和even time定义时间: 若使用processing time,则flink会自动使用系统时间划分窗口,并自动做相关state的清理; 若使用even time,则flink会用watermark机制划分窗口,并作state清理 仅支持append-only的流 window join: 将两个流中有相同key并处在相同window里的元素去做Join;必须同时满足Join key相同,且在同一个Window里元素才能够在最终结果中输出
环境初始化:
from datetime import timedelta
from pyflink.datastream import StreamExecutionEnvironment, CheckpointingMode
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
'''
flink双流join+维表join示例,flink1.13.3
'''
settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(2)
env.enable_checkpointing(3600000)
env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_checkpoint_timeout(600000)
t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '100m')
t_env.get_config().set_python_executable("python3")
# regular join 时须指定state过期时间,否则state不会自动清除,会造成状态爆炸
# 两参数分别为state状态的最小保存时间和最大保存时间
t_env.get_config().set_idle_state_retention_time(timedelta(hours=12), timedelta(hours=24))
# 另一种指定state过期时间的方式
# t_env.get_config().get_configuration().set_string("table.exec.state.ttl",'1h')
初始化各source ddl:
# 订单主表
source_ddl_main = """
create table order_main_source(
order_id STRING NOT NULL, --订单号
order_name STRING, --订单名
order_type STRING, --订单类型
order_time TIMESTAMP(3), --订单时间
is_del STRING, --删除标记
--时间窗口watermark,regular join中不要存在 WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTES,
proctime as PROCTIME(), --flink处理时间,暂不用
PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = '192.168.1.132',
'port' = '3306',
'username' = 'cdc',
'password' = '###',
'database-name' = 'test',
'table-name' = 'order_main'
)
"""
# 订单明细表
source_ddl_detail = """
create table order_detail_source(
detail_id STRING NOT NULL, ---订单明细id
order_id STRING, ---订单ID
product_id STRING, ---商品ID
num INT, ---商品数量
amt DECIMAL(10, 2), ---商品总价
order_time TIMESTAMP(3), ---订单时间
--时间窗口watermark,regular join中不要存在 WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTE,
proctime as PROCTIME(), --flink处理时间,暂不用
PRIMARY KEY (detail_id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = '192.168.1.132',
'port' = '3306',
'username' = 'cdc',
'password' = '###',
'database-name' = 'test',
'table-name' = 'order_detail'
)
"""
# 商品维表
# jdbc连mysql情况下,注意mysql别用tinyint,转换会出问题
source_ddl_products = """
CREATE TABLE products_source (
product_id STRING,
product_name STRING,
product_type STRING,
unit_price DECIMAL(10,2),
is_del INT,
PRIMARY KEY(product_id) NOT ENFORCED
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://192.168.1.132:3306/test',
'username' = 'cdc',
'password' = '###',
'table-name' = 'products'
)
"""
事实表join(regular 和 interval):
# regular join
# 将交易表和交易明细关联,生成view
opt0 = """
create view order_dr as
select order_id,
date_format(order_time,'yyyy-MM-dd') as order_date,
order_time, --订单时间
PROCTIME() as proc_time,
order_type,product_id,num,amt
from
(select m.order_id as order_id,
m.order_time as order_time,
m.order_type as order_type,
d.product_id as product_id,
d.num as num,d.amt as amt,
m.is_del as is_del
from order_detail_source d left join order_main_source m
on m.order_id = d.order_id) t1
where is_del = '0'
"""
# 若order_detail_source和order_main_source是append-only流,也可采用如下interval join
# interval join 需要源表中带watermark
opt1 = """
create view order_dr as
select order_id,
date_format(order_time,'yyyy-MM-dd') as order_date,
order_type,product_id,num,amt
from
(select m.order_id as order_id,
m.order_time as order_time,
m.order_type as order_type,
d.product_id as product_id,
d.num as num,d.amt as amt,
m.is_del as is_del
from order_detail_source d left join order_main_source m
on m.order_id = d.order_id
and m.order_time between d.order_time and d.order_time + INTERVAL '3' MINUTE) t1
where is_del = '0'
"""
sink的定义:
# sink表定义
sink_ddl = """
CREATE TABLE order_result_sink (
order_date STRING,
product_type STRING,
num INT,
amt DECIMAL(10,3),
PRIMARY KEY (order_date,product_type) NOT ENFORCED ---须定义主键,以便实现upsert效果
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://192.168.1.132:3306/test',
'driver' = 'com.mysql.cj.jdbc.Driver',
'table-name' = 'order_result',
'username' = 'cdc',
'password' = '###'
)
"""
关联维表、聚合、并sink到指定表:
# 事实表order_dr与当前状态的商品维表join
# 并sink到指定mysql表
opt_dim0 = """
insert into order_result_sink
select order_date,product_type,
sum(num) as num,
sum(amt) as amt
from
(select o.order_date as order_date,
p.product_type as product_type,
o.num as num,o.amt as amt
from order_dr o left join products_source FOR SYSTEM_TIME AS OF o.proc_time as p
on o.product_id = p.product_id ) t1
group by order_date,product_type
"""
# Job ID: 4174e02da07b2d84ff6233305877717b
'''结果:
order_date product_type num amt
2022-07-21 skirt 2 150.00
2022-07-22 shoe 1 305.00
2022-07-25 shoe 2 610.00
2022-07-25 jacket 2 400.00
2022-07-20 jacket 1 200.00
2022-07-20 shoe 1 305.00
2022-07-21 tshirt 2 158.00
'''