pyflink双流join+维表join例

双流join的三种方式:

regular join:
该Join方式需要去保留两个流的状态,持续性地保留并且不会去做清除;
两边的数据对于对方的流都是所有可见的,所以数据就需要持续性的存在State里面,而State不能存的过大,因此这个场景的只适合有界数据流
regular join支持update的和append的流

interval join:
Interval Join根据右流相对左流偏移的时间区间(interval)作为关联窗口,在偏移区间窗口中完成join操作
同时支持processing time和even time定义时间:
若使用processing time,则flink会自动使用系统时间划分窗口,并自动做相关state的清理;
若使用even time,则flink会用watermark机制划分窗口,并作state清理
仅支持append-only的流

window join:
将两个流中有相同key并处在相同window里的元素去做Join;必须同时满足Join key相同,且在同一个Window里元素才能够在最终结果中输出

环境初始化:

from datetime import timedelta
from pyflink.datastream import StreamExecutionEnvironment, CheckpointingMode
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
'''
flink双流join+维表join示例,flink1.13.3
'''
settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
env = StreamExecutionEnvironment.get_execution_environment()

env.set_parallelism(2)
env.enable_checkpointing(3600000)
env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_checkpoint_timeout(600000)
t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '100m')
t_env.get_config().set_python_executable("python3")
# regular join 时须指定state过期时间,否则state不会自动清除,会造成状态爆炸
# 两参数分别为state状态的最小保存时间和最大保存时间
t_env.get_config().set_idle_state_retention_time(timedelta(hours=12), timedelta(hours=24))
# 另一种指定state过期时间的方式
# t_env.get_config().get_configuration().set_string("table.exec.state.ttl",'1h')

初始化各source ddl:

# 订单主表
source_ddl_main = """
    create table order_main_source(
        order_id STRING NOT NULL,  --订单号
        order_name STRING,         --订单名
        order_type STRING,         --订单类型
        order_time TIMESTAMP(3),   --订单时间
        is_del STRING,             --删除标记
        --时间窗口watermark,regular join中不要存在 WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTES,  
        proctime as PROCTIME(),    --flink处理时间,暂不用
        PRIMARY KEY (order_id) NOT ENFORCED
    ) WITH (
      'connector' = 'mysql-cdc',
      'hostname' = '192.168.1.132',
      'port' = '3306',
      'username' = 'cdc',
      'password' = '###',
      'database-name' = 'test',
      'table-name' = 'order_main'
    )
"""
# 订单明细表
source_ddl_detail = """
    create table order_detail_source(
        detail_id STRING NOT NULL, ---订单明细id
        order_id STRING,           ---订单ID
        product_id STRING,         ---商品ID
        num INT,                   ---商品数量
        amt DECIMAL(10, 2),        ---商品总价
        order_time TIMESTAMP(3),   ---订单时间
        --时间窗口watermark,regular join中不要存在 WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTE,
        proctime as PROCTIME(),    --flink处理时间,暂不用
        PRIMARY KEY (detail_id) NOT ENFORCED
    ) WITH (
      'connector' = 'mysql-cdc',
      'hostname' = '192.168.1.132',
      'port' = '3306',
      'username' = 'cdc',
      'password' = '###',
      'database-name' = 'test',
      'table-name' = 'order_detail'
    )
"""
# 商品维表
# jdbc连mysql情况下,注意mysql别用tinyint,转换会出问题
source_ddl_products = """
    CREATE TABLE products_source (
        product_id  STRING,
        product_name  STRING,
        product_type  STRING,
        unit_price   DECIMAL(10,2),
        is_del INT,
        PRIMARY KEY(product_id) NOT ENFORCED
    ) WITH (
        'connector' = 'jdbc',
        'url' = 'jdbc:mysql://192.168.1.132:3306/test',
        'username' = 'cdc',
        'password' = '###',
        'table-name' = 'products'
    )
"""

事实表join(regular 和 interval):

# regular join
# 将交易表和交易明细关联,生成view
opt0 = """
    create view order_dr as 
    select order_id,
           date_format(order_time,'yyyy-MM-dd') as order_date,
           order_time,  --订单时间
           PROCTIME() as proc_time,
           order_type,product_id,num,amt
    from 
    (select m.order_id as order_id,
           m.order_time as order_time,
           m.order_type as order_type,
           d.product_id as product_id,
           d.num as num,d.amt as amt,
           m.is_del as is_del
    from order_detail_source d left join order_main_source m
    on m.order_id = d.order_id) t1
    where is_del = '0'
"""

# 若order_detail_source和order_main_source是append-only流,也可采用如下interval join
# interval join 需要源表中带watermark
opt1 = """
    create view order_dr as 
    select order_id,
           date_format(order_time,'yyyy-MM-dd') as order_date,
           order_type,product_id,num,amt
    from 
    (select m.order_id as order_id,
           m.order_time as order_time,
           m.order_type as order_type,
           d.product_id as product_id,
           d.num as num,d.amt as amt,
           m.is_del as is_del
    from order_detail_source d left join order_main_source m
    on m.order_id = d.order_id
    and m.order_time between d.order_time and d.order_time + INTERVAL '3' MINUTE) t1
    where is_del = '0'
"""

sink的定义:

# sink表定义
sink_ddl = """
        CREATE TABLE order_result_sink (
            order_date STRING,
            product_type STRING,
            num INT,                          
            amt DECIMAL(10,3),                       
            PRIMARY KEY (order_date,product_type) NOT ENFORCED  ---须定义主键,以便实现upsert效果
        ) WITH (
            'connector' = 'jdbc',
            'url' = 'jdbc:mysql://192.168.1.132:3306/test',
            'driver' = 'com.mysql.cj.jdbc.Driver',
            'table-name' = 'order_result',
            'username' = 'cdc',
            'password' = '###'
        )
"""

关联维表、聚合、并sink到指定表:

# 事实表order_dr与当前状态的商品维表join
# 并sink到指定mysql表
opt_dim0 = """
    insert into order_result_sink
    select order_date,product_type,
           sum(num) as num,
           sum(amt) as amt
    from 
    (select o.order_date as order_date,
           p.product_type as product_type,
           o.num as num,o.amt as amt
    from order_dr o left join products_source FOR SYSTEM_TIME AS OF o.proc_time as p
    on o.product_id = p.product_id ) t1
    group by order_date,product_type
"""

# Job ID: 4174e02da07b2d84ff6233305877717b
'''结果:
order_date                   product_type                            num                            amt
                     2022-07-21                          skirt                              2                         150.00
                     2022-07-22                           shoe                              1                         305.00
                     2022-07-25                           shoe                              2                         610.00
                     2022-07-25                         jacket                              2                         400.00
                     2022-07-20                         jacket                              1                         200.00
                     2022-07-20                           shoe                              1                         305.00
                     2022-07-21                         tshirt                              2                         158.00
'''

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值