flinkSQL的Window TVF(表值函数table-valued function)操作

from pyflink.datastream import StreamExecutionEnvironment, CheckpointingMode
from pyflink.table import StreamTableEnvironment, EnvironmentSettings

'''Window TVF(表值函数table-valued function)
本案例包含三种窗口聚合方式:tumbling window aggregation;hopping window aggregation;cumulative window aggregation

注意,所有cdc模式source表都不支持窗口聚合
'''

settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
env = StreamExecutionEnvironment.get_execution_environment()

env.set_parallelism(2)
env.enable_checkpointing(3600000)
env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_checkpoint_timeout(600000)
t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '100m')
t_env.get_config().set_python_executable("python3")


''' 
create table order_main_source(
        order_id STRING NOT NULL,
        order_name STRING,
        order_type STRING,
        order_time STRING,  
        rowtime as TO_TIMESTAMP(order_time),
        WATERMARK FOR rowtime AS rowtime - INTERVAL '3' MINUTES,  --时间窗口watermark,订单创建时间-3分钟
        PRIMARY KEY (order_id) NOT ENFORCED
    );
desc order_main_source;
SELECT * FROM order_main_source;

orderdetail.csv数据内容:
d001,o001,jacket001,1,200,2022-07-20 10:25:19
d002,o001,shoe001,1,305,2022-07-20 10:25:21
d003,o002,tshirt001,2,158,2022-07-21 10:26:01
d004,o003,skirt001,1,88,2022-07-21 18:26:32
d005,o003,skirt002,1,62,2022-07-21 18:27:02
d006,o004,shoe001,1,305,2022-07-22 12:27:33

ordermain.csv数据内容:
o001,ordername1,online,2022-07-20 10:26:19,0
o002,ordername2,offline,2022-07-21 10:26:51,0
o003,ordername3,online,2022-07-21 18:27:32,0
o004,ordername4,offline,2022-07-22 12:28:33,0
'''
# 订单主表
source_ddl_main = """
    create table order_main_source(
        order_id STRING NOT NULL,  --订单号
        order_name STRING,         --订单名
        order_type STRING,         --订单类型
        order_time TIMESTAMP(3),   --订单时间
        is_del STRING,             --删除标记
        WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTES,  --时间窗口watermark,订单创建时间-3分钟
        PRIMARY KEY (order_id) NOT ENFORCED
    ) WITH (
      'connector' = 'filesystem',
      'path' = 'file:///srv/flinkcdc/source_tables/order_main',
      'format' = 'csv'
    )
"""
# 订单明细表
source_ddl_detail = """
    create table order_detail_source(
        detail_id STRING NOT NULL, ---订单明细id
        order_id STRING,           ---订单ID
        product_id STRING,         ---商品ID
        num INT,                   ---商品数量
        amt DECIMAL(10, 2),        ---商品总价
        order_time TIMESTAMP(3),   ---订单时间
        WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTE,  --时间窗口watermark,订单创建时间-3分钟
        PRIMARY KEY (detail_id) NOT ENFORCED
    ) WITH (
      'connector' = 'filesystem',
      'path' = 'file:///srv/flinkcdc/source_tables/order_detail',
      'format' = 'csv'
    )
"""

'''TUMBLING window:滚动窗口
TUMBLE函数根据时间属性列为关系的每一行分配一个窗口
TUMBLE函数有三个必需的参数: TUMBLE(TABLE data, DESCRIPTOR(timecol), size)
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@size:滚动窗口的大小
效果:按序,每个窗口固定大小,不重叠
window_start和window_end是常规时间戳,无法在后续程序中作为时间属性列使用,故可用window_time(TVF生成的时间属性列,为window_end-1毫秒)
  作为时间属性进行后续的操作(如级联窗⼝聚合和窗⼝TopN)
'''
# 每个窗口的累计金额
tumble_opt = """
    SELECT window_start, window_end, SUM(amt) as total_amt
    FROM TABLE(
      TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
    GROUP BY window_start,window_end
"""
'''结果:
window_start                     window_end                      total_amt
2022-07-20 10:00:00.000        2022-07-20 10:30:00.000                         505.00
2022-07-21 10:00:00.000        2022-07-21 10:30:00.000                         158.00
2022-07-21 18:00:00.000        2022-07-21 18:30:00.000                         150.00
2022-07-22 12:00:00.000        2022-07-22 12:30:00.000                         305.00
'''
# 根据业务字段聚合的每个窗口累计金额
tumble_opt1 = """
   SELECT window_start, window_end, product_id,SUM(amt) as total_amt
   FROM TABLE(
     TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
   GROUP BY product_id,window_start, window_end
"""
'''结果:
window_start                     window_end                     product_id                      total_amt
2022-07-20 10:00:00.000        2022-07-20 10:30:00.000                      jacket001                         200.00
2022-07-20 10:00:00.000        2022-07-20 10:30:00.000                        shoe001                         305.00
2022-07-21 10:00:00.000        2022-07-21 10:30:00.000                      tshirt001                         158.00
2022-07-21 18:00:00.000        2022-07-21 18:30:00.000                       skirt001                          88.00
2022-07-21 18:00:00.000        2022-07-21 18:30:00.000                       skirt002                          62.00
2022-07-22 12:00:00.000        2022-07-22 12:30:00.000                        shoe001                         305.00
'''
# 含window_time的效果
tumble_opt2 = """
    SELECT window_start,window_end,window_time, SUM(amt) as total_amt
    FROM TABLE(
      TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
    GROUP BY window_start,window_end,window_time
"""
'''结果:
window_start                     window_end                    window_time                      total_amt
2022-07-20 10:00:00.000        2022-07-20 10:30:00.000        2022-07-20 10:29:59.999                         505.00
2022-07-21 10:00:00.000        2022-07-21 10:30:00.000        2022-07-21 10:29:59.999                         158.00
2022-07-21 18:00:00.000        2022-07-21 18:30:00.000        2022-07-21 18:29:59.999                         150.00
2022-07-22 12:00:00.000        2022-07-22 12:30:00.000        2022-07-22 12:29:59.999                         305.00
'''

# 多维聚合效果--所有维度组合均覆盖的情况
tumble_opt3 = """
    SELECT window_start,window_end,window_time,order_id,product_id,SUM(amt) as total_amt
      FROM TABLE(
      TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
    GROUP BY window_start,window_end,window_time,GROUPING SETS ((order_id,product_id),(order_id),(product_id),());
"""
# 多维聚合效果--普通聚合情况
tumble_opt31 = """
    SELECT window_start,window_end,window_time,order_id,product_id,SUM(amt) as total_amt
      FROM TABLE(
      TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
    GROUP BY window_start,window_end,window_time,order_id,product_id
"""

'''
HOPPING window:滑动窗口
HOP函数分配的窗口覆盖大小间隔内的行,并根据时间属性列移动每个窗口,窗口大小必须为间隔的整数倍
HOP(TABLE data, DESCRIPTOR(timecol), slide, size [, offset ])
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@slide:指定顺序hopping 窗口开始之间的持续时间
@size:hopping 窗口宽度的持续时间,size为slide的整数倍
'''
# 窗口大小30min,滑动大小10min的情况
hop_opt = """
    SELECT window_start, window_end, SUM(amt) as total_amt
    FROM TABLE(
    HOP(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '10' MINUTES, INTERVAL '30' MINUTES))
    GROUP BY window_start, window_end;
"""
'''结果:
window_start                     window_end                      total_amt
        2022-07-20 10:00:00.000        2022-07-20 10:30:00.000                         505.00
        2022-07-20 10:10:00.000        2022-07-20 10:40:00.000                         505.00
        2022-07-20 10:20:00.000        2022-07-20 10:50:00.000                         505.00
        2022-07-21 10:00:00.000        2022-07-21 10:30:00.000                         158.00
        2022-07-21 10:10:00.000        2022-07-21 10:40:00.000                         158.00
        2022-07-21 10:20:00.000        2022-07-21 10:50:00.000                         158.00
        2022-07-21 18:00:00.000        2022-07-21 18:30:00.000                         150.00
        2022-07-21 18:10:00.000        2022-07-21 18:40:00.000                         150.00
        2022-07-21 18:20:00.000        2022-07-21 18:50:00.000                         150.00
        2022-07-22 12:00:00.000        2022-07-22 12:30:00.000                         305.00
        2022-07-22 12:10:00.000        2022-07-22 12:40:00.000                         305.00
        2022-07-22 12:20:00.000        2022-07-22 12:50:00.000                         305.00
'''
# 含window_time情况、及按不同维度聚合的写法类似tumble例子中的写法

'''cumulative window:累计窗口[应用较多]
场景:有一个1小时步长和1天最大大小的累积窗口,将得到每天的窗口:
   [00:00,01:00),[00:00,02:00),[00:00,03:00),…,[00:00,24:00)
针对每个窗口进行累计数据的计算
CUMULATE(TABLE data, DESCRIPTOR(timecol), step, size)
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@step:每个步长持续的时间
@size:累计窗口最大宽度持续时间,须为步长时间的整数倍
'''
cum_opt = """
    SELECT window_start, window_end, SUM(amt) as total_amt
    FROM TABLE(
     CUMULATE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '1' HOURS, INTERVAL '24' HOURS))
    GROUP BY window_start, window_end;
"""
'''结果:
 
                   window_start                     window_end                      total_amt
        2022-07-20 00:00:00.000        2022-07-20 11:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 12:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 13:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 14:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 15:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 16:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 17:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 18:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 19:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 20:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 21:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 22:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-20 23:00:00.000                         505.00
        2022-07-20 00:00:00.000        2022-07-21 00:00:00.000                         505.00
        2022-07-21 00:00:00.000        2022-07-21 11:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 12:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 13:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 14:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 15:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 16:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 17:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 18:00:00.000                         158.00
        2022-07-21 00:00:00.000        2022-07-21 19:00:00.000                         308.00
        2022-07-21 00:00:00.000        2022-07-21 20:00:00.000                         308.00
        2022-07-21 00:00:00.000        2022-07-21 21:00:00.000                         308.00
        2022-07-21 00:00:00.000        2022-07-21 22:00:00.000                         308.00
        2022-07-21 00:00:00.000        2022-07-21 23:00:00.000                         308.00
        2022-07-21 00:00:00.000        2022-07-22 00:00:00.000                         308.00
        2022-07-22 00:00:00.000        2022-07-22 13:00:00.000                         305.00
        2022-07-22 00:00:00.000        2022-07-22 14:00:00.000                         305.00
        2022-07-22 00:00:00.000        2022-07-22 15:00:00.000                         305.00
'''
# 含window_time情况、及按不同维度聚合的写法类似tumble例子中的写法

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值