Building a Real-Time Data Warehouse with Flink SQL

A real-time data warehouse mainly addresses the poor data freshness of a traditional warehouse, and is typically used for real-time OLAP analysis, real-time dashboards, and real-time monitoring and alerting. Although the architecture and technology choices differ from those of an offline warehouse, the underlying modeling methodology is the same. This article walks through building a real-time data warehouse demo with Flink SQL from scratch, covering the full pipeline of data ingestion, storage, computation, and visualization.

1. Case Overview

This article uses an e-commerce business as the example to illustrate the data flow of a real-time warehouse. The focus is on the construction process, so the computations are deliberately kept simple. To keep the case complete and reproducible, detailed steps are provided, and for ease of demonstration all operations are carried out in the Flink SQL CLI.

1.1 Metrics

  • Top-N regions by spending power
  • Top-N shopping preferences by education level
  • Top-N hot products
  • Total consumption amount
  • Top-N "civilized cities" (by review scores)

2. Architecture Design

2.1 Architecture Overview

The detailed architecture is shown in the figure below. First, MySQL binlogs are captured via CDC and the change data is stored in Kafka as the ODS layer. Flink SQL then cleans and joins the raw data and writes the resulting detail wide table back to Kafka. Dimension data is written back to a local MySQL database via Flink. Finally, Flink SQL joins the detail wide table with the dimension tables, writes the aggregated results to Elasticsearch, and the metrics are visualized with Kibana.

[Figure: overall architecture (user analysis)]

2.2 Layered Design

[Figure: warehouse layering (user analysis)]

3. Business Data

3.1 Table Relationships

[Figure: relationships among the business tables]

3.2 Business Tables (MySQL DDL)

DROP TABLE IF EXISTS `action_type`;
CREATE TABLE `action_type` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `type` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for categorys
-- ----------------------------
DROP TABLE IF EXISTS `categorys`;
CREATE TABLE `categorys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `category` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for commoditys
-- ----------------------------
DROP TABLE IF EXISTS `commoditys`;
CREATE TABLE `commoditys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `category` int(10) DEFAULT NULL,
  `commodity` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for evaluate
-- ----------------------------
DROP TABLE IF EXISTS `evaluate`;
CREATE TABLE `evaluate` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `userId` varchar(50) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `described` int(10) DEFAULT NULL,
  `comment` varchar(255) DEFAULT NULL,
  `logisticsAttitude` int(10) DEFAULT NULL,
  `serviceAttitude` int(10) DEFAULT NULL,
  `merchantId` varchar(50) DEFAULT NULL,
  `orderInformationId` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for operation_log
-- ----------------------------
DROP TABLE IF EXISTS `operation_log`;
CREATE TABLE `operation_log` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `actionTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `userId` varchar(50) DEFAULT NULL,
  `sessionId` varchar(50) DEFAULT NULL,
  `pageId` int(10) DEFAULT NULL,
  `actionType` int(10) DEFAULT NULL,
  `keyWord` varchar(255) DEFAULT NULL,
  `categoryId` int(10) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `advertising` int(10) DEFAULT NULL,
  `advertisingId` int(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for order
-- ----------------------------
DROP TABLE IF EXISTS `order`;
CREATE TABLE `order` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `timestamps` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `orderInformationId` varchar(50) DEFAULT NULL,
  `userId` varchar(50) DEFAULT NULL,
  `categoryId` int(10) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `price` decimal(10,2) DEFAULT NULL,
  `productCount` int(10) DEFAULT NULL,
  `priceSum` decimal(10,2) DEFAULT NULL,
  `shipAddress` varchar(50) DEFAULT NULL,
  `receiverAddress` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Table structure for user_profile
-- ----------------------------
DROP TABLE IF EXISTS `user_profile`;
CREATE TABLE `user_profile` (
  `userId` varchar(50) NOT NULL,
  `province` varchar(50) DEFAULT NULL,
  `city` varchar(50) DEFAULT NULL,
  `age` int(10) DEFAULT NULL,
  `education` varchar(50) DEFAULT NULL,
  `jobType` varchar(50) DEFAULT NULL,
  `marriage` varchar(50) DEFAULT NULL,
  `sex` varchar(50) DEFAULT NULL,
  `interest` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`userId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

4. Data Processing Pipeline

4.1 ODS-Layer Data Synchronization

  • Kafka Connect (Debezium) connector configuration; the JSON below is registered by POSTing it to the Kafka Connect REST endpoint (/connectors):
{
    "name":"userAnalysis",
    "config":{
        "connector.class":"io.debezium.connector.mysql.MySqlConnector",
        "database.history.connector.id":"userAnalysis",
        "database.history.connector.class":"org.apache.kafka.connect.source.SourceConnector",
        "database.hostname":"打码",
        "database.port":3306,
        "database.user":"你猜",
        "database.password":"你猜",
        "database.server.name":"ods",
        "database.jdbc.driver":"com.mysql.jdbc.Driver",
        "database.history.kafka.bootstrap.servers":"打码:9092",
        "database.history.kafka.topic":"userAnalysis",
        "database.whitelist":"userAnalysis",
        "tasks.max":"1",
        "key.converter":"org.apache.kafka.connect.storage.StringConverter",
        "value.converter":"io.confluent.connect.avro.AvroConverter",
        "value.converter.schema.registry.url":"http://localhost:8081"
    }
}
  • Database tables captured by the CDC connector with the above configuration:

[Screenshot: database tables captured by the CDC connector]

  • Loading the ODS-layer data with Flink SQL:
-- 用户表
create table ods_users(
 userId string primary key not enforced comment '用户id',
 province string comment '省',
 city string comment '市',
 age int comment '年龄',
 education string comment '学历',
 jobType string comment '工作类别',
 marriage string comment '婚姻状态',
 sex string comment '性别',
 interest string comment '兴趣'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.user_profile',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset'
)


-- 订单表
create table if not exists ods_order(
id int PRIMARY KEY NOT ENFORCED comment '订单id',
timestamps bigint comment '订单创建时间',
orderInformationId string comment '订单信息ID',
userId string comment '用户ID',
categoryId int comment '商品类别',
productId int comment '商品ID',
price decimal(10,2) comment '单价',
productCount int comment '购买数量',
priceSum decimal(10,2) comment '订单总价',
shipAddress string comment '商家地址',
receiverAddress string comment '收货地址'
--ts AS TO_TIMESTAMP(FROM_UNIXTIME(timestamps/1000)),
--WATERMARK FOR ts AS ts - INTERVAL '3' SECOND
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.order',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset' --earliest-offset   latest-offset
)

-- 操作类型
create table if not exists ods_action_type(
id int PRIMARY KEY NOT ENFORCED comment '操作类型ID',
type string comment '操作类型'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.action_type',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset'
)

--商品品类

create table if not exists ods_category(
id int PRIMARY KEY NOT ENFORCED comment '商品品类ID',
category string comment '商品品类'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.categorys',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset'
)

-- 商品表
create table if not exists ods_commodity(
id int PRIMARY KEY NOT ENFORCED comment '商品ID',
category int comment '商品品类ID',
commodity string comment '商品名称'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.commoditys',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset'
)
    
-- 评价表
create table if not exists ods_evaluate(
id int PRIMARY KEY NOT ENFORCED comment 'ID',
userId string comment '用户ID',
productId int comment '商品ID',
described int comment '评分',
`comment` string comment '评价',
logisticsAttitude int comment '物流评分',
serviceAttitude int comment '服务评分',
merchantId string comment '商家ID',
orderInformationId string comment '订单ID'
-- ts AS PROCTIME()
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.evaluate',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset' --latest-offset   earliest-offset
)

-- 操作日志
create table if not exists ods_operation_log(
id int PRIMARY KEY NOT ENFORCED comment 'ID',
actionTime bigint comment '操作时间',
userId string comment '用户ID',
sessionId string comment '会话ID',
pageId int comment '页面ID',
actionType int comment '操作ID',
keyWord string comment '关键词',
categoryId int comment '品类ID',
productId int comment '商品ID',
advertising int comment '广告次数',
advertisingId int comment '广告ID'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'ods.userAnalysis.operation_log',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset' --earliest-offset
)

4.2 DIM-Layer Data Processing

  • Writing the dimension data back to MySQL:
-- 用户表
create table if not exists dim_user_profile(
 userId string primary key not enforced comment '用户id',
 province string comment '省',
 city string comment '市',
 age int comment '年龄',
 education string comment '学历',
 jobType string comment '工作类别',
 marriage string comment '婚姻状态',
 sex string comment '性别',
 interest string comment '兴趣'
)
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://你猜:3306/userAnalysis?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false',
    'table-name' = 'user_profile',
    'driver' = 'com.mysql.jdbc.Driver',
    'username' = '你猜',
    'password' = '你猜',
    'sink.buffer-flush.interval' = '1s'
)


-- 用户表
insert into dim_user_profile
select * from ods_users


-- 操作类型
create table if not exists dim_action_type(
id int PRIMARY KEY NOT ENFORCED comment '操作类型ID',
type string comment '操作类型'
)
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://你猜:3306/userAnalysis?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false',
    'table-name' = 'action_type',
    'driver' = 'com.mysql.jdbc.Driver',
    'username' = '你猜',
    'password' = '你猜',
    'sink.buffer-flush.interval' = '1s'
)

-- 操作类型
insert into dim_action_type
select * from ods_action_type

-- 商品品类
create table if not exists dim_category(
id int PRIMARY KEY NOT ENFORCED comment '商品品类ID',
category string comment '商品品类'
)
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://你猜:3306/userAnalysis?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false',
    'table-name' = 'category',
    'driver' = 'com.mysql.jdbc.Driver',
    'username' = '你猜',
    'password' = '你猜',
    'sink.buffer-flush.interval' = '1s'
)

-- 商品品类
insert into dim_category
select * from ods_category

-- 商品表
create table if not exists dim_commodity(
id int PRIMARY KEY NOT ENFORCED comment '商品ID',
category int comment '商品品类ID',
commodity string comment '商品名称'
)
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://你猜:3306/userAnalysis?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false',
    'table-name' = 'commodity',
    'driver' = 'com.mysql.jdbc.Driver',
    'username' = '你猜',
    'password' = '你猜',
    'sink.buffer-flush.interval' = '1s'
)

-- 商品表
insert into dim_commodity
select * from ods_commodity

  • Creating a commodity detail view (executed on the MySQL side):
-- 商品表详细信息
-- 在数据库端操作
CREATE
VIEW dim_commodity_info AS
SELECT
	c.id AS categoryId,
	com.id AS commodityId,
	c.category,
	com.commodity
FROM
	categorys c
INNER JOIN commoditys com ON c.id = com.category;

  • Loading the dimension table in Flink (as a JDBC lookup table):
-- 商品表详细信息

create table if not exists dim_commodity_info(
commodityId int PRIMARY KEY NOT ENFORCED comment '商品ID',
categoryId int comment '商品品类ID',
commodity string comment '商品名称',
category string comment '商品品类'
)
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://你猜:3306/userAnalysis?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&useSSL=false',
    'table-name' = 'dim_commodity_info',
    'driver' = 'com.mysql.jdbc.Driver',
    'username' = '你猜',
    'password' = '你猜',
    'scan.fetch-size' = '200',
    'lookup.cache.max-rows' = '1000',
    'lookup.cache.ttl' = '60000'
)
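
The lookup cache options above only take effect when the table is used as a lookup table in a temporal join. A minimal sketch, assuming ods_order additionally declares a processing-time attribute in its DDL (e.g. proctime AS PROCTIME(), not shown above):

-- enrich orders with commodity name and category via a lookup join
select
  o.orderInformationId,
  o.priceSum,
  d.commodity,
  d.category
from ods_order as o
join dim_commodity_info for system_time as of o.proctime as d
on o.productId = d.commodityId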

4.3 DWD-Layer Data Processing

-- 订单与评价宽表
create table if not exists dwd_paid_order_detail(
id int PRIMARY KEY NOT ENFORCED comment 'ID',
userId string comment '用户ID',
described int comment '评分',
evaluate string comment '评价',
logisticsAttitude int comment '物流评分',
serviceAttitude int comment '服务评分',
merchantId string comment '商家ID',
timestamps bigint comment '订单创建时间',
orderInformationId string comment '订单信息ID',
categoryId int comment '商品类别',
productId int comment '商品ID',
price decimal(10,2) comment '单价',
productCount int comment '购买数量',
priceSum decimal(10,2) comment '订单总价',
shipAddress string comment '商家地址',
receiverAddress string comment '收货地址'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'debezium-avro-confluent.schema-registry.subject' = 'dwd_paid_order_detail',
'topic' = 'dwd_paid_order_detail',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'sink.partitioner' = 'fixed'
)

-- 插入宽表数据
insert into dwd_paid_order_detail
select
o.id
,e.userId
,e.described
,e.`comment` as evaluate
,e.logisticsAttitude
,e.serviceAttitude
,e.merchantId
,o.timestamps
,o.orderInformationId
,o.categoryId
,o.productId
,o.price
,o.productCount
,o.priceSum
,o.shipAddress
,o.receiverAddress
from
ods_order as o inner join ods_evaluate e
on o.orderInformationId=e.orderInformationId

-- 创建宽表临时表
create table if not exists dwd_paid_order_detail_tmp(
id int PRIMARY KEY NOT ENFORCED comment 'ID',
userId string comment '用户ID',
described int comment '评分',
evaluate string comment '评价',
logisticsAttitude int comment '物流评分',
serviceAttitude int comment '服务评分',
merchantId string comment '商家ID',
timestamps bigint comment '订单创建时间',
orderInformationId string comment '订单信息ID',
categoryId int comment '商品类别',
productId int comment '商品ID',
price decimal(10,2) comment '单价',
productCount int comment '购买数量',
priceSum decimal(10,2) comment '订单总价',
shipAddress string comment '商家地址',
receiverAddress string comment '收货地址'
)
with(
'connector' = 'kafka',
'format' = 'debezium-avro-confluent', -- avro-confluent
'debezium-avro-confluent.schema-registry.url' = 'http://你猜:8081', -- avro-confluent.schema-registry.url
'topic' = 'dwd_paid_order_detail',
'properties.bootstrap.servers' = '你猜:9092',
'properties.group.id' = 'flink-analysis',
'scan.startup.mode' = 'earliest-offset' --earliest-offset
)

4.4 ADS-Layer Data Processing

-- 地区消费能力
create table if not exists ads_power_consumption_index(
province string primary key not enforced,
totalPrice decimal(20,2)
)
with(
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'area_cost'
)

-- 地区消费能力
insert into ads_power_consumption_index
select u.province as province,sum(o.priceSum) as totalPrice
from dwd_paid_order_detail_tmp o join ods_users u on o.userId = u.userId
group by province
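
The metric is defined as a Top-N; the per-province totals above can be ranked with Flink SQL's ROW_NUMBER() Top-N pattern. A sketch (not part of the original job) that keeps only the top 10 provinces; the same pattern applies to the education, hot-product and city metrics below:

select province, totalPrice
from (
  select province, totalPrice,
         row_number() over (order by totalPrice desc) as rn
  from (
    select u.province as province, sum(o.priceSum) as totalPrice
    from dwd_paid_order_detail_tmp o join ods_users u on o.userId = u.userId
    group by u.province
  ) t
) r
where rn <= 10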

-- 消费总金额
create table if not exists ads_total_consumption_index(
endTime timestamp(3),
money decimal(20,2)
)
with(
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'total_money'
)
-- 消费总金额
insert into ads_total_consumption_index
select cast(max(timestamps)/1000 as timestamp(3)) as endTime,
sum(priceSum) as money from dwd_paid_order_detail_tmp

-- 学历购物爱好
create table if not exists ads_edu_shopping_hobby_index(
education string primary key not enforced,
money decimal(20,2)
)
with(
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'edu_cost'
)
       
--学历购物爱好
insert into ads_edu_shopping_hobby_index
select
u.education as education,
sum(o.priceSum) as money
from
ods_users u join dwd_paid_order_detail_tmp o
on u.userId = o.userId
group by u.education    

-- 热门商品
create table if not exists ads_commodity_index(
commodity string primary key not enforced,
heat int
)
with(
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'commodity_index'
)
   
-- 热门商品    
insert into ads_commodity_index
select d.commodity,sum(o.productCount) heat
from dwd_paid_order_detail_tmp o join ods_commodity d
on o.productId = d.id
group by d.commodity    
    
-- 文明城市    
create table if not exists ads_civilized_city_index(
city string primary key not enforced,
score double
)
with(
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'civilized_city_index'
)
    
-- 文明城市 
insert into ads_civilized_city_index
select a.* from (
select u.city,avg(
(described*0.4+logisticsAttitude*0.3+serviceAttitude*0.3)/3
) score
from dwd_paid_order_detail_tmp o join ods_users u
on o.userId = u.userId
group by u.city
)as a
where a.score> 0.9    

4.5 Execution in the Flink SQL Client

[Screenshots: jobs submitted from the Flink SQL Client]

5. Metric Visualization

[Screenshot: Kibana dashboard of the metrics]
Note: the chart is for reference only and does not represent any position.

6. DataStream API

Flink SQL currently cannot apply group windows to CDC (changelog) data, so this metric is demonstrated with the DataStream API instead.

package com.bugboy.analysis

import java.sql.Date
import java.text.SimpleDateFormat
import java.util.Properties

import org.apache.flink.formats.avro.registry.confluent.debezium.DebeziumAvroDeserializationSchema
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api.{DataTypes, TableSchema}
import org.apache.flink.table.data.RowData
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo
import org.apache.flink.table.types.logical.RowType
import org.apache.flink.types.RowKind
import org.apache.flink.util.Collector
import org.apache.kafka.clients.consumer.ConsumerConfig

/**
 * 消费总金额
 */
object TotalCost {
  val TOPIC: String = "ods.userAnalysis.order"
  val SCHEMA_REGIST_URL: String = "http://你猜:8081"
  val BOOTSTRAP_SERVERS: String = "打码:9092,"
  val GROUP_ID: String = "flink-analysis"

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.getConfig.setAutoWatermarkInterval(200L)
    env.getCheckpointConfig.setCheckpointInterval(1000L)
    val schema = TableSchema.builder()
      .field("id", DataTypes.INT)
      .field("timestamps", DataTypes.BIGINT())
      .field("orderInformationId", DataTypes.STRING())
      .field("userId", DataTypes.STRING())
      .field("categoryId", DataTypes.INT())
      .field("productId", DataTypes.INT())
      .field("price", DataTypes.DECIMAL(10, 2))
      .field("productCount", DataTypes.INT())
      .field("priceSum", DataTypes.DECIMAL(10, 2))
      .field("shipAddress", DataTypes.STRING())
      .field("receiverAddress", DataTypes.STRING())
      .build()
    val props = new Properties()
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS)
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, GROUP_ID)

    val rowType = schema.toRowDataType.getLogicalType.asInstanceOf[RowType]
    val deserialization: DebeziumAvroDeserializationSchema = new DebeziumAvroDeserializationSchema(rowType, InternalTypeInfo.of(rowType), SCHEMA_REGIST_URL)
    val kafkaConsumerSource = new FlinkKafkaConsumer(TOPIC, deserialization, props)
      .setStartFromEarliest()
      .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[RowData] {
        val maxOutOfOrderness = 1000L
        var currentMaxTime: Long = 0L

        override def extractTimestamp(t: RowData, l: Long): Long = {
          val currentTime = t.getLong(1)
          currentMaxTime = scala.math.max(currentMaxTime, currentTime)
          currentTime
        }

        override def getCurrentWatermark: Watermark = {
          new Watermark(currentMaxTime - maxOutOfOrderness)
        }
      })
    env.addSource(kafkaConsumerSource)
      .windowAll(SlidingEventTimeWindows.of(Time.minutes(1), Time.minutes(1)))
      .process(new ProcessAllWindowFunction[RowData, (String, BigDecimal), TimeWindow]() {
        var sum: BigDecimal = BigDecimal(0)
        val FORMAT: String = "yyyy-MM-dd HH:mm"

        def formatTime(time: Long): String = {
          new SimpleDateFormat(FORMAT).format(new Date(time))
        }

        override def process(context: Context, elements: Iterable[RowData], out: Collector[(String, BigDecimal)]): Unit = {
          elements.foreach(row => {
            val priceSum = row.getDecimal(8, 10, 2).toBigDecimal
            row.getRowKind match {
              case RowKind.INSERT => sum = sum + priceSum
              case RowKind.UPDATE_BEFORE => sum = sum - priceSum
              case RowKind.UPDATE_AFTER => sum = sum + priceSum
              case RowKind.DELETE => sum = sum - priceSum
            }
          })
          val windowEnd = context.window.getEnd
          out.collect((formatTime(windowEnd), sum))
        }
      }).print()
    env.execute()
  }
}

7. Technical Notes

7.1 Late Data

Late data can be regarded as a special kind of out-of-order data: it arrives only after its window has already closed. There are generally three ways to handle it:

  1. Re-open the closed window and recompute to correct the result.
  2. Collect the late data and handle it separately.
  3. Treat the late data as erroneous and drop it.

By default, Flink takes the third approach and drops late records as erroneous. To use the first two approaches you need the side-output and allowedLateness mechanisms.

The side-output mechanism routes late events into a separate stream branch, produced as a by-product of the window computation, so that they can be retrieved and handled specially.

The allowedLateness mechanism lets you set a maximum allowed lateness. After a window closes, Flink keeps its state until the allowed lateness has elapsed; late events arriving within this period are not dropped and by default trigger a recomputation of the window.

Concretely, the allowed lateness is set with DataStream.allowedLateness(lateness: Time), late data is captured with sideOutputLateData(outputTag: OutputTag[T]), and the captured late data is retrieved with DataStream.getSideOutput(tag: OutputTag[X]), as sketched below.
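
A minimal, self-contained sketch of these three calls (Scala DataStream API); the input here is a hypothetical in-memory stream standing in for the Kafka order stream:

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time

object LateDataDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // hypothetical input: (userId, priceSum, eventTime); in the real job this
    // would be the order stream from Kafka with watermarks assigned upstream
    val orders: DataStream[(String, Double)] = env
      .fromElements(("u1", 10.0, 1000L), ("u1", 15.0, 2000L), ("u2", 20.5, 3000L))
      .assignAscendingTimestamps(_._3)
      .map(e => (e._1, e._2))

    // side output for records that are too late even for allowedLateness
    val lateTag = OutputTag[(String, Double)]("late-orders")

    val result = orders
      .keyBy(_._1)
      .window(TumblingEventTimeWindows.of(Time.days(1)))
      .allowedLateness(Time.hours(1))   // keep window state one extra hour; late events re-trigger the window
      .sideOutputLateData(lateTag)      // route events beyond the allowed lateness to the side output
      .sum(1)

    // retrieve the captured late records for separate handling
    val lateOrders: DataStream[(String, Double)] = result.getSideOutput(lateTag)

    result.print()
    lateOrders.print()
    env.execute("late-data-demo")
  }
}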

By default a window aggregation fires only once, when the watermark passes the end of the window; with a one-day window, for example, the result only becomes visible after the day is over. If intermediate results are needed in real time, an early-firing trigger has to be configured:

Early firing (emitting results before the watermark reaches the end of the window):
table.exec.emit.early-fire.enabled: whether early firing is enabled; default false.
table.exec.emit.early-fire.delay: interval between early firings, in milliseconds; 0 means emit on every update, > 0 is the firing interval, < 0 is invalid; no default value.

Late firing (emitting results again after the watermark has passed the end of the window):
table.exec.emit.late-fire.enabled: whether late firing is enabled; default false.
table.exec.emit.late-fire.delay: the interval between late firings.
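
A sketch of enabling early firing from the SQL CLI (option keys as above; depending on the Flink version the value may need to be quoted, e.g. SET 'key' = 'value';):

SET table.exec.emit.early-fire.enabled=true;
SET table.exec.emit.early-fire.delay=5s;   -- emit an updated window result at most every 5 seconds

With this in place, a one-day window emits an updated result periodically instead of only once at the end of the day.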

7.2 Dynamic Tables

See the official documentation.

7.3 Temporal Tables

See the official documentation.

7.4 Joins

See the official documentation.

7.5 Query Configuration

See the official documentation.

7.6 Streaming Aggregation

See the official documentation.

8. Flink SQL Limitations

  • Flink SQL currently does not support group-window aggregation over CDC (changelog) data; a possible workaround is sketched after the error below.
-- sql 
select sum(priceSum) from ods_order group by TUMBLE(ts, INTERVAL '1' DAY)

-- error

Exception in thread "main" org.apache.flink.table.api.TableException: GroupWindowAggregate doesn't support consuming update and delete changes which is produced by node TableSourceScan(table=[[default_catalog, default_database, ods_order, watermark=[-(TO_TIMESTAMP(FROM_UNIXTIME(/($1, 1000))), 3000:INTERVAL SECOND)]]], fields=[id, timestamps, orderInformationId, userId, categoryId, productId, price, productCount, priceSum, shipAddress, receiverAddress])
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.createNewNode(FlinkChangelogModeInferenceProgram.scala:380)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:298)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:275)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:275)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:318)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:200)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
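
A possible workaround (a sketch, not from the original post) is to replace the group window with a regular group aggregation on a derived date string, since unbounded group aggregations can consume changelog (update/delete) input:

-- daily totals without a group window
select FROM_UNIXTIME(timestamps/1000, 'yyyy-MM-dd') as dt,
       sum(priceSum) as totalPrice
from ods_order
group by FROM_UNIXTIME(timestamps/1000, 'yyyy-MM-dd')

The result is an updating stream (the row for each day keeps being revised), so the sink must support upserts, as the Elasticsearch tables above do.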
